# catboost4j-prediction tutorial

In [1]:
# Install the third-party packages this notebook imports (-q keeps pip's output quiet).
!pip install -q numpy pandas catboost

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [3]:
import catboost as cb
import catboost.datasets as cbd
import numpy as np
import pandas as pd

# Print the versions of the libraries used, so that the results below can be reproduced
# (or so that differences can be traced back to a version change).
print('CatBoost version {}'.format(cb.__version__))
print('NumPy version {}'.format(np.__version__))
print('Pandas version {}'.format(pd.__version__))

CatBoost version 0.14.2
NumPy version 1.16.3
Pandas version 0.24.2


In [4]:
# We are going to use the UCI Adult Data Set because it has both numerical and categorical
# features and also contains missing values.
print(cbd.adult.__doc__)


    Download "Adult Data Set" [1] from UCI Machine Learning Repository.

    Will return two pandas.DataFrame-s, first with train part (adult.data) and second with test part
    (adult.test) of the dataset.

    [1]: https://archive.ics.uci.edu/ml/datasets/Adult
    


In [5]:
def get_fixed_adult():
    """Load the UCI Adult data set and patch it so CatBoost can consume it.

    Missing values in string-valued (``object`` dtype) columns are replaced with the
    literal string ``'nan'``; columns of any other dtype are left untouched.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` where the ``X_*`` frames hold every
        column except ``'income'`` and the ``y_*`` series hold the ``'income'`` column of
        the train and test parts respectively.
    """
    train, test = cbd.adult()

    # CatBoost doesn't support pandas.DataFrame missing values for categorical features out
    # of the box (see issue #571 on GitHub or issue MLTOOLS-2785 in internal tracker). So
    # we have to replace them with some designated string manually.
    for dataset in (train, test):
        # Compare against the builtin `object`: the `np.object` alias was deprecated in
        # NumPy 1.20 and removed in NumPy 1.24.
        object_columns = [name for name, dtype in dataset.dtypes.items() if dtype == object]
        for name in object_columns:
            # Assign the result back instead of calling `fillna(..., inplace=True)` on a
            # column selection: such a chained in-place update is not guaranteed to reach
            # the parent frame and never does under pandas Copy-on-Write.
            dataset[name] = dataset[name].fillna('nan')

    X_train, y_train = train.drop('income', axis=1), train.income
    X_test, y_test = test.drop('income', axis=1), test.income
    return X_train, y_train, X_test, y_test

In [6]:
# Only the training part is needed to fit the model; the test features and labels
# returned by get_fixed_adult() are discarded here.
X_train, y_train, _, _ = get_fixed_adult()

In [7]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba


In [8]:
# If you want to find out how we found these parameters, check the "Simple classification
# example with missing feature handling and parameter tuning" tutorial in the
# `classification` subdirectory of tutorials.
model = cb.CatBoostClassifier(
    class_names=('<=50K', '>50K'),
    loss_function='Logloss',
    eval_metric='AUC',
    custom_metric=['AUC'],
    iterations=100,
    random_seed=20181224,
    learning_rate=0.4234185321620083,
    depth=5,
    l2_leaf_reg=9.464266235679002)

# Every column that is not stored as float64 is passed to CatBoost as a categorical
# feature. The original code compared against `np.float`, which was only an alias for the
# builtin `float` and was removed in NumPy 1.24; `np.float64` yields the same comparison
# result on every NumPy version.
cat_feature_indices = np.where(X_train.dtypes != np.float64)[0]

model.fit(
    cb.Pool(X_train, y_train, cat_features=cat_feature_indices),
    verbose=False)



<catboost.core.CatBoostClassifier at 0x7f1f7d037eb8>

In [9]:
# Write the trained model to 'adult.cbm' in the current working directory; the Java part
# of the tutorial uses this model.
model.save_model('adult.cbm')

In [10]:
# Show how much disk space the saved model file takes.
!du -sh adult.cbm

156K	adult.cbm


We have the model, so now it's time to use it via the `catboost4j-prediction` package for Java. The next part of the tutorial
is a Maven project (see the directory with the same name as this notebook).