# Kaggle Amazon

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/catboost/tutorials/blob/master/ru/kaggle_amazon_tutorial_ru.ipynb)

В этом туториале показана основная функциональность библиотеки CatBoost с использованием датасета Amazon из соревнования на [Kaggle](https://www.kaggle.com).
Данные можно скачать [здесь](https://www.kaggle.com/c/amazon-employee-access-challenge/data) (для этого надо создать свой аккаунт на Kaggle).

# Чтение данных

In [None]:
import pandas as pd
# Read the training and test tables from the local `amazon/` directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('amazon/train.csv', 'amazon/test.csv')
)

In [None]:
# Preview the first five rows of the training table.
train_df.head(n=5)

# Подготовка датасета

Выделение целевой переменной

In [None]:
# Separate the feature columns (X) from the ACTION target column (y).
X = train_df.drop(['ACTION'], axis='columns')
y = train_df['ACTION']

Объявление категориальных факторов

In [None]:
# Declare every column of X as categorical: the list holds all column indices.
# It is materialised with list() so that it is a plain sequence (and prints its
# contents) on Python 3 as well, where `range` is lazy.
cat_features = list(range(X.shape[1]))
print(cat_features)

# Обучение модели

Разделение данных на train и validation

In [None]:
from sklearn.model_selection import train_test_split
# Split X and y into train and validation parts with train_size=0.8 and a fixed random_state.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    random_state=1234,
    train_size=0.8,
)

Обучение модели

In [None]:
from catboost import CatBoostClassifier
# A short 5-iteration run on two threads, validated on the held-out split,
# with the logging level set to 'Silent'.
model = CatBoostClassifier(
    random_seed=1136926949945377,
    iterations=5,
    thread_count=2,
)
model.fit(
    X_train,
    y_train,
    eval_set=(X_validation, y_validation),
    cat_features=cat_features,
    logging_level='Silent',
)

In [None]:
# Show the random seed reported by the fitted model.
print(model.random_seed_)

In [None]:
from catboost import CatBoostClassifier
# A 300-iteration run that also tracks AUC and Accuracy as custom losses;
# plot=True is passed to fit() together with the validation set.
model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.1,
    custom_loss=['AUC', 'Accuracy'],
    use_best_model=True,
    random_seed=63,
    thread_count=2,
)
model.fit(
    X_train,
    y_train,
    eval_set=(X_validation, y_validation),
    cat_features=cat_features,
    plot=True,
    logging_level='Silent',
)

In [None]:
# Report whether the model has been fitted (print() form works on Python 2 and 3).
print('Model is fitted: {}'.format(model.is_fitted()))

In [None]:
# Show the parameters the model reports via get_params().
print('Model params: {}'.format(model.get_params()))

In [None]:
# Show how many trees the fitted model ended up with.
print('Resulting tree count: {}'.format(model.tree_count_))

# Способы задания датасета

In [None]:
from catboost.utils import create_cd
import os

# Map feature index -> column name. The first column (index 0) is skipped and
# the remaining indices are shifted down by one.
feature_names = {
    position - 1: header
    for position, header in enumerate(train_df)
    if position != 0
}

# Write the column-description file: column 0 is the label, all other columns
# are categorical.
create_cd(
    output_path='amazon/test.cd',
    label=0,
    cat_features=list(range(1, len(train_df.columns))),
    feature_names=feature_names,
)

In [None]:
# Show the contents of the column-description file via an IPython shell escape.
!cat amazon/test.cd

In [None]:
import numpy as np
from catboost import Pool
# Build the same dataset two ways: from the in-memory X / y objects and from
# the CSV file plus its column-description file.
pool1 = Pool(data=X, label=y, cat_features=cat_features)
pool2 = Pool(
    data='amazon/train.csv',
    delimiter=',',
    has_header=True,
    column_description='amazon/test.cd',
    thread_count=2,
)

print('Dataset shape')
print('dataset 1: {}'.format(pool1.shape))
print('dataset 2: {}'.format(pool2.shape))

# print('') emits a blank line on both Python 2 and 3 (a bare `print` is a no-op on 3).
print('')
print('Column names')
print('dataset 1: {}'.format(pool1.get_feature_names()))
# The second line reports pool2's own names so the two construction paths can be compared.
print('dataset 2: {}'.format(pool2.get_feature_names()))

# Кросс-валидация

In [None]:
from catboost import cv

# Start from the parameters of `model` and shorten the run to 4 iterations
# with AUC as the only custom loss.
params = model.get_params()
params['iterations'] = 4
params['custom_loss'] = 'AUC'
# Remove use_best_model before calling cv(); pop() with a default avoids a
# KeyError when `model` was constructed without that parameter.
params.pop('use_best_model', None)

# Two-fold cross-validation over the full dataset with shuffling and a fixed
# partition seed.
cv_data = cv(
    params=params,
    pool=Pool(X, label=y, cat_features=cat_features),
    fold_count=2,
    type='Classical',
    shuffle=True,
    partition_random_seed=0,
)

In [None]:
import numpy as np
np.set_printoptions(precision=3)

# Print every series returned by cv(). .items() is available on dicts and on
# pandas DataFrames under both Python 2 and 3, unlike .iteritems().
for name, values in cv_data.items():
    print(name + ':')
    print(np.array(values))
    print('\n')

In [None]:
# Locate the iteration with the highest mean test AUC and report it together
# with the matching standard deviation.
best_value = np.max(cv_data['test-AUC-mean'])
best_iter = np.argmax(cv_data['test-AUC-mean'])
print('Best validation AUC score: {:.2f}±{:.2f} on step {}'.format(
    best_value,
    cv_data['test-AUC-std'][best_iter],
    best_iter
))

# Подбор параметров

In [None]:
# A longer run (1500 iterations, learning rate 0.01) with the tunable
# parameters spelled out explicitly.
best_model = CatBoostClassifier(
    random_seed=63,
    iterations=1500,
    learning_rate=0.01,
    use_best_model=True,
    one_hot_max_size=0,
    random_strength=1,
    bagging_temperature=1,
    l2_leaf_reg=3,
)
best_model.fit(
    X_train,
    y_train,
    eval_set=(X_validation, y_validation),
    cat_features=cat_features,
    plot=True,
    logging_level='Silent',
)

## Детектор переобучения

In [None]:
# Baseline for the overfitting-detector comparison: 500 iterations at learning
# rate 0.8, with no od_* settings.
model_full = CatBoostClassifier(
    random_seed=42,
    iterations=500,
    learning_rate=0.8,
    eval_metric='AUC',
)
model_full.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    plot=True,
    logging_level='Silent',
)

In [None]:
# Same settings as the baseline, plus overfitting-detector parameters
# (od_type='Iter', od_wait=20).
model_with_earlystop = CatBoostClassifier(
    random_seed=42,
    iterations=500,
    learning_rate=0.8,
    eval_metric='AUC',
    od_wait=20,
    od_type='Iter',
)

model_with_earlystop.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    plot=True,
    logging_level='Silent',
)

In [None]:
# Overfitting-detector settings as in the early-stop model, with
# use_best_model=True added.
model_with_trunk = CatBoostClassifier(
    random_seed=42,
    iterations=500,
    learning_rate=0.8,
    eval_metric='AUC',
    use_best_model=True,
    od_wait=20,
    od_type='Iter',
)

model_with_trunk.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    plot=True,
    logging_level='Silent',
)

In [None]:
# Compare the number of trees kept by each of the three models.
print('Full model tree count: {}'.format(model_full.tree_count_))
print('Early-stopped model tree count: {}'.format(model_with_earlystop.tree_count_))
print('Trunkated model tree count: {}'.format(model_with_trunk.tree_count_))

## Сравнение нескольких моделей

In [None]:
# Two 100-iteration models that differ only in learning rate.
# NOTE: the train_dir values are passed verbatim to MetricVisualizer in a later
# cell, so their spelling is kept unchanged. `name` follows the same
# 'learning_rate_<value>' pattern for both models.
model1 = CatBoostClassifier(
    learning_rate=0.9,
    iterations=100,
    train_dir='learing_rate_0.9',
    name='learning_rate_0.9'
)

model2 = CatBoostClassifier(
    learning_rate=0.1,
    iterations=100,
    train_dir='learing_rate_0.1',
    name='learning_rate_0.1'
)

In [None]:
# Fit both models on the same train / validation split with verbose logging.
model1.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    logging_level='Verbose',
)
model2.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    logging_level='Verbose',
)

In [None]:
from catboost import MetricVisualizer
# Point the visualizer at the two training directories used by model1 and
# model2, then start it.
widget = MetricVisualizer(
    [
        'learing_rate_0.9',
        'learing_rate_0.1',
    ]
)
widget.start()

## Снепшоты

In [None]:
# A 40-iteration run with snapshot saving enabled; the snapshot file is
# 'snapshot.bkp'.
model = CatBoostClassifier(
    random_seed=43,
    iterations=40,
    snapshot_file='snapshot.bkp',
    save_snapshot=True,
)
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    logging_level='Verbose',
)

# Предсказания модели

In [None]:
# Print the output of predict_proba() for every object in pool1.
print(model.predict_proba(pool1))

In [None]:
# Collect the staged predictions (one entry per evaluation step, eval_period=1)
# and print the last entry.
predictions = list(
    model.staged_predict_proba(
        pool1,
        ntree_start=0,
        ntree_end=0,
        eval_period=1,
        thread_count=2,
    )
)
print(predictions[-1])

# Вычисление метрик на новом датасете

In [None]:
# Evaluate Logloss, AUC and Accuracy on pool1, using the model's tree count as
# the evaluation period, then print the AUC values.
tree_count = model.tree_count_
metrics = model.eval_metrics(
    pool1,
    metrics=['Logloss', 'AUC', 'Accuracy'],
    ntree_start=0,
    ntree_end=0,
    eval_period=tree_count,
    thread_count=2,
)
auc = metrics['AUC']
print(auc)

# Важность факторов

In [None]:
# Find the most important features: print best_model's importances next to
# the feature names taken from pool1.
importances = best_model.feature_importances_
print('Feature importances: {}'.format(np.array(importances)))
print('Feature names: {}'.format(np.array(pool1.get_feature_names())))

# Сохранение модели

In [None]:
# A small 10-iteration model used to demonstrate saving and loading.
my_best_model = CatBoostClassifier(
    iterations=10,
)
my_best_model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_validation, y_validation),
    logging_level='Verbose',
)

In [None]:
# Write the fitted model to `catboost_model.bin` (path relative to the working directory).
my_best_model.save_model('catboost_model.bin')

In [None]:
# Read the saved model back from `catboost_model.bin`, then show the
# parameters it reports.
my_best_model.load_model('catboost_model.bin')
print(my_best_model.get_params())

Теперь у вас есть время, чтобы натренировать лучшую модель. Ее результаты вы будете отправлять на Kaggle.