# Preparing data

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/catboost/tutorials/blob/master/hyperparameters_tuning/hyperparameters_tuning.ipynb)

In [None]:
import catboost
from catboost.datasets import titanic

# Fetch the Titanic train and test frames through catboost.datasets.
titanic_train, titanic_test = titanic()

# Set the label column aside before it is removed from the training features.
titanic_train_target = titanic_train['Survived']

# Drop the columns that are not used as features; only the train frame
# carries 'Survived', so its list has that one extra entry.
titanic_train = titanic_train.drop(
    columns=['PassengerId', 'Survived', 'Name', 'Parch', 'Ticket', 'Cabin', 'Embarked'],
)
titanic_test = titanic_test.drop(
    columns=['PassengerId', 'Name', 'Parch', 'Ticket', 'Cabin', 'Embarked'],
)

titanic_train.head(3)

# Simple grid search

In [None]:
# Classifier instance that the searches below tune; configured with iterations=1000.
titanic_model = catboost.CatBoostClassifier(iterations=1000)

In [None]:
# Wrap the frames in CatBoost pools; both declare the same three categorical columns.
# Only the training pool carries a label.
train_pool = catboost.Pool(
    data=titanic_train,
    label=titanic_train_target,
    cat_features=['Pclass', 'Sex', 'SibSp'],
)
test_pool = catboost.Pool(
    data=titanic_test,
    cat_features=['Pclass', 'Sex', 'SibSp'],
)

In [None]:
# Candidate values for each tuned hyperparameter (2 * 3 * 5 = 30 combinations).
grid = dict(
    learning_rate=[0.03, 0.1],
    depth=[4, 6, 10],
    l2_leaf_reg=[1, 3, 5, 7, 9],
)

# Run the search on the training pool; plot=True requests the interactive chart.
grid_search_results = titanic_model.grid_search(
    param_grid=grid,
    X=train_pool,
    shuffle=False,
    verbose=3,
    plot=True,
)

Parameters giving the best value of the loss function:

In [None]:
# Display the 'params' entry of the dictionary returned by grid_search.
grid_search_results['params']

Available cross-validation statistics:

In [None]:
# Display the names of the series stored under the 'cv_results' entry.
grid_search_results['cv_results'].keys()

Quality estimated using cross-validation:

In [None]:
# [-1] selects the last element of the 'test-Logloss-mean' series.
grid_search_results['cv_results']['test-Logloss-mean'][-1]

The model is ready to use after the search:

In [None]:
# Compute class probabilities for the test pool with the model after the search,
# then preview the first three entries of the result.
predicted = titanic_model.predict_proba(test_pool)
predicted[:3]

# Searching over several grids

In [None]:
# Two alternative grids: each pins one bootstrap type and varies the parameter
# listed alongside it, together with the learning rate.
grid_1 = dict(
    learning_rate=[0.03, 0.1, 0.3, 0.9],
    bootstrap_type=['Bayesian'],
    bagging_temperature=[0, 1, 10],
)
grid_2 = dict(
    learning_rate=[0.03, 0.1, 0.3, 0.9],
    bootstrap_type=['Bernoulli'],
    subsample=[0.66, 0.7, 0.8],
)

# Passing a list of grids searches over all of them in a single call.
grid_search_results = titanic_model.grid_search(
    [grid_1, grid_2],
    train_pool,
    shuffle=False,
    verbose=4,
    plot=True,
)

In [None]:
# Display the 'params' entry of the result from the search over grid_1 and grid_2.
grid_search_results['params']

# Randomized search

In [None]:
from scipy import stats

class StrangeDistribution:
    """Minimal distribution-like object exposing an ``rvs`` method.

    Despite the name there is no randomness: ``rvs`` always yields the
    first of the values supplied at construction time.
    """

    def __init__(self, values):
        """Store ``values`` as given; it must support indexing with ``[0]``."""
        self.values = values

    def rvs(self):
        """Return the first element of ``self.values``."""
        first_value = self.values[0]
        return first_value

# Seed for the scipy distributions below. Without it they draw from NumPy's global
# random state, so the sampled candidates change on every "Restart & Run All".
SEARCH_SEED = 0

# bernoulli(p=0.2) shifted by loc=2 yields 2 (probability 0.8) or 3 (probability 0.2).
one_hot_max_size_distribution = stats.bernoulli(p=0.2, loc=2)
one_hot_max_size_distribution.random_state = SEARCH_SEED

# binom(n=10, p=0.2) yields integers in [0, 10].
# NOTE(review): 0 is a possible draw -- confirm CatBoost accepts depth=0,
# otherwise shift the distribution (e.g. loc=1).
depth_distribution = stats.binom(n=10, p=0.2)
# A different seed than above so the two streams are not driven by identical state.
depth_distribution.random_state = SEARCH_SEED + 1

# The mapping mixes scipy frozen distributions, a custom object exposing rvs(),
# and a plain list of candidate values.
param_distribution = {
    'one_hot_max_size': one_hot_max_size_distribution,
    'learning_rate': StrangeDistribution([0.03, 0.1]),
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'depth': depth_distribution
}

# Try n_iter=12 sampled parameter combinations on the training pool.
randomized_search_results = titanic_model.randomized_search(
    param_distribution,
    train_pool,
    n_iter=12,
    shuffle=False,
    plot=True
)

In [None]:
# Display the 'params' entry of the dictionary returned by randomized_search.
randomized_search_results['params']

In [None]:
# With search_by_train_test_split=False, every iteration of the search
# evaluates its results on cross-validation.
randomized_search_results = titanic_model.randomized_search(
    param_distribution,
    train_pool,
    search_by_train_test_split=False,
    n_iter=6,
    shuffle=False,
    plot=True,
)

In [None]:
# Display the 'params' entry of the randomized search run with search_by_train_test_split=False.
randomized_search_results['params']