[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/catboost/tutorials/blob/master/model_analysis/feature_statistics_tutorial.ipynb)

In [None]:
import catboost
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
%pylab inline

### Example on generated data

In [None]:
# Synthetic regression problem. `n_features` is passed to make_regression as
# `n_informative`, out of 10 generated columns in total.
n_features = 3
X, y = make_regression(n_samples=1000, n_features=10, n_informative=n_features, random_state=0)

# Quick look at the first two generated columns, coloured by the target value.
plt.scatter(X[:, 0], X[:, 1], c=y)

# Wrap the raw array in a DataFrame with readable column names.
X = pd.DataFrame(X, columns=['Column_{}'.format(i) for i in range(X.shape[1])])

# Add two categorical columns whose values are drawn uniformly at random.
# A dedicated, seeded generator is used (instead of the global np.random state)
# so that the data, the trained model and the plots below are identical on
# every "Restart & Run All".
cat_values_1 = ['A', 'B', 'C']
cat_values_2 = ['some', 'random', 'categorical', 'feature', 'values', 'testing']
cat_rng = np.random.RandomState(0)
X['CatColumn_1'] = cat_rng.choice(cat_values_1, size=len(X))
X['CatColumn_2'] = cat_rng.choice(cat_values_2, size=len(X))

In [None]:
# Preview three random rows; the fixed seed keeps the displayed rows stable
# across re-runs of the notebook.
X.sample(3, random_state=0)

### Train model and plot statistics

In [None]:
# Train a regressor on the synthetic data. Both string columns are declared as
# categorical; one_hot_max_size=300 is far above their 3 and 6 distinct values
# (the section below refers to such a column as a "One-Hot feature").
model = catboost.CatBoostRegressor(
    iterations=500,
    one_hot_max_size=300,
    cat_features=['CatColumn_1', 'CatColumn_2'],
)
# silent=True suppresses the per-iteration training log.
model.fit(X=X, y=y, silent=True)

#### Float feature

In [None]:
# Compute and plot statistics for one numeric column of the synthetic data.
feature_num = 'Column_3'
res = model.calc_feature_statistics(
    data=X,
    target=y,
    feature=feature_num,
    plot=True,
)

#### One-hot feature

In [None]:
# Same statistics for a categorical column; the list of category values to
# report on is passed explicitly via cat_feature_values.
feature_num = 'CatColumn_2'
res = model.calc_feature_statistics(
    data=X,
    target=y,
    feature=feature_num,
    cat_feature_values=cat_values_2,
    plot=True,
)

### Example on the Titanic dataset

In [None]:
from catboost.datasets import titanic

titanic_train, titanic_test = titanic()

# Keep the label aside before it is removed from the feature frame.
titanic_train_target = titanic_train['Survived']

# Columns that are not used as model inputs in this example (this includes the
# label itself).
dropped_columns = ['PassengerId', 'Survived', 'Name', 'Parch', 'Ticket', 'Cabin', 'Embarked']
titanic_train = titanic_train.drop(columns=dropped_columns)
titanic_train.head(3)

In [None]:
# Train a classifier on the reduced Titanic frame; Pclass, Sex and SibSp are
# declared as categorical features.
titanic_model = catboost.CatBoostClassifier(
    cat_features=['Pclass', 'Sex', 'SibSp'],
    one_hot_max_size=10,
    iterations=200,
)
# silent=True suppresses the per-iteration training log.
titanic_model.fit(X=titanic_train, y=titanic_train_target, silent=True)

In [None]:
# Show the dtype of every remaining column. Pclass, Sex and SibSp were passed
# to the model as cat_features above; the sections below pick one float
# feature and one categorical feature from this frame.
titanic_train.dtypes

#### Float feature

In [None]:
# Compute and plot statistics for the 'Fare' column of the Titanic data.
feature = 'Fare'
res = titanic_model.calc_feature_statistics(
    data=titanic_train,
    target=titanic_train_target,
    feature=feature,
    plot=True,
)

#### One-hot feature

In [None]:
# Same statistics for 'Sex', one of the columns declared as categorical when
# the classifier was created. No cat_feature_values list is passed here.
feature = 'Sex'
res = titanic_model.calc_feature_statistics(
    data=titanic_train,
    target=titanic_train_target,
    feature=feature,
    plot=True,
)