{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# catboost for rust tutorial" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "!pip install -q numpy pandas catboost" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from __future__ import absolute_import, division, print_function, unicode_literals" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CatBoost version 0.14.2\n", "NumPy version 1.16.3\n", "Pandas version 0.24.2\n" ] } ], "source": [ "import catboost as cb\n", "import catboost.datasets as cbd\n", "import numpy as np\n", "import pandas as pd\n", "\n", "# print module versions for reproducibility\n", "print('CatBoost version {}'.format(cb.__version__))\n", "print('NumPy version {}'.format(np.__version__))\n", "print('Pandas version {}'.format(pd.__version__))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " Download \"Adult Data Set\" [1] from UCI Machine Learning Repository.\n", "\n", " Will return two pandas.DataFrame-s, first with train part (adult.data) and second with test part\n", " (adult.test) of the dataset.\n", "\n", " [1]: https://archive.ics.uci.edu/ml/datasets/Adult\n", " \n" ] } ], "source": [ "# We are going to use UCI Adult Data Set because it has both numerical and categorical \n", "# features and also has missing features.\n", "print(cbd.adult.__doc__)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def get_fixed_adult():\n", " train, test = cbd.adult()\n", " \n", " # CatBoost doesn't support pandas.DataFrame missing values for categorical features out \n", " # of the box (seed issue #571 on GitHub or issue MLTOOLS-2785 in internal tracker). So \n", " # we have to replace them with some designated string manually. \n", " for dataset in (train, test, ):\n", " for name in (name for name, dtype in dict(dataset.dtypes).items() if dtype == np.object):\n", " dataset[name].fillna('nan', inplace=True)\n", " \n", " X_train, y_train = train.drop('income', axis=1), train.income\n", " X_test, y_test = test.drop('income', axis=1), test.income\n", " return X_train, y_train, X_test, y_test" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "X_train, y_train, _, _ = get_fixed_adult()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-country
039.0State-gov77516.0Bachelors13.0Never-marriedAdm-clericalNot-in-familyWhiteMale2174.00.040.0United-States
150.0Self-emp-not-inc83311.0Bachelors13.0Married-civ-spouseExec-managerialHusbandWhiteMale0.00.013.0United-States
238.0Private215646.0HS-grad9.0DivorcedHandlers-cleanersNot-in-familyWhiteMale0.00.040.0United-States
353.0Private234721.011th7.0Married-civ-spouseHandlers-cleanersHusbandBlackMale0.00.040.0United-States
428.0Private338409.0Bachelors13.0Married-civ-spouseProf-specialtyWifeBlackFemale0.00.040.0Cuba
\n", "
" ], "text/plain": [ " age workclass fnlwgt education education-num \\\n", "0 39.0 State-gov 77516.0 Bachelors 13.0 \n", "1 50.0 Self-emp-not-inc 83311.0 Bachelors 13.0 \n", "2 38.0 Private 215646.0 HS-grad 9.0 \n", "3 53.0 Private 234721.0 11th 7.0 \n", "4 28.0 Private 338409.0 Bachelors 13.0 \n", "\n", " marital-status occupation relationship race sex \\\n", "0 Never-married Adm-clerical Not-in-family White Male \n", "1 Married-civ-spouse Exec-managerial Husband White Male \n", "2 Divorced Handlers-cleaners Not-in-family White Male \n", "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", "4 Married-civ-spouse Prof-specialty Wife Black Female \n", "\n", " capital-gain capital-loss hours-per-week native-country \n", "0 2174.0 0.0 40.0 United-States \n", "1 0.0 0.0 13.0 United-States \n", "2 0.0 0.0 40.0 United-States \n", "3 0.0 0.0 40.0 United-States \n", "4 0.0 0.0 40.0 Cuba " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Warning: Custom metrics will not be evaluated because there are no test datasets\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# If you want to find out how we found these parameters check \"Simple classification \n", "# example with missing feature handling and parameter tuning\" tutorial in `classification`\n", "# subdirectory of tutorials\n", "model = cb.CatBoostClassifier(\n", " class_names=('<=50K', '>50K'),\n", " loss_function='Logloss',\n", " eval_metric='AUC', \n", " custom_metric=['AUC'],\n", " iterations=100,\n", " random_seed=20181224,\n", " learning_rate=0.4234185321620083, \n", " depth=5, \n", " l2_leaf_reg=9.464266235679002)\n", "model.fit(\n", " cb.Pool(X_train, y_train, cat_features=np.where(X_train.dtypes != np.float)[0]),\n", " verbose=False)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "model.save_model('adult.cbm')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "156K\tadult.cbm\r\n" ] } ], "source": [ "!du -sh adult.cbm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We got the model, now it's time to use it via `catboost` package for Rust. Next part of the tutorial\n", "will be in a Cargo project." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }