{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Instruction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To obtain the datasets KDD Appetency, Churn and Upselling used for algorithms comparison:\n", "\n", "1) Download `orange_small_train.data.zip` file from http://www.kdd.org/kdd-cup/view/kdd-cup-2009/Data and extract the file `orange_small_train.data`. This file contains the features shared between all the three datasets.\n", "\n", "2) Download files with labels: \n", "* `orange_small_train_appetency.labels` from http://www.kdd.org/cupfiles/KDDCupData/2009/orange_small_train_appetency.labels \n", "* `orange_small_train_churn.labels` from http://www.kdd.org/cupfiles/KDDCupData/2009/orange_small_train_churn.labels\n", "* `orange_small_train_upselling.labels` from http://www.kdd.org/cupfiles/KDDCupData/2009/orange_small_train_upselling.labels\n", "\n", "3) Put the files to the same directory as this notebook.\n", "\n", "4) Run all the cells of this notebook successively to produce files for training and testing - they will appear in corresponding folders." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "resulting_train_filename = \"train\"\n", "resulting_test_filename = \"test\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preparing the data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data = pd.read_csv(\"./orange_small_train.data\", sep = \"\\t\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Var1Var2Var3Var4Var5Var6Var7Var8Var9Var10...Var221Var222Var223Var224Var225Var226Var227Var228Var229Var230
0NaNNaNNaNNaNNaN15267NaNNaNNaN...oslkfXVEsaqjySVZNlOJyNaNNaNxb3VRAYpF2FyR07IdsN7INaNNaN
1NaNNaNNaNNaNNaN5250NaNNaNNaN...oslk2Kb5FSFLM8l689qOpNaNNaNfKCeRAYpF2FyR07IdsN7INaNNaN
2NaNNaNNaNNaNNaN52367NaNNaNNaN...Al6ZaUTNKv4yOcjySVZNlOJyNaNkG3kQu4f02N6s8fib5G6X1eUxUn6am7cNaN
3NaNNaNNaNNaNNaNNaN0NaNNaNNaN...oslkCE7uk3uLM8l689qOpNaNNaNFSa2RAYpF2FyR07IdsN7INaNNaN
4NaNNaNNaNNaNNaN10297NaNNaNNaN...oslk1J2cvxeLM8l689qOpNaNkG3kFSa2RAYpF2FyR07IdsN7Imj86NaN
\n", "

5 rows × 230 columns

\n", "
" ], "text/plain": [ " Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 ... \\\n", "0 NaN NaN NaN NaN NaN 1526 7 NaN NaN NaN ... \n", "1 NaN NaN NaN NaN NaN 525 0 NaN NaN NaN ... \n", "2 NaN NaN NaN NaN NaN 5236 7 NaN NaN NaN ... \n", "3 NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN ... \n", "4 NaN NaN NaN NaN NaN 1029 7 NaN NaN NaN ... \n", "\n", " Var221 Var222 Var223 Var224 Var225 Var226 Var227 \\\n", "0 oslk fXVEsaq jySVZNlOJy NaN NaN xb3V RAYp \n", "1 oslk 2Kb5FSF LM8l689qOp NaN NaN fKCe RAYp \n", "2 Al6ZaUT NKv4yOc jySVZNlOJy NaN kG3k Qu4f 02N6s8f \n", "3 oslk CE7uk3u LM8l689qOp NaN NaN FSa2 RAYp \n", "4 oslk 1J2cvxe LM8l689qOp NaN kG3k FSa2 RAYp \n", "\n", " Var228 Var229 Var230 \n", "0 F2FyR07IdsN7I NaN NaN \n", "1 F2FyR07IdsN7I NaN NaN \n", "2 ib5G6X1eUxUn6 am7c NaN \n", "3 F2FyR07IdsN7I NaN NaN \n", "4 F2FyR07IdsN7I mj86 NaN \n", "\n", "[5 rows x 230 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(50000, 230)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preparing categorical features" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def to_float_str(element):\n", " try:\n", " return str(float(element))\n", " except ValueError:\n", " return element" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "categorical_features = { 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228 }" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "for i in 
categorical_features:\n", "    # assign back instead of chained inplace fillna: inplace fillna on a\n", "    # column selection may silently fail to modify `data` under pandas copy-on-write\n", "    col = data.columns[i]\n", "    data[col] = data[col].fillna(\"?\").apply(to_float_str)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preparing numerical features" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# numerical columns that contain at least one missing value\n", "columns_to_impute = []\n", "for i, column in enumerate(data.columns):\n", "    if i not in categorical_features and pd.isnull(data[column]).any():\n", "        columns_to_impute.append(column)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "189" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(columns_to_impute)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "for column_name in columns_to_impute:\n", "    # indicator column: 1.0 where the original value was missing, else 0.0\n", "    data[column_name + \"_imputed\"] = pd.isnull(data[column_name]).astype(float)\n", "    # assign back instead of inplace fillna on a column selection\n", "    data[column_name] = data[column_name].fillna(0)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# cast every non-categorical column to float\n", "for i, column in enumerate(data.columns):\n", "    if i not in categorical_features:\n", "        data[column] = data[column].astype(float)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(50000, 419)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preparing train/test split" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# creating file with features\n", "def prepare_pool(data, labels, filename):\n", "    \"\"\"Write a tab-separated pool file: label first, then all feature values, one row per object.\"\"\"\n", "    X = data.values\n", "    y = labels.values\n", "    with open(filename, \"w\") as fout:\n", "        for i in range(data.shape[0]):\n", "            fout.write(str(y[i]) + \"\\t\" + \"\\t\".join(map(str, X[i])) + \"\\n\")" ] 
}, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# NOTE(review): the stratified_*_idx files are expected to already exist in the\n", "# per-dataset folders; this notebook only reads them.\n", "for dataset in [\"appetency\", \"churn\", \"upselling\"]:\n", "    # labels are read and sign-flipped, matching the original preprocessing\n", "    target = -pd.read_csv(\"./orange_small_train_\" + dataset + \".labels\", header=None)[0]\n", "    \n", "    train_rows = pd.read_csv(dataset + \"/stratified_train_idx_\" + dataset + \".txt\", header=None)[0]\n", "    test_rows = pd.read_csv(dataset + \"/stratified_test_idx_\" + dataset + \".txt\", header=None)[0]\n", "\n", "    prepare_pool(data.iloc[train_rows], target.iloc[train_rows], dataset + \"/\" + resulting_train_filename)\n", "    prepare_pool(data.iloc[test_rows], target.iloc[test_rows], dataset + \"/\" + resulting_test_filename)\n", "    \n", "    # column-description file: column 0 is the target, listed columns are categorical\n", "    with open(dataset + \"/\" + resulting_train_filename + '.cd', 'w') as fout:\n", "        fout.write('0\\tTarget\\n')\n", "        for cat_f_id in sorted(categorical_features):\n", "            fout.write('{}\\tCateg\\n'.format(cat_f_id + 1))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.13" } }, "nbformat": 4, "nbformat_minor": 1 }