{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Instruction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To obtain the datasets KDD Appetency, Churn and Upselling used for algorithms comparison:\n", "\n", "1) Download `orange_small_train.data.zip` file from http://www.kdd.org/kdd-cup/view/kdd-cup-2009/Data and extract the file `orange_small_train.data`. This file contains the features shared between all the three datasets.\n", "\n", "2) Download files with labels: \n", "* `orange_small_train_appetency.labels` from http://www.kdd.org/cupfiles/KDDCupData/2009/orange_small_train_appetency.labels \n", "* `orange_small_train_churn.labels` from http://www.kdd.org/cupfiles/KDDCupData/2009/orange_small_train_churn.labels\n", "* `orange_small_train_upselling.labels` from http://www.kdd.org/cupfiles/KDDCupData/2009/orange_small_train_upselling.labels\n", "\n", "3) Put the files to the same directory as this notebook.\n", "\n", "4) Run all the cells of this notebook successively to produce files for training and testing - they will appear in corresponding folders." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "resulting_train_filename = \"train\"\n", "resulting_test_filename = \"test\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preparing the data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data = pd.read_csv(\"./orange_small_train.data\", sep = \"\\t\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Var1Var2Var3Var4Var5Var6Var7Var8Var9Var10...Var221Var222Var223Var224Var225Var226Var227Var228Var229Var230
0NaNNaNNaNNaNNaN15267NaNNaNNaN...oslkfXVEsaqjySVZNlOJyNaNNaNxb3VRAYpF2FyR07IdsN7INaNNaN
1NaNNaNNaNNaNNaN5250NaNNaNNaN...oslk2Kb5FSFLM8l689qOpNaNNaNfKCeRAYpF2FyR07IdsN7INaNNaN
2NaNNaNNaNNaNNaN52367NaNNaNNaN...Al6ZaUTNKv4yOcjySVZNlOJyNaNkG3kQu4f02N6s8fib5G6X1eUxUn6am7cNaN
3NaNNaNNaNNaNNaNNaN0NaNNaNNaN...oslkCE7uk3uLM8l689qOpNaNNaNFSa2RAYpF2FyR07IdsN7INaNNaN
4NaNNaNNaNNaNNaN10297NaNNaNNaN...oslk1J2cvxeLM8l689qOpNaNkG3kFSa2RAYpF2FyR07IdsN7Imj86NaN
\n", "

5 rows × 230 columns

\n", "
" ], "text/plain": [ " Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 ... \\\n", "0 NaN NaN NaN NaN NaN 1526 7 NaN NaN NaN ... \n", "1 NaN NaN NaN NaN NaN 525 0 NaN NaN NaN ... \n", "2 NaN NaN NaN NaN NaN 5236 7 NaN NaN NaN ... \n", "3 NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN ... \n", "4 NaN NaN NaN NaN NaN 1029 7 NaN NaN NaN ... \n", "\n", " Var221 Var222 Var223 Var224 Var225 Var226 Var227 \\\n", "0 oslk fXVEsaq jySVZNlOJy NaN NaN xb3V RAYp \n", "1 oslk 2Kb5FSF LM8l689qOp NaN NaN fKCe RAYp \n", "2 Al6ZaUT NKv4yOc jySVZNlOJy NaN kG3k Qu4f 02N6s8f \n", "3 oslk CE7uk3u LM8l689qOp NaN NaN FSa2 RAYp \n", "4 oslk 1J2cvxe LM8l689qOp NaN kG3k FSa2 RAYp \n", "\n", " Var228 Var229 Var230 \n", "0 F2FyR07IdsN7I NaN NaN \n", "1 F2FyR07IdsN7I NaN NaN \n", "2 ib5G6X1eUxUn6 am7c NaN \n", "3 F2FyR07IdsN7I NaN NaN \n", "4 F2FyR07IdsN7I mj86 NaN \n", "\n", "[5 rows x 230 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(50000, 230)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preparing categorical features" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def to_float_str(element):\n", " try:\n", " return str(float(element))\n", " except ValueError:\n", " return element" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "categorical_features = { 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228 }" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "for i in 
categorical_features:\n", "    # assign back instead of chained inplace fillna: inplace fillna on a\n", "    # column selection may silently fail to modify `data` under pandas copy-on-write\n", "    col = data.columns[i]\n", "    data[col] = data[col].fillna(\"?\").apply(to_float_str)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preparing numerical features" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# numerical columns that contain at least one missing value\n", "columns_to_impute = []\n", "for i, column in enumerate(data.columns):\n", "    if i not in categorical_features and pd.isnull(data[column]).any():\n", "        columns_to_impute.append(column)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "189" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(columns_to_impute)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "for column_name in columns_to_impute:\n", "    # indicator column: 1.0 where the original value was missing, else 0.0\n", "    data[column_name + \"_imputed\"] = pd.isnull(data[column_name]).astype(float)\n", "    # assign back instead of inplace fillna on a column selection\n", "    data[column_name] = data[column_name].fillna(0)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# cast every non-categorical column to float\n", "for i, column in enumerate(data.columns):\n", "    if i not in categorical_features:\n", "        data[column] = data[column].astype(float)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(50000, 419)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preparing train/test split" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# creating file with features\n", "def prepare_pool(data, labels, filename):\n", "    \"\"\"Write a tab-separated pool file: label first, then all feature values, one row per object.\"\"\"\n", "    X = data.values\n", "    y = labels.values\n", "    with open(filename, \"w\") as fout:\n", "        for i in range(data.shape[0]):\n", "            fout.write(str(y[i]) + \"\\t\" + \"\\t\".join(map(str, X[i])) + \"\\n\")" ] 
}, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# NOTE(review): the stratified_*_idx files are expected to already exist in the\n", "# per-dataset folders; this notebook only reads them.\n", "for dataset in [\"appetency\", \"churn\", \"upselling\"]:\n", "    # labels are read and sign-flipped, matching the original preprocessing\n", "    target = -pd.read_csv(\"./orange_small_train_\" + dataset + \".labels\", header=None)[0]\n", "    \n", "    train_rows = pd.read_csv(dataset + \"/stratified_train_idx_\" + dataset + \".txt\", header=None)[0]\n", "    test_rows = pd.read_csv(dataset + \"/stratified_test_idx_\" + dataset + \".txt\", header=None)[0]\n", "\n", "    prepare_pool(data.iloc[train_rows], target.iloc[train_rows], dataset + \"/\" + resulting_train_filename)\n", "    prepare_pool(data.iloc[test_rows], target.iloc[test_rows], dataset + \"/\" + resulting_test_filename)\n", "    \n", "    # column-description file: column 0 is the target, listed columns are categorical\n", "    with open(dataset + \"/\" + resulting_train_filename + '.cd', 'w') as fout:\n", "        fout.write('0\\tTarget\\n')\n", "        for cat_f_id in sorted(categorical_features):\n", "            fout.write('{}\\tCateg\\n'.format(cat_f_id + 1))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.13" } }, "nbformat": 4, "nbformat_minor": 1 }