{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Instruction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To obtain the dataset Click used for algorithms comparison:\n",
    "\n",
    "1) Download `training.txt` file from \"Don't Get Kicked!\" competition on Kaggle: https://www.kaggle.com/c/kddcup2012-track2/data (you can download track2.zip or track2.7z archive and extract the file from it).\n",
    "\n",
    "2) Put it to the same directory as this notebook.\n",
    "\n",
    "3) Run all the cells of this notebook successively to produce files for training and testing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "resulting_train_filename = \"train\"\n",
    "resulting_test_filename = \"test\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preparing the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "import sys\n",
    "from StringIO import StringIO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with open(\"subsampling_idx.txt\") as fin:\n",
    "    ids = map(int, fin.read().split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_ids = set(ids)\n",
    "data_strings = {}\n",
    "with open('training.txt') as fin:\n",
    "    for i, string in enumerate(fin):\n",
    "        if i in unique_ids:\n",
    "            data_strings[i] = string\n",
    "            \n",
    "data_rows = []\n",
    "for i in ids:\n",
    "    data_rows.append(data_strings[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_table(StringIO(\"\".join(data_rows)), header=None).apply(np.float64)\n",
    "colnames = ['click', \n",
    "            'impression', \n",
    "            'url_hash', \n",
    "            'ad_id', \n",
    "            'advertiser_id', \n",
    "            'depth', \n",
    "            'position', \n",
    "            'query_id', \n",
    "            'keyword_id', \n",
    "            'title_id', \n",
    "            'description_id', \n",
    "            'user_id']\n",
    "data.columns = colnames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>click</th>\n",
       "      <th>impression</th>\n",
       "      <th>url_hash</th>\n",
       "      <th>ad_id</th>\n",
       "      <th>advertiser_id</th>\n",
       "      <th>depth</th>\n",
       "      <th>position</th>\n",
       "      <th>query_id</th>\n",
       "      <th>keyword_id</th>\n",
       "      <th>title_id</th>\n",
       "      <th>description_id</th>\n",
       "      <th>user_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.071003e+19</td>\n",
       "      <td>8343295.0</td>\n",
       "      <td>11700.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>7702266.0</td>\n",
       "      <td>21264.0</td>\n",
       "      <td>27892.0</td>\n",
       "      <td>1559.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.736385e+19</td>\n",
       "      <td>20017077.0</td>\n",
       "      <td>23798.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>93079.0</td>\n",
       "      <td>35498.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>36476.0</td>\n",
       "      <td>562934.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>8.915473e+18</td>\n",
       "      <td>21348354.0</td>\n",
       "      <td>36654.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>10981.0</td>\n",
       "      <td>19975.0</td>\n",
       "      <td>36105.0</td>\n",
       "      <td>33292.0</td>\n",
       "      <td>11621116.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>4.426693e+18</td>\n",
       "      <td>20366086.0</td>\n",
       "      <td>33280.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5942.0</td>\n",
       "      <td>4057.0</td>\n",
       "      <td>4390.0</td>\n",
       "      <td>8778348.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.157260e+19</td>\n",
       "      <td>6803526.0</td>\n",
       "      <td>10790.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>9881978.0</td>\n",
       "      <td>60593.0</td>\n",
       "      <td>25242.0</td>\n",
       "      <td>1679.0</td>\n",
       "      <td>12118311.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   click  impression      url_hash       ad_id  advertiser_id  depth  \\\n",
       "0    0.0         1.0  1.071003e+19   8343295.0        11700.0    3.0   \n",
       "1    1.0         1.0  1.736385e+19  20017077.0        23798.0    1.0   \n",
       "2    0.0         1.0  8.915473e+18  21348354.0        36654.0    1.0   \n",
       "3    0.0         1.0  4.426693e+18  20366086.0        33280.0    3.0   \n",
       "4    0.0         1.0  1.157260e+19   6803526.0        10790.0    2.0   \n",
       "\n",
       "   position   query_id  keyword_id  title_id  description_id     user_id  \n",
       "0       3.0  7702266.0     21264.0   27892.0          1559.0         0.0  \n",
       "1       1.0    93079.0     35498.0       4.0         36476.0    562934.0  \n",
       "2       1.0    10981.0     19975.0   36105.0         33292.0  11621116.0  \n",
       "3       3.0        0.0      5942.0    4057.0          4390.0   8778348.0  \n",
       "4       1.0  9881978.0     60593.0   25242.0          1679.0  12118311.0  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "target = data[\"click\"].apply(lambda x: 1 if x == 0 else -1)\n",
    "data.drop([\"click\"], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preparing categorical features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "categorical_features = {1, 2, 3, 6, 7, 8, 9, 10}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def clean_string(s):\n",
    "    return \"v_\" + re.sub('[^A-Za-z0-9]+', \"_\", str(s))\n",
    "\n",
    "for i in categorical_features:\n",
    "    data[data.columns[i]] = data[data.columns[i]].apply(clean_string)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preparing train/test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_idx = pd.read_csv(\"stratified_train_idx.txt\", header=None)\n",
    "test_idx = pd.read_csv(\"stratified_test_idx.txt\", header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Xtrain = data.iloc[train_idx[0]]\n",
    "Ytrain = target.iloc[train_idx[0]]\n",
    "Xtest = data.iloc[test_idx[0]]\n",
    "Ytest = target.iloc[test_idx[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# creating file with features\n",
    "def prepare_pool(data, labels, filename):\n",
    "    X = data.values\n",
    "    y = labels.values\n",
    "    with open(filename, \"w\") as fout:\n",
    "        for i in range(data.shape[0]):\n",
    "            fout.write(str(y[i]) + \"\\t\" + \"\\t\".join(map(str, X[i])) + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "prepare_pool(Xtrain, Ytrain, resulting_train_filename)\n",
    "prepare_pool(Xtest, Ytest, resulting_test_filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with open(resulting_train_filename + '.cd', 'w') as fout:\n",
    "    fout.write('0\\tTarget\\n')\n",
    "    for cat_f_id in sorted(categorical_features):\n",
    "        fout.write('{}\\tCateg\\n'.format(cat_f_id + 1))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}