{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Instruction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To obtain the Click dataset used for the algorithms comparison:\n",
    "\n",
    "1) Download the `training.txt` file from the \"KDD Cup 2012, Track 2\" competition on Kaggle: https://www.kaggle.com/c/kddcup2012-track2/data (you can download the track2.zip or track2.7z archive and extract the file from it).\n",
    "\n",
    "2) Put it in the same directory as this notebook, next to `subsampling_idx.txt`, `stratified_train_idx.txt` and `stratified_test_idx.txt`.\n",
    "\n",
    "3) Run all the cells of this notebook in order to produce the files for training and testing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Names of the output files written at the end of the notebook.\n",
    "resulting_train_filename = \"train\"\n",
    "resulting_test_filename = \"test\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preparing the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "import sys\n",
    "try:\n",
    "    from StringIO import StringIO  # Python 2\n",
    "except ImportError:\n",
    "    from io import StringIO  # Python 3\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Zero-based line numbers of training.txt that make up the subsample.\n",
    "# Built as a real list because `ids` is traversed twice below (once to build\n",
    "# the lookup set, once to restore the sampling order); a lazy Python 3 `map`\n",
    "# object would already be exhausted on the second pass.\n",
    "with open(\"subsampling_idx.txt\") as fin:\n",
    "    ids = [int(token) for token in fin.read().split()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Single pass over training.txt, keeping only the requested lines.\n",
    "unique_ids = set(ids)\n",
    "data_strings = {}\n",
    "with open('training.txt') as fin:\n",
    "    for i, string in enumerate(fin):\n",
    "        if i in unique_ids:\n",
    "            data_strings[i] = string\n",
    "\n",
    "# Re-emit the lines in the order given by `ids` (repeated ids repeat the row).\n",
    "data_rows = [data_strings[i] for i in ids]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the selected tab-separated lines and cast every column to float64.\n",
    "data = pd.read_table(StringIO(\"\".join(data_rows)), header=None).apply(np.float64)\n",
    "# Column names are assigned positionally, in file order.\n",
    "colnames = ['click',\n",
    "            'impression',\n",
    "            'url_hash',\n",
    "            'ad_id',\n",
    "            'advertiser_id',\n",
    "            'depth',\n",
    "            'position',\n",
    "            'query_id',\n",
    "            'keyword_id',\n",
    "            'title_id',\n",
    "            'description_id',\n",
    "            'user_id']\n",
    "data.columns = colnames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clickimpressionurl_hashad_idadvertiser_iddepthpositionquery_idkeyword_idtitle_iddescription_iduser_id
00.01.01.071003e+198343295.011700.03.03.07702266.021264.027892.01559.00.0
11.01.01.736385e+1920017077.023798.01.01.093079.035498.04.036476.0562934.0
20.01.08.915473e+1821348354.036654.01.01.010981.019975.036105.033292.011621116.0
30.01.04.426693e+1820366086.033280.03.03.00.05942.04057.04390.08778348.0
40.01.01.157260e+196803526.010790.02.01.09881978.060593.025242.01679.012118311.0
\n", "
" ], "text/plain": [ " click impression url_hash ad_id advertiser_id depth \\\n", "0 0.0 1.0 1.071003e+19 8343295.0 11700.0 3.0 \n", "1 1.0 1.0 1.736385e+19 20017077.0 23798.0 1.0 \n", "2 0.0 1.0 8.915473e+18 21348354.0 36654.0 1.0 \n", "3 0.0 1.0 4.426693e+18 20366086.0 33280.0 3.0 \n", "4 0.0 1.0 1.157260e+19 6803526.0 10790.0 2.0 \n", "\n", " position query_id keyword_id title_id description_id user_id \n", "0 3.0 7702266.0 21264.0 27892.0 1559.0 0.0 \n", "1 1.0 93079.0 35498.0 4.0 36476.0 562934.0 \n", "2 1.0 10981.0 19975.0 36105.0 33292.0 11621116.0 \n", "3 3.0 0.0 5942.0 4057.0 4390.0 8778348.0 \n", "4 1.0 9881978.0 60593.0 25242.0 1679.0 12118311.0 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "target = data[\"click\"].apply(lambda x: 1 if x == 0 else -1)\n", "data.drop([\"click\"], axis=1, inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preparing categorical features" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "categorical_features = {1, 2, 3, 6, 7, 8, 9, 10}" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def clean_string(s):\n", " return \"v_\" + re.sub('[^A-Za-z0-9]+', \"_\", str(s))\n", "\n", "for i in categorical_features:\n", " data[data.columns[i]] = data[data.columns[i]].apply(clean_string)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preparing train/test split" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "train_idx = pd.read_csv(\"stratified_train_idx.txt\", header=None)\n", "test_idx = pd.read_csv(\"stratified_test_idx.txt\", header=None)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, 
"outputs": [], "source": [ "Xtrain = data.iloc[train_idx[0]]\n", "Ytrain = target.iloc[train_idx[0]]\n", "Xtest = data.iloc[test_idx[0]]\n", "Ytest = target.iloc[test_idx[0]]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# creating file with features\n", "def prepare_pool(data, labels, filename):\n", " X = data.values\n", " y = labels.values\n", " with open(filename, \"w\") as fout:\n", " for i in range(data.shape[0]):\n", " fout.write(str(y[i]) + \"\\t\" + \"\\t\".join(map(str, X[i])) + \"\\n\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "prepare_pool(Xtrain, Ytrain, resulting_train_filename)\n", "prepare_pool(Xtest, Ytest, resulting_test_filename)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "with open(resulting_train_filename + '.cd', 'w') as fout:\n", " fout.write('0\\tTarget\\n')\n", " for cat_f_id in sorted(categorical_features):\n", " fout.write('{}\\tCateg\\n'.format(cat_f_id + 1))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.13" } }, "nbformat": 4, "nbformat_minor": 1 }