{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Instruction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To obtain the Click dataset used for the algorithm comparison:\n",
    "\n",
    "1) Download the `training.txt` file from the \"KDD Cup 2012, Track 2\" competition on Kaggle: https://www.kaggle.com/c/kddcup2012-track2/data (you can download the track2.zip or track2.7z archive and extract the file from it).\n",
    "\n",
    "2) Put it in the same directory as this notebook (the notebook also reads `subsampling_idx.txt` from that directory).\n",
    "\n",
    "3) Run all the cells of this notebook in order to produce the files for training and testing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "resulting_train_filename = \"train\"\n",
    "resulting_test_filename = \"test\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preparing the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "import sys\n",
    "\n",
    "# `StringIO` lives in the `io` module on Python 3; fall back to it there.\n",
    "try:\n",
    "    from StringIO import StringIO  # Python 2\n",
    "except ImportError:\n",
    "    from io import StringIO  # Python 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 0-based line numbers of training.txt that make up the subsample.\n",
    "# Build a real list: on Python 3 `map` returns a one-shot iterator, and `ids`\n",
    "# is iterated twice below (once to build the set, once to order the rows).\n",
    "with open(\"subsampling_idx.txt\") as fin:\n",
    "    ids = [int(token) for token in fin.read().split()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_ids = set(ids)\n",
    "\n",
    "# Single pass over training.txt, keeping only the requested lines.\n",
    "data_strings = {}\n",
    "with open('training.txt') as fin:\n",
    "    for i, string in enumerate(fin):\n",
    "        if i in unique_ids:\n",
    "            data_strings[i] = string\n",
    "            # Every requested line has been collected; skip the rest of the file.\n",
    "            if len(data_strings) == len(unique_ids):\n",
    "                break\n",
    "\n",
    "# Restore the order (and any repetitions) given in subsampling_idx.txt.\n",
    "data_rows = [data_strings[i] for i in ids]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the selected rows (no header line) and convert every column to float64.\n",
    "data = pd.read_table(StringIO(\"\".join(data_rows)), header=None).apply(np.float64)\n",
    "colnames = ['click',\n",
    "            'impression',\n",
    "            'url_hash',\n",
    "            'ad_id',\n",
    "            'advertiser_id',\n",
    "            'depth',\n",
    "            'position',\n",
    "            'query_id',\n",
    "            'keyword_id',\n",
    "            'title_id',\n",
    "            'description_id',\n",
    "            'user_id']\n",
    "data.columns = colnames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", " | click | \n", "impression | \n", "url_hash | \n", "ad_id | \n", "advertiser_id | \n", "depth | \n", "position | \n", "query_id | \n", "keyword_id | \n", "title_id | \n", "description_id | \n", "user_id | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.0 | \n", "1.0 | \n", "1.071003e+19 | \n", "8343295.0 | \n", "11700.0 | \n", "3.0 | \n", "3.0 | \n", "7702266.0 | \n", "21264.0 | \n", "27892.0 | \n", "1559.0 | \n", "0.0 | \n", "
1 | \n", "1.0 | \n", "1.0 | \n", "1.736385e+19 | \n", "20017077.0 | \n", "23798.0 | \n", "1.0 | \n", "1.0 | \n", "93079.0 | \n", "35498.0 | \n", "4.0 | \n", "36476.0 | \n", "562934.0 | \n", "
2 | \n", "0.0 | \n", "1.0 | \n", "8.915473e+18 | \n", "21348354.0 | \n", "36654.0 | \n", "1.0 | \n", "1.0 | \n", "10981.0 | \n", "19975.0 | \n", "36105.0 | \n", "33292.0 | \n", "11621116.0 | \n", "
3 | \n", "0.0 | \n", "1.0 | \n", "4.426693e+18 | \n", "20366086.0 | \n", "33280.0 | \n", "3.0 | \n", "3.0 | \n", "0.0 | \n", "5942.0 | \n", "4057.0 | \n", "4390.0 | \n", "8778348.0 | \n", "
4 | \n", "0.0 | \n", "1.0 | \n", "1.157260e+19 | \n", "6803526.0 | \n", "10790.0 | \n", "2.0 | \n", "1.0 | \n", "9881978.0 | \n", "60593.0 | \n", "25242.0 | \n", "1679.0 | \n", "12118311.0 | \n", "