### Instruction

To obtain the Click dataset used for the algorithm comparison:

1) Download the `training.txt` file from the "KDD Cup 2012, Track 2" competition on Kaggle: https://www.kaggle.com/c/kddcup2012-track2/data (you can download the track2.zip or track2.7z archive and extract the file from it).

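2) Put it in the same directory as this notebook. The index files `subsampling_idx.txt`, `stratified_train_idx.txt` and `stratified_test_idx.txt`, which the cells below read, must be in that directory as well.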

3) Run all the cells of this notebook from top to bottom to produce the files for training and testing.

In [1]:
# Paths of the tab-separated pool files written by prepare_pool() below.
# The train name is also the stem of the column-description file (<name>.cd).
resulting_train_filename = "train"
resulting_test_filename = "test"

### Preparing the data

In [2]:
import pandas as pd
import numpy as np
import re
import sys
from StringIO import StringIO

In [3]:
# Load the whitespace-separated row numbers that select the subsample of
# training.txt. They are stored as a real list (not a lazy `map` object)
# because the next cell walks `ids` twice: once to build a set and once to
# restore the sampling order. A Python 3 `map` iterator would be exhausted
# after the first pass and silently yield an empty dataset.
with open("subsampling_idx.txt") as fin:
    ids = [int(token) for token in fin.read().split()]

In [4]:
# Set form of the requested line numbers for O(1) membership checks while
# streaming through the (large) training file.
unique_ids = set(ids)

# Single pass over training.txt: keep only the lines whose 0-based line
# number was requested, keyed by that line number.
with open('training.txt') as fin:
    data_strings = {
        line_no: line
        for line_no, line in enumerate(fin)
        if line_no in unique_ids
    }

# Re-emit the kept lines in the exact order given by `ids` (an id that occurs
# several times contributes its line several times).
data_rows = [data_strings[line_no] for line_no in ids]

In [5]:
# Names assigned positionally to the tab-separated fields of each kept line.
colnames = [
    'click',
    'impression',
    'url_hash',
    'ad_id',
    'advertiser_id',
    'depth',
    'position',
    'query_id',
    'keyword_id',
    'title_id',
    'description_id',
    'user_id',
]

# Parse the selected lines as a header-less table and cast every column to
# float64.
# NOTE(review): url_hash values are of the order 1e19, beyond the range in
# which float64 represents integers exactly, so distinct hashes may become
# indistinguishable after this cast -- confirm this is acceptable (it may be
# required to reproduce the published pools byte-for-byte).
subsample_buffer = StringIO("".join(data_rows))
data = pd.read_table(subsample_buffer, header=None).apply(np.float64)
data.columns = colnames

In [6]:
# Preview the first rows to check that the columns were parsed and named as expected.
data.head()

Unnamed: 0,click,impression,url_hash,ad_id,advertiser_id,depth,position,query_id,keyword_id,title_id,description_id,user_id
0,0.0,1.0,1.071003e+19,8343295.0,11700.0,3.0,3.0,7702266.0,21264.0,27892.0,1559.0,0.0
1,1.0,1.0,1.736385e+19,20017077.0,23798.0,1.0,1.0,93079.0,35498.0,4.0,36476.0,562934.0
2,0.0,1.0,8.915473e+18,21348354.0,36654.0,1.0,1.0,10981.0,19975.0,36105.0,33292.0,11621116.0
3,0.0,1.0,4.426693e+18,20366086.0,33280.0,3.0,3.0,0.0,5942.0,4057.0,4390.0,8778348.0
4,0.0,1.0,1.15726e+19,6803526.0,10790.0,2.0,1.0,9881978.0,60593.0,25242.0,1679.0,12118311.0


In [7]:
def _click_to_label(click):
    """Return the pool label for one value of the `click` column: 1 when it is 0, else -1."""
    if click == 0:
        return 1
    return -1


# NOTE(review): rows with zero clicks get label 1 and every other row gets -1;
# confirm this orientation is what the downstream comparison expects.
target = data["click"].apply(_click_to_label)

# The label must not remain among the features (mutates `data` in place, so
# this cell cannot be re-run without re-creating `data`).
data.drop("click", axis=1, inplace=True)

### Preparing categorical features

In [8]:
# Positional indices into `data.columns` (after "click" has been dropped) of
# the columns treated as categorical: url_hash, ad_id, advertiser_id,
# query_id, keyword_id, title_id, description_id and user_id.
categorical_features = {1, 2, 3, 6, 7, 8, 9, 10}

In [9]:
def clean_string(s):
    """Turn any value into an identifier-like categorical token.

    The value is converted with str(), every run of characters other than
    ASCII letters and digits is collapsed into a single underscore, and the
    result is prefixed with "v_".
    """
    sanitized = re.sub('[^A-Za-z0-9]+', "_", str(s))
    return "v_" + sanitized

# Replace each categorical column by its string-token form. This overwrites
# the columns of `data`, so running the cell twice would prefix the values twice.
for column_position in categorical_features:
    column_name = data.columns[column_position]
    data[column_name] = data[column_name].apply(clean_string)

### Preparing train/test split

In [10]:
# Row positions of the train and test parts; each file is read as a
# header-less CSV, so the positions end up in column 0 of the frame.
train_idx, test_idx = (
    pd.read_csv(index_path, header=None)
    for index_path in ("stratified_train_idx.txt", "stratified_test_idx.txt")
)

In [11]:
# Column 0 of each index frame holds the positional row numbers of that part.
train_positions = train_idx[0]
test_positions = test_idx[0]

# Select features and labels by position so that both stay aligned.
Xtrain, Ytrain = data.iloc[train_positions], target.iloc[train_positions]
Xtest, Ytest = data.iloc[test_positions], target.iloc[test_positions]

In [12]:
# Writes a pool file: one line per row, label first, then the features.
def prepare_pool(data, labels, filename):
    """Write `data` and `labels` to `filename` as tab-separated text.

    Each output line is the str() of the row's label followed by the str()
    of every feature value of that row, joined by tabs and terminated by a
    newline. Rows are paired by position; the file is overwritten.

    data     -- pandas DataFrame with the feature columns
    labels   -- pandas Series with one label per row of `data`
    filename -- path of the file to create
    """
    feature_rows = data.values
    label_values = labels.values
    with open(filename, "w") as fout:
        for row_number, feature_row in enumerate(feature_rows):
            fields = [str(label_values[row_number])]
            fields.extend(str(value) for value in feature_row)
            fout.write("\t".join(fields) + "\n")

In [13]:
# Write the train pool first, then the test pool, each to its configured path.
for pool_features, pool_labels, pool_path in (
    (Xtrain, Ytrain, resulting_train_filename),
    (Xtest, Ytest, resulting_test_filename),
):
    prepare_pool(pool_features, pool_labels, pool_path)

In [14]:
# Column-description file for the pools: column 0 is the label, and every
# categorical feature is listed under its pool column number, which is its
# feature index shifted by one because the label occupies column 0.
with open(resulting_train_filename + '.cd', 'w') as fout:
    description_lines = ['0\tTarget\n']
    description_lines.extend(
        '{}\tCateg\n'.format(cat_f_id + 1)
        for cat_f_id in sorted(categorical_features)
    )
    fout.writelines(description_lines)