{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Instructions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To obtain the Internet dataset used for the algorithm comparison:\n",
    "\n",
    "1) Download the `kdd_internet_usage.arff` file from <http://www.cs.odu.edu/~mukka/cs795sum10dm/datasets/uci-20070111/nominal/kdd_internet_usage.arff>.\n",
    "\n",
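    "2) Put it in the same directory as this notebook. The split index files `stratified_train_idx.txt` and `stratified_test_idx.txt`, which the notebook reads, must be in that directory as well.\n",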
    "\n",
    "3) Run all the cells of this notebook from top to bottom to produce the files for training and testing (`train`, `test` and `train.cd`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Names of the pool files this notebook writes for training and testing.\n",
    "resulting_train_filename, resulting_test_filename = \"train\", \"test\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preparing the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import scipy.io.arff"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _bytes_to_text(value):\n",
    "    \"\"\"Decode a byte string to text; return any other value unchanged.\"\"\"\n",
    "    # On Python 2 `bytes` is `str`, so nothing is decoded there and the\n",
    "    # values stay exactly as they were loaded.\n",
    "    if isinstance(value, bytes) and not isinstance(value, str):\n",
    "        return value.decode(\"utf-8\")\n",
    "    return value\n",
    "\n",
    "\n",
    "# Pass the path instead of a handle opened with \"rb\": loadarff accepts a\n",
    "# filename and opens it itself, whereas a binary handle yields bytes lines\n",
    "# on Python 3, which its text parser is not expected to handle.\n",
    "data, meta = scipy.io.arff.loadarff(\"kdd_internet_usage.arff\")\n",
    "data = pd.DataFrame(data)\n",
    "\n",
    "# Nominal ARFF attributes come back as byte strings on Python 3. Convert them\n",
    "# to text so that later comparisons with literals such as '0' and calls to\n",
    "# str() produce the intended values.\n",
    "data = data.apply(lambda column: column.map(_bytes_to_text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Actual_Time</th>\n",
       "      <th>Age</th>\n",
       "      <th>Community_Building</th>\n",
       "      <th>Community_Membership_Family</th>\n",
       "      <th>Community_Membership_Hobbies</th>\n",
       "      <th>Community_Membership_None</th>\n",
       "      <th>Community_Membership_Other</th>\n",
       "      <th>Community_Membership_Political</th>\n",
       "      <th>Community_Membership_Professional</th>\n",
       "      <th>Community_Membership_Religious</th>\n",
       "      <th>...</th>\n",
       "      <th>Web_Page_Creation</th>\n",
       "      <th>Who_Pays_for_Access_Dont_Know</th>\n",
       "      <th>Who_Pays_for_Access_Other</th>\n",
       "      <th>Who_Pays_for_Access_Parents</th>\n",
       "      <th>Who_Pays_for_Access_School</th>\n",
       "      <th>Who_Pays_for_Access_Self</th>\n",
       "      <th>Who_Pays_for_Access_Work</th>\n",
       "      <th>Willingness_to_Pay_Fees</th>\n",
       "      <th>Years_on_Internet</th>\n",
       "      <th>who</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Consultant</td>\n",
       "      <td>41</td>\n",
       "      <td>Equally</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Other_sources</td>\n",
       "      <td>1-3_yr</td>\n",
       "      <td>93819</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>College_Student</td>\n",
       "      <td>28</td>\n",
       "      <td>Equally</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Already_paying</td>\n",
       "      <td>Under_6_mo</td>\n",
       "      <td>95708</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Other</td>\n",
       "      <td>25</td>\n",
       "      <td>More</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Other_sources</td>\n",
       "      <td>1-3_yr</td>\n",
       "      <td>97218</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Salesperson</td>\n",
       "      <td>28</td>\n",
       "      <td>More</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Already_paying</td>\n",
       "      <td>1-3_yr</td>\n",
       "      <td>91627</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>K-12_Student</td>\n",
       "      <td>17</td>\n",
       "      <td>More</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Already_paying</td>\n",
       "      <td>1-3_yr</td>\n",
       "      <td>49906</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 72 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       Actual_Time Age Community_Building Community_Membership_Family  \\\n",
       "0       Consultant  41            Equally                           0   \n",
       "1  College_Student  28            Equally                           0   \n",
       "2            Other  25               More                           1   \n",
       "3      Salesperson  28               More                           0   \n",
       "4     K-12_Student  17               More                           0   \n",
       "\n",
       "  Community_Membership_Hobbies Community_Membership_None  \\\n",
       "0                            0                         1   \n",
       "1                            0                         0   \n",
       "2                            1                         0   \n",
       "3                            0                         0   \n",
       "4                            0                         0   \n",
       "\n",
       "  Community_Membership_Other Community_Membership_Political  \\\n",
       "0                          0                              0   \n",
       "1                          0                              0   \n",
       "2                          0                              0   \n",
       "3                          1                              0   \n",
       "4                          0                              1   \n",
       "\n",
       "  Community_Membership_Professional Community_Membership_Religious  ...    \\\n",
       "0                                 0                              0  ...     \n",
       "1                                 0                              0  ...     \n",
       "2                                 1                              0  ...     \n",
       "3                                 0                              0  ...     \n",
       "4                                 1                              0  ...     \n",
       "\n",
       "  Web_Page_Creation Who_Pays_for_Access_Dont_Know Who_Pays_for_Access_Other  \\\n",
       "0               Yes                             0                         0   \n",
       "1                No                             0                         0   \n",
       "2               Yes                             0                         0   \n",
       "3               Yes                             0                         0   \n",
       "4               Yes                             0                         0   \n",
       "\n",
       "  Who_Pays_for_Access_Parents Who_Pays_for_Access_School  \\\n",
       "0                           0                          0   \n",
       "1                           0                          0   \n",
       "2                           0                          0   \n",
       "3                           0                          0   \n",
       "4                           0                          0   \n",
       "\n",
       "  Who_Pays_for_Access_Self Who_Pays_for_Access_Work Willingness_to_Pay_Fees  \\\n",
       "0                        1                        0           Other_sources   \n",
       "1                        1                        0          Already_paying   \n",
       "2                        1                        1           Other_sources   \n",
       "3                        1                        0          Already_paying   \n",
       "4                        1                        0          Already_paying   \n",
       "\n",
       "  Years_on_Internet    who  \n",
       "0            1-3_yr  93819  \n",
       "1        Under_6_mo  95708  \n",
       "2            1-3_yr  97218  \n",
       "3            1-3_yr  91627  \n",
       "4            1-3_yr  49906  \n",
       "\n",
       "[5 rows x 72 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the loaded table.\n",
    "data.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The label is +1 when the `Who_Pays_for_Access_Work` flag is '0' and -1 otherwise.\n",
    "# The flag is matched both as text and as bytes: ARFF nominal values may be byte\n",
    "# strings on Python 3, where b'0' == '0' is False and every label would become -1.\n",
    "target = data[\"Who_Pays_for_Access_Work\"].apply(lambda x: 1 if x in ('0', b'0') else -1)\n",
    "\n",
    "# The label column and the three columns that follow it are not used as features.\n",
    "# Rebind `data` instead of dropping in place so the previously displayed frame\n",
    "# object is not mutated behind the reader's back.\n",
    "columns_to_drop = [\"Who_Pays_for_Access_Work\", \"Willingness_to_Pay_Fees\", \"Years_on_Internet\", \"who\"]\n",
    "data = data.drop(columns_to_drop, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10108, 68)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, feature columns) left after removing the label and the unused columns.\n",
    "tuple(data.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preparing train/test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the predefined split; the index files have no header line.\n",
    "train_idx, test_idx = (\n",
    "    pd.read_csv(idx_path, header=None)\n",
    "    for idx_path in (\"stratified_train_idx.txt\", \"stratified_test_idx.txt\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Column 0 of each index frame holds the row positions that belong to that split.\n",
    "Xtrain, Ytrain = data.iloc[train_idx[0]], target.iloc[train_idx[0]]\n",
    "Xtest, Ytest = data.iloc[test_idx[0]], target.iloc[test_idx[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def prepare_pool(data, labels, filename):\n",
    "    \"\"\"Write a tab-separated pool file with one sample per line.\n",
    "\n",
    "    Every line holds the label first, followed by the sample's feature values.\n",
    "\n",
    "    data     -- DataFrame with the feature values\n",
    "    labels   -- Series with the labels, matched to `data` by position\n",
    "    filename -- path of the file to write (opened in \"w\" mode)\n",
    "    \"\"\"\n",
    "    feature_rows = data.values\n",
    "    label_values = labels.values\n",
    "    with open(filename, \"w\") as fout:\n",
    "        for row_number, feature_row in enumerate(feature_rows):\n",
    "            fields = \"\\t\".join(str(value) for value in feature_row)\n",
    "            fout.write(\"\\t\".join((str(label_values[row_number]), fields)) + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the training and testing pools to their configured file names.\n",
    "prepare_pool(data=Xtrain, labels=Ytrain, filename=resulting_train_filename)\n",
    "prepare_pool(data=Xtest, labels=Ytest, filename=resulting_test_filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Zero-based positions of the categorical columns in the feature table.\n",
    "categorical_features = {0, 1, 2, 11, 12, 18, 19, 20, 21, 31, 32, 33, 34, 36, 37, 38, 39, 59, 60, 61, 62}\n",
    "\n",
    "# The pool files store the label in column 0, so each feature position is\n",
    "# shifted by one in the column-description file.\n",
    "with open(resulting_train_filename + '.cd', 'w') as fout:\n",
    "    fout.write('0\\tTarget\\n')\n",
    "    fout.writelines('{}\\tCateg\\n'.format(cat_f_id + 1) for cat_f_id in sorted(categorical_features))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}