{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Instructions" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To obtain the Internet dataset used for the algorithm comparison:\n", "\n", "1) Download the `kdd_internet_usage.arff` file from http://www.cs.odu.edu/~mukka/cs795sum10dm/datasets/uci-20070111/nominal/kdd_internet_usage.arff.\n", "\n", "2) Put it in the same directory as this notebook. The files `stratified_train_idx.txt` and `stratified_test_idx.txt`, which the train/test split section reads, must be in that directory as well.\n", "\n", "3) Run all the cells of this notebook in order, from top to bottom, to produce the files for training and testing." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "resulting_train_filename = \"train\"\n", "resulting_test_filename = \"test\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preparing the data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import re\n", "import scipy.io.arff" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Open in text mode: the ARFF reader parses text lines, so a binary handle breaks on Python 3.\n", "with open(\"kdd_internet_usage.arff\", \"r\") as fin:\n", "    data, meta = scipy.io.arff.loadarff(fin)\n", "data = pd.DataFrame(data)\n", "\n", "# On Python 3 loadarff returns nominal values as bytes. Decode them so that the string\n", "# comparisons and str() calls in the cells below behave exactly as they do on Python 2\n", "# (where bytes is str and this branch is skipped).\n", "if bytes is not str:\n", "    for column in data.select_dtypes(include=[object]).columns:\n", "        data[column] = data[column].str.decode(\"utf-8\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Actual_TimeAgeCommunity_BuildingCommunity_Membership_FamilyCommunity_Membership_HobbiesCommunity_Membership_NoneCommunity_Membership_OtherCommunity_Membership_PoliticalCommunity_Membership_ProfessionalCommunity_Membership_Religious...Web_Page_CreationWho_Pays_for_Access_Dont_KnowWho_Pays_for_Access_OtherWho_Pays_for_Access_ParentsWho_Pays_for_Access_SchoolWho_Pays_for_Access_SelfWho_Pays_for_Access_WorkWillingness_to_Pay_FeesYears_on_Internetwho
0Consultant41Equally0010000...Yes000010Other_sources1-3_yr93819
1College_Student28Equally0000000...No000010Already_payingUnder_6_mo95708
2Other25More1100010...Yes000011Other_sources1-3_yr97218
3Salesperson28More0001000...Yes000010Already_paying1-3_yr91627
4K-12_Student17More0000110...Yes000010Already_paying1-3_yr49906
\n", "

5 rows × 72 columns

\n", "
" ], "text/plain": [ " Actual_Time Age Community_Building Community_Membership_Family \\\n", "0 Consultant 41 Equally 0 \n", "1 College_Student 28 Equally 0 \n", "2 Other 25 More 1 \n", "3 Salesperson 28 More 0 \n", "4 K-12_Student 17 More 0 \n", "\n", " Community_Membership_Hobbies Community_Membership_None \\\n", "0 0 1 \n", "1 0 0 \n", "2 1 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " Community_Membership_Other Community_Membership_Political \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 1 0 \n", "4 0 1 \n", "\n", " Community_Membership_Professional Community_Membership_Religious ... \\\n", "0 0 0 ... \n", "1 0 0 ... \n", "2 1 0 ... \n", "3 0 0 ... \n", "4 1 0 ... \n", "\n", " Web_Page_Creation Who_Pays_for_Access_Dont_Know Who_Pays_for_Access_Other \\\n", "0 Yes 0 0 \n", "1 No 0 0 \n", "2 Yes 0 0 \n", "3 Yes 0 0 \n", "4 Yes 0 0 \n", "\n", " Who_Pays_for_Access_Parents Who_Pays_for_Access_School \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " Who_Pays_for_Access_Self Who_Pays_for_Access_Work Willingness_to_Pay_Fees \\\n", "0 1 0 Other_sources \n", "1 1 0 Already_paying \n", "2 1 1 Other_sources \n", "3 1 0 Already_paying \n", "4 1 0 Already_paying \n", "\n", " Years_on_Internet who \n", "0 1-3_yr 93819 \n", "1 Under_6_mo 95708 \n", "2 1-3_yr 97218 \n", "3 1-3_yr 91627 \n", "4 1-3_yr 49906 \n", "\n", "[5 rows x 72 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "label_column = \"Who_Pays_for_Access_Work\"\n", "# Columns left out of the feature matrix: the label itself plus three others.\n", "dropped_columns = [label_column, \"Willingness_to_Pay_Fees\", \"Years_on_Internet\", \"who\"]\n", "\n", "# The label is 1 where the column holds '0' and -1 everywhere else.\n", "target = data[label_column].apply(lambda x: 1 if x == '0' else -1)\n", "# Build a new frame instead of dropping in place: `data` stays intact (the head shown\n", "# above remains accurate) and this cell can be re-run without a KeyError.\n", "features = data.drop(dropped_columns, axis=1)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(10108, 68)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"features.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preparing train/test split" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Each index file is read without a header row; its first column (column 0) is used below\n", "# as positional row indices.\n", "train_idx = pd.read_csv(\"stratified_train_idx.txt\", header=None)\n", "test_idx = pd.read_csv(\"stratified_test_idx.txt\", header=None)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "Xtrain = features.iloc[train_idx[0]]\n", "Ytrain = target.iloc[train_idx[0]]\n", "Xtest = features.iloc[test_idx[0]]\n", "Ytest = target.iloc[test_idx[0]]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def prepare_pool(data, labels, filename):\n", "    \"\"\"Write a tab-separated pool file with one line per row: the label, then the feature values.\n", "\n", "    data     -- DataFrame holding the feature columns\n", "    labels   -- Series holding one label per row of `data`\n", "    filename -- path of the file to write (opened in \"w\" mode, so it is overwritten)\n", "\n", "    Raises ValueError if `data` and `labels` have a different number of rows.\n", "    \"\"\"\n", "    if len(data) != len(labels):\n", "        raise ValueError(\"data and labels must have the same number of rows\")\n", "    with open(filename, \"w\") as fout:\n", "        for label, row in zip(labels.values, data.values):\n", "            fout.write(str(label) + \"\\t\" + \"\\t\".join(map(str, row)) + \"\\n\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "prepare_pool(Xtrain, Ytrain, resulting_train_filename)\n", "prepare_pool(Xtest, Ytest, resulting_test_filename)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# 0-based positions of the categorical columns within the feature matrix.\n", "categorical_features = {0, 1, 2, 11, 12, 18, 19, 20, 21, 31, 32, 33, 34, 36, 37, 38, 39, 59, 60, 61, 62}\n", "# Column description for the pool files: column 0 of each pool line is the label,\n", "# so feature i sits in pool column i + 1.\n", "with open(resulting_train_filename + '.cd', 'w') as fout:\n", "    fout.write('0\\tTarget\\n')\n", "    for cat_f_id in sorted(categorical_features):\n", "        fout.write('{}\\tCateg\\n'.format(cat_f_id + 1))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.13" } }, "nbformat": 
4, "nbformat_minor": 1 }