## Example of using CatBoost on text data with word2vec embedding.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/catboost/tutorials/blob/master/competition_examples/quora_w2v.ipynb)

In [2]:
import catboost
import collections
import gensim
import os
import nltk
import numpy as np
import pandas as pd
import random
import tensorflow as tf
import zipfile

from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score

from urllib import urlretrieve

In [5]:
# Working directory: the Text8 archive is downloaded here and the Quora
# train.csv is later read from the same place.
data_path = '../text8/'
try:
    os.makedirs(data_path)
except OSError:
    # An already existing directory is fine. Anything else (missing permissions,
    # a regular file occupying the path) is a real problem and must surface now
    # instead of failing later when files are written into data_path.
    if not os.path.isdir(data_path):
        raise

### Embedding

Train word2vec embeddings using Tensorflow ([from this example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/udacity/5_word2vec.ipynb)).

Load [Text8](http://mattmahoney.net/dc/textdata) data.

In [6]:
url = 'http://mattmahoney.net/dc/'
filename = 'text8.zip'
# `filename` stays the bare archive name: it is concatenated with `data_path`
# again when the archive is opened, so overwriting it with the full path returned
# by urlretrieve would yield a doubled '../text8/../text8/text8.zip' path.
archive_path = data_path + filename
# Skip the download when the archive is already on disk, so re-running the
# notebook does not fetch it again. Delete the file to force a fresh download
# (for example after an interrupted transfer).
if not os.path.exists(archive_path):
    urlretrieve(url + filename, archive_path)

In [7]:
# Read the first member of the downloaded archive and split its text on
# whitespace to obtain the corpus as a list of word tokens.
with zipfile.ZipFile(data_path + filename) as archive:
    first_member = archive.namelist()[0]
    words = tf.compat.as_str(archive.read(first_member)).split()

Build the dataset. Words outside the most frequent `vocabulary_size - 1` are replaced with the 'UNK' token.

In [8]:
vocabulary_size = 50000

# Frequency table: slot 0 is reserved for the 'UNK' token (its count is filled in
# below), followed by the vocabulary_size - 1 most frequent corpus words.
count = [['UNK', -1]]
count += collections.Counter(words).most_common(vocabulary_size - 1)

# word -> integer id, assigned in frequency-table order, so 'UNK' receives id 0.
dictionary = {}
for entry in count:
    dictionary[entry[0]] = len(dictionary)

# Encode the corpus as ids; words missing from the vocabulary map to 0 ('UNK').
data = [dictionary.get(token, 0) for token in words]
unk_count = sum(1 for token in words if token not in dictionary)

count[0][1] = unk_count
# id -> word, the inverse of `dictionary`.
reverse_dictionary = {index: token for token, index in dictionary.items()}
# The raw token list is no longer needed once the corpus is encoded.
del words

Write batch generator.

In [9]:
# Position in `data` of the next word to consume. It persists across calls so
# that successive batches keep walking through the corpus (wrapping at the end).
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch from the module-level id list `data`.

    A window of ``2 * skip_window + 1`` consecutive ids slides over `data`. For
    every window position the centre id is emitted ``num_skips`` times as an
    input, each time paired with a different, randomly chosen id from the rest
    of the window as its label.

    NOTE: this reads the global `data`, which must still be the encoded corpus.
    A later cell rebinds `data` to the Quora DataFrame, so this function cannot
    be called after that cell has run.

    Args:
        batch_size: number of (input, label) pairs; must be a multiple of num_skips.
        num_skips: pairs generated per window position; at most 2 * skip_window.
        skip_window: number of context words on each side of the centre word.

    Returns:
        (batch, labels): int32 arrays of shape (batch_size,) and (batch_size, 1).

    Raises:
        ValueError: if batch_size is not a multiple of num_skips (part of the
            output arrays would stay uninitialised) or if num_skips exceeds
            2 * skip_window (the search for an unused context position could
            never terminate).
    """
    global data_index
    if batch_size % num_skips != 0:
        raise ValueError('batch_size must be a multiple of num_skips')
    if num_skips > 2 * skip_window:
        raise ValueError('num_skips must not exceed 2 * skip_window')

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buf = collections.deque(maxlen=span)
    # `range` (not `xrange`) keeps the function usable on Python 2 and 3; the
    # ranges here are tiny, so the Python 2 list allocation is irrelevant.
    for _ in range(span):
        buf.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # start at the centre so the first draw is forced
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Redraw until we hit a window position that is neither the centre
            # nor already used for this centre word.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buf[skip_window]
            labels[i * num_skips + j, 0] = buf[target]
        # Slide the window by one word; the deque drops its oldest element.
        buf.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

Train a skip-gram model.

In [10]:
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.
num_sampled = 64      # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):
    # Inputs: centre-word ids and their context-word ids, as produced by generate_batch.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels  = tf.placeholder(tf.int32, shape=[batch_size, 1])

    # Trainable parameters: one embedding row per vocabulary entry plus the
    # weights and biases of the (sampled) softmax output layer.
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0/np.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Look up the input embeddings and score them with a sampled softmax, which
    # draws num_sampled negative classes instead of using the full vocabulary.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                                   labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))

    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # Divide every embedding row by its L2 norm. `keepdims` replaces the
    # deprecated `keep_dims` spelling, which only triggered a deprecation warning.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [11]:
num_steps = 500001

# Run the optimizer for num_steps batches and report the mean loss every
# 100000 steps. The first report also contains the loss of step 0, so it sums
# 100001 values while still dividing by 100000.
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    loss_sum = 0
    for step in range(num_steps):
        batch_data, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        _, batch_loss = session.run(
            [optimizer, loss],
            feed_dict={train_dataset: batch_data, train_labels: batch_labels})
        loss_sum += batch_loss
        if step > 0 and step % 100000 == 0:
            print('Average loss at step %d: %f' % (step, loss_sum / 100000))
            loss_sum = 0
    # Materialise the row-normalised embedding matrix as a numpy array while
    # the session is still open.
    word2vec = normalized_embeddings.eval()

Average loss at step 100000: 3.454290
Average loss at step 200000: 3.242673
Average loss at step 300000: 3.177683
Average loss at step 400000: 3.131030
Average loss at step 500000: 3.077533


Check the trained word2vec embeddings: find the nearest neighbours of the word 'car'.

In [12]:
# Negated dot products with every embedding row: the rows are normalised, so a
# smaller value means a more similar word.
car_vector = word2vec[dictionary['car']].reshape((1, -1))
distances = -np.dot(car_vector, word2vec.T)
# argsort is ascending; position 0 is skipped (presumably 'car' itself, which
# is most similar to its own vector) and the next five ids are kept.
inds = np.argsort(distances.ravel())[1:6]
nearest_words = [reverse_dictionary[i] for i in inds]
print(' '.join(nearest_words))

cars automobile train aircraft company


To improve the embeddings you can also:
1. Change the parameters of the model.
2. Use a bigger training corpus.
3. Train for longer.
4. Use a pretrained model (not necessarily word2vec).

### Dataset

Load the dataset from the [Kaggle Quora Question Pairs](https://www.kaggle.com/c/quora-question-pairs/overview) competition. The goal of this task is to determine whether the two questions in a pair are duplicates (binary classification).

In [14]:
# NOTE: this rebinds `data`, which until now held the encoded Text8 corpus.
# Missing questions are replaced by empty strings so that the string
# operations applied later work on every row.
quora_train_csv = data_path + 'train.csv'
data = pd.read_csv(quora_train_csv).fillna('')
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [15]:
# Keep the label aside, then remove it together with the identifier columns so
# that only the two question texts remain in `data`.
target = data['is_duplicate']
non_feature_columns = ['is_duplicate', 'id', 'qid1', 'qid2']
data.drop(non_feature_columns, axis=1, inplace=True)

In [16]:
def _lowercase_text(value):
    """Return `value` as lower-cased text, decoding UTF-8 byte strings first.

    Decoding has to happen before lower-casing: on a byte string `lower()` only
    touches ASCII letters, so non-ASCII capitals would stay upper-case. Values
    that are already text (every `str` on Python 3) are lower-cased directly,
    since they have nothing to decode.
    """
    if isinstance(value, bytes):
        value = value.decode('utf-8')
    return value.lower()

data.question1 = data.question1.apply(_lowercase_text)
data.question2 = data.question2.apply(_lowercase_text)

### Feature extraction

Use NLTK for tokenization and stop-word filtering.

In [None]:
# Tokenizer model and stop-word lists used by the feature extraction below.
nltk.download('punkt')
nltk.download('stopwords')
# Stored as a frozenset because the stop words are only used for membership
# tests, once per token of every question; a list would be scanned linearly
# on each of those lookups.
stop_words = frozenset(nltk.corpus.stopwords.words('english'))

Convert every question to a vector by:
1. Tokenizing it.
2. Filtering out stop-words and non-alphabetic tokens.
3. Summing the vectors of the remaining words and normalizing the result.

In [18]:
# Tiny constant added to denominators to avoid dividing by exactly zero.
EPS = 1e-100

def question2vec(s):
    """Embed question text `s` as a single vector of length `embedding_size`.

    The text is tokenized with NLTK; stop words and tokens that are not purely
    alphabetic are dropped. The word2vec rows of the remaining in-vocabulary
    tokens are summed and the sum is divided by its Euclidean norm. If no token
    survives, a constant vector whose components all equal
    1 / sqrt(embedding_size) is returned instead.
    """
    tokens = [t for t in nltk.word_tokenize(s) if t not in stop_words and t.isalpha()]
    known_vectors = [word2vec[dictionary[t]] for t in tokens if t in dictionary]
    if not known_vectors:
        # Nothing to sum: fall back to the constant unit-length vector.
        return np.ones(embedding_size) * 1.0 / embedding_size ** 0.5
    v = np.array(known_vectors).sum(axis=0)
    return v / ((v ** 2).sum() + EPS) ** 0.5

# One embedding row per question, in the row order of `data`.
question1_vec = np.array([question2vec(q) for q in data.question1.values])
question2_vec = np.array([question2vec(q) for q in data.question2.values])

Besides the normalized sum of word vectors, you can also compute the element-wise max, min and standard deviation over the word vectors of each question.

Generate features on embeddings.

In [19]:
def _minkowski_order3(x, y):
    """Minkowski distance of order 3 between two vectors."""
    return minkowski(x, y, 3)

# Distances between the two question vectors of every row. The list order
# defines the order in which the feature columns are appended to `data`.
pair_distances = [
    ('cosine', cosine),
    ('cityblock', cityblock),
    ('canberra', canberra),
    ('euclidean', euclidean),
    ('minkowski', _minkowski_order3),
    ('braycurtis', braycurtis),
]
for column, distance in pair_distances:
    data[column] = [distance(x, y) for x, y in zip(question1_vec, question2_vec)]

# Skewness and kurtosis of the components of each question vector; the
# statistic is the outer loop so the columns keep the order
# skew_q1, skew_q2, kur_q1, kur_q2.
for prefix, statistic in (('skew', skew), ('kur', kurtosis)):
    for side, vectors in (('q1', question1_vec), ('q2', question2_vec)):
        data[prefix + '_' + side] = [statistic(vec) for vec in vectors]

# Absolute differences of those statistics between the two questions.
for prefix in ('skew', 'kur'):
    data[prefix + '_diff'] = np.abs(data[prefix + '_q1'] - data[prefix + '_q2'])

In addition to distance metrics between the two question vectors, you can also use the vectors themselves, or their differences, as features.

Generate simple features.

In [20]:
def _n_symbols(text):
    """Length of the text, spaces included."""
    return len(text)

def _n_chars(text):
    """Length of the text with spaces removed."""
    return len(text.replace(' ', ''))

def _n_unique_chars(text):
    """Number of distinct non-space characters."""
    return len(set(text.replace(' ', '')))

def _n_words(text):
    """Number of whitespace-separated words."""
    return len(text.split())

def _n_unique_words(text):
    """Number of distinct whitespace-separated words."""
    return len(set(text.split()))

def _n_common_words(row):
    """Number of distinct words that occur in both questions of a row."""
    return len(set(row['question1'].split()) & set(row['question2'].split()))

def _n_union_words(row):
    """Number of distinct words that occur in either question of a row."""
    return len(set(row['question1'].split()) | set(row['question2'].split()))

# Each measure yields <prefix>_q1, <prefix>_q2 and their absolute difference
# <prefix>_diff, appended in exactly this order.
per_question_measures = [
    ('len', _n_symbols),
    ('len_char', _n_chars),
    ('len_uniq_char', _n_unique_chars),
    ('len_word', _n_words),
    ('len_uniq_word', _n_unique_words),
]
for prefix, measure in per_question_measures:
    q1_column, q2_column = prefix + '_q1', prefix + '_q2'
    data[q1_column] = data['question1'].apply(measure)
    data[q2_column] = data['question2'].apply(measure)
    data[prefix + '_diff'] = np.abs(data[q1_column] - data[q2_column])

# Word overlap between the two questions; EPS keeps the ratio defined when
# both questions are empty.
data['common_words'] = data.apply(_n_common_words, axis=1)
data['union_words'] = data.apply(_n_union_words, axis=1)
data['jaccard_words'] = data['common_words'] / (data['union_words'] + EPS)

### Train and check model

Split the dataset into training and validation parts.

In [21]:
# Only the numeric features are used for training; the raw question texts are dropped.
features = data.drop(['question1', 'question2'], axis=1)
# A fixed random_state makes the 80/20 split, and therefore the reported
# validation score, reproducible across runs.
train, test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

Train CatBoost and check prediction on validation part.

In [22]:
# Hyper-parameters are collected in one place so they are easy to tweak.
catboost_params = dict(
    depth=6,
    iterations=1000,
    learning_rate=0.1,
    thread_count=16,
    logging_level='Silent',
)
clf = catboost.CatBoostClassifier(**catboost_params)
clf.fit(train, y_train)

<catboost.core.CatBoostClassifier at 0x7f72ddd072d0>

In [23]:
# Column 1 of predict_proba holds the predicted probability of class 1 (duplicate).
y_pred = clf.predict_proba(test)[:, 1]
# Single-argument print(...) with %-formatting gives the same 'AUC: <value>'
# line on Python 2 and Python 3; the former print statement is a SyntaxError
# on Python 3.
print('AUC: %s' % roc_auc_score(y_test, y_pred))

AUC: 0.8268292157683419
