Natural Language Processing¶
Dataset: The Multilingual Amazon Reviews Corpus¶
This project is available in this GitLab repository.
Details about the dataset:
- It can be downloaded here.
- Description.
- License
Let's import all the necessary libraries and define the constants and functions for the analysis:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from time import time
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
random_state = 2021 - 1 - 30  # Evaluates to 1990
fontsize = 16
base_sample = 10000
max_features = 2000
nlp = spacy.load("es_core_news_sm")
def print_elapsed_minutes(start_time):
    minutes_count = (time() - start_time) / 60
    print(f"{minutes_count:.2f} min")
Import training data and print the first 5 rows:
reviews_train_data = pd.read_json("./data/dataset_es_train.json", lines=True)
reviews_train_data.head()
| | review_id | product_id | reviewer_id | stars | review_body | review_title | language | product_category |
|---|---|---|---|---|---|---|---|---|
| 0 | es_0491108 | product_es_0296024 | reviewer_es_0999081 | 1 | Nada bueno se me fue ka pantalla en menos de 8... | television Nevir | es | electronics |
| 1 | es_0869872 | product_es_0922286 | reviewer_es_0216771 | 1 | Horrible, nos tuvimos que comprar otro porque ... | Dinero tirado a la basura con esta compra | es | electronics |
| 2 | es_0811721 | product_es_0474543 | reviewer_es_0929213 | 1 | Te obligan a comprar dos unidades y te llega s... | solo llega una unidad cuando te obligan a comp... | es | drugstore |
| 3 | es_0359921 | product_es_0656090 | reviewer_es_0224702 | 1 | No entro en descalificar al vendedor, solo pue... | PRODUCTO NO RECIBIDO. | es | wireless |
| 4 | es_0068940 | product_es_0662544 | reviewer_es_0224827 | 1 | Llega tarde y co la talla equivocada | Devuelto | es | shoes |
Exploratory Data Analysis¶
reviews_train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   review_id         200000 non-null  object
 1   product_id        200000 non-null  object
 2   reviewer_id       200000 non-null  object
 3   stars             200000 non-null  int64
 4   review_body       200000 non-null  object
 5   review_title      200000 non-null  object
 6   language          200000 non-null  object
 7   product_category  200000 non-null  object
dtypes: int64(1), object(7)
memory usage: 12.2+ MB
There are no null values and the only numeric column is the review stars.
A bar plot will help visualize the type of problem:
g = sns.countplot(x="stars", data=reviews_train_data)
g.axes.set_title("Reviews by Scores", fontsize = 1.5 * fontsize)
g.set_xlabel("Stars", fontsize=fontsize)
g.set_ylabel("Count", fontsize=fontsize)
plt.show()
From the previous figure, it can be seen that it is a multi-class classification problem.
The following plot will provide a better understanding about the context of the data:
order = reviews_train_data["product_category"].value_counts().index
plt.figure(figsize=(20, 5))
g = sns.countplot(x="product_category", data=reviews_train_data, order=order)
g.axes.set_title("Reviews count by Category", fontsize = 1.5 * fontsize)
g.set_xlabel("Product Category", fontsize=fontsize)
g.set_ylabel("Count", fontsize=fontsize)
g.tick_params(labelsize=fontsize)
plt.xticks(rotation=90)
plt.show()
reviews_count = reviews_train_data[["product_category", "stars"]]\
    .groupby(["product_category", "stars"])["product_category"]\
    .count()\
    .reset_index(name="count")\
    .copy()
plt.figure(figsize=(20, 8))
g = sns.lineplot(data=reviews_count, x="product_category", y="count", hue="stars", palette = "Set1")
g.axes.set_title("Count by Product Category", fontsize = 1.5 * fontsize)
g.set_xlabel("Product Category", fontsize=fontsize)
g.set_ylabel("Count", fontsize=fontsize)
plt.xticks(rotation=90)
plt.show()
Home and Wireless are the most frequent categories; however, the frequency patterns across all categories are similar.
Some extra information about the data:
print(f"Unique languages: {reviews_train_data.language.nunique()}")
print(f"Unique products: {reviews_train_data.product_id.nunique()} ({100 * reviews_train_data.product_id.nunique() / reviews_train_data.shape[0]}%)")
print(f"Unique reviewers: {reviews_train_data.reviewer_id.nunique()} ({100 * reviews_train_data.reviewer_id.nunique() / reviews_train_data.shape[0]}%)")
Unique languages: 1
Unique products: 150938 (75.469%)
Unique reviewers: 179076 (89.538%)
In summary, it can be seen that:
- Home and Wireless are the categories with the most rows.
- The quantity of review stars is balanced across the categories.
- Almost all reviews correspond to unique users and products.
Let's set the seed (for reproducibility) to analyze the text in one review body:
np.random.seed(random_state)
random_index = np.random.randint(reviews_train_data.shape[0])
reviews_train_data.iloc[random_index]
review_id                                                  es_0018170
product_id                                         product_es_0239974
reviewer_id                                       reviewer_es_0518937
stars                                                               1
review_body         La primera vez que la metí en el lavavajillas ...
review_title                                   Lo barato sale caro...
language                                                           es
product_category                                               sports
Name: 9162, dtype: object
random_review_body = reviews_train_data.review_body[random_index]
random_review_body
'La primera vez que la metí en el lavavajillas con un programa de 50 grados, se deformó. Ya antes se había partido el asa con tan solo unos días de uso.'
Create a spaCy document to handle the text:
doc = nlp(random_review_body)
print(doc.text)
La primera vez que la metí en el lavavajillas con un programa de 50 grados, se deformó. Ya antes se había partido el asa con tan solo unos días de uso.
Listing the sentences and tokens generated by the library:
for sentence in doc.sents:
    print(sentence)
La primera vez que la metí en el lavavajillas con un programa de 50 grados, se deformó.
Ya antes se había partido el asa con tan solo unos días de uso.
tokenized_review_body = [token for token in doc]
print(tokenized_review_body)
[La, primera, vez, que, la, metí, en, el, lavavajillas, con, un, programa, de, 50, grados, ,, se, deformó, ., Ya, antes, se, había, partido, el, asa, con, tan, solo, unos, días, de, uso, .]
spaCy provides a lemmatisation implementation, so lemmatisation is the normalisation process used in this analysis.
Below, the results of the lemmatisation process are shown together with the original tokens:
for token in doc:
    # Get the token text, lemma, part-of-speech tag and dependency label
    token_text = token.text
    token_lemma = token.lemma_
    token_pos = token.pos_
    token_dep = token.dep_
    token_explanation = spacy.explain(token_pos)
    print(f"{token_text:<13}{token_lemma:<13}{token_pos:<10}{token_dep:<10}{token_explanation}")
La           La           DET       det       determiner
primera      primero      ADJ       amod      adjective
vez          vez          NOUN      nsubj     noun
que          que          SCONJ     obl       subordinating conjunction
la           lo           DET       obj       determiner
metí         meter        NOUN      acl       noun
en           en           ADP       case      adposition
el           el           DET       det       determiner
lavavajillas lavavajillas NOUN      obl       noun
con          con          ADP       case      adposition
un           uno          DET       det       determiner
programa     programar    NOUN      obl       noun
de           de           ADP       case      adposition
50           50           NUM       nummod    numeral
grados       grado        NOUN      nmod      noun
,            ,            PUNCT     punct     punctuation
se           se           PRON      obj       pronoun
deformó      deformar     VERB      ROOT      verb
.            .            PUNCT     punct     punctuation
Ya           Ya           ADV       advmod    adverb
antes        antes        ADV       advmod    adverb
se           se           PRON      obj       pronoun
había        haber        VERB      aux       verb
partido      partir       VERB      ROOT      verb
el           el           DET       det       determiner
asa          asar         PROPN     nsubj     proper noun
con          con          ADP       case      adposition
tan          tan          INTJ      advmod    interjection
solo         solo         INTJ      fixed     interjection
unos         uno          DET       det       determiner
días         día          NOUN      obl       noun
de           de           ADP       case      adposition
uso          usar         NOUN      nmod      noun
.            .            PUNCT     punct     punctuation
Excluding all stop words and punctuation marks:
for token in doc:
    if token.is_stop or token.is_punct:
        continue
    # Get the token text, lemma, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    token_lemma = token.lemma_
    token_explanation = str(spacy.explain(token_pos))
    print(f"{token_text:<13}{token_lemma:<13}{token_pos:<10}{token_dep:<10}{token_explanation}")
metí         meter        NOUN      acl       noun
lavavajillas lavavajillas NOUN      obl       noun
programa     programar    NOUN      obl       noun
50           50           NUM       nummod    numeral
grados       grado        NOUN      nmod      noun
deformó      deformar     VERB      ROOT      verb
partido      partir       VERB      ROOT      verb
asa          asar         PROPN     nsubj     proper noun
The quantity of words is reduced. However, some remaining tokens (numbers in this case) have fewer than three characters. To reduce the probability that an undesired word slips through the filter, all tokens with fewer than four characters will be discarded.
The following function is a summary of the filtering process described above:
def is_valid_token(token, min_word_length=4, exceptions=[]):
    has_min_length = len(token.lemma_) >= min_word_length
    is_exception = token.text.lower() in exceptions
    return (not token.is_stop and not token.is_punct and has_min_length) or is_exception
Creating a function to lemmatize text:
def lemmatize_text(text):
    doc = nlp(text.lower())
    lemmatized_words = [token.lemma_ for token in doc if is_valid_token(token, exceptions=["no"])]
    return " ".join(lemmatized_words)
The final result is the following:
lemmatize_text(random_review_body)
'meter lavavajillas programar grado deformar partir asar'
Note:
The word no was included as an exception to the filtering because it can be important for differentiating positive and negative reviews. The next cell illustrates why it was included: the two titles El producto no fue recibido and El producto fue recibido are compared after a raw lemmatisation (without exceptions):
def lemmatize_text_without_exceptions(text):
    doc = nlp(text.lower())
    lemmatized_words = [token.lemma_ for token in doc if is_valid_token(token)]
    return " ".join(lemmatized_words)
original_negative_text = "El producto no fue recibido"
original_positive_text = "El producto fue recibido"
print(f"Negative text: {original_negative_text} => {lemmatize_text_without_exceptions(original_negative_text)}")
print(f"Positive text: {original_positive_text} => {lemmatize_text_without_exceptions(original_positive_text)}")
Negative text: El producto no fue recibido => producto recibir
Positive text: El producto fue recibido => producto recibir
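For contrast, applying lemmatize_text (which keeps no as an exception) should preserve the negation. A quick check; the expected output is shown as comments and is approximate, since lemmas depend on the spaCy model version:
print(f"Negative text: {original_negative_text} => {lemmatize_text(original_negative_text)}")
print(f"Positive text: {original_positive_text} => {lemmatize_text(original_positive_text)}")
# Expected output (approximate):
# Negative text: El producto no fue recibido => producto no recibir
# Positive text: El producto fue recibido => producto recibir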
N-grams can provide more information about the content (and differentiate it from other content). The following function will be used a couple of cells below to visualize the most frequent bigrams and trigrams.
def common_words_count(df, stars, ngram_range, max_count=40, fontsize=fontsize, figsize=(15, 8)):
    # Filter the dataset and create the count vectorizer
    data_to_study = df.copy()[df.stars.isin(stars)]
    count_vectorizer = CountVectorizer(ngram_range=ngram_range)
    fig, axes = plt.subplots(1, 2, figsize=figsize)
    fig.suptitle(f"Most frequent words for reviews with {stars} stars", fontsize=fontsize)
    # Title
    body_words_count = count_vectorizer.fit_transform(data_to_study.review_title)
    count_per_word = body_words_count.toarray().sum(axis=0)
    index_order = np.argsort(-count_per_word)
    body_words_labels = np.array(count_vectorizer.get_feature_names())[index_order][0:max_count]
    count_per_word = count_per_word[index_order][0:max_count]
    g = sns.barplot(x=body_words_labels, y=count_per_word, ax=axes[0])
    g.set_xlabel("n-grams", fontsize=fontsize)
    g.set_ylabel("Count", fontsize=fontsize)
    g.tick_params(labelsize=fontsize, labelrotation=90)
    axes[0].set_title("Most common n-grams in the title")
    # Body
    body_words_count = count_vectorizer.fit_transform(data_to_study.review_body)
    count_per_word = body_words_count.toarray().sum(axis=0)
    index_order = np.argsort(-count_per_word)
    body_words_labels = np.array(count_vectorizer.get_feature_names())[index_order][0:max_count]
    count_per_word = count_per_word[index_order][0:max_count]
    g = sns.barplot(x=body_words_labels, y=count_per_word, ax=axes[1])
    g.set_xlabel("n-grams", fontsize=fontsize)
    g.set_ylabel("Count", fontsize=fontsize)
    g.tick_params(labelsize=fontsize, labelrotation=90)
    axes[1].set_title("Most common n-grams in the body")
    plt.show()
We will proceed to:
- Lemmatize the training data (both the title and the body of each review).
- Sample 10,000 reviews to:
  - Create the benchmark model as a starting point.
  - Repeat the study and training with more complete data.
  - Optimize the hyperparameters.
- Once the hyperparameter optimization phase is completed, train a model on all the training data using the resulting hyperparameters.
Note: this procedure is performed this way because of the computational cost involved. Taking a representative sample allows a greater number of iterations when searching for the best possible hyperparameters. The size was set to 10,000 to keep a 2:1 ratio with the dev dataset (5,000 records in total).
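As an aside, if the sample were required to preserve the star distribution exactly, a stratified sample could be drawn instead of the uniform one used below. A minimal sketch, assuming the balanced five-class training set (this is not the approach taken in this notebook):
# Hypothetical alternative: sample the same number of reviews per star class
stratified_sample = (
    reviews_train_data
    .groupby("stars", group_keys=False)
    .apply(lambda g: g.sample(n=base_sample // 5, random_state=random_state))
)
print(stratified_sample.stars.value_counts())  # 2,000 reviews per class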
# Lemmatize
print("Body process:", end=" ")
start_time = time()
reviews_train_data.review_body = reviews_train_data.review_body.apply(lemmatize_text)
print_elapsed_minutes(start_time)
print("Title process:", end=" ")
start_time = time()
reviews_train_data.review_title = reviews_train_data.review_title.apply(lemmatize_text)
print_elapsed_minutes(start_time)
# Uniform weights: equivalent to a simple random sample
weights = np.ones(reviews_train_data.shape[0])
sampled_reviews_train_data = reviews_train_data.sample(n=base_sample, weights=weights, random_state=random_state)
sampled_reviews_train_data = sampled_reviews_train_data[["review_title", "review_body", "stars"]]
print(f"Size of the sample: {sampled_reviews_train_data.shape}")
g = sns.countplot(x="stars", data=sampled_reviews_train_data)
g.set_xlabel("Stars", fontsize=fontsize)
g.set_ylabel("Count", fontsize=fontsize)
plt.show()
Body process: 35.88 min
Title process: 22.55 min
Size of the sample: (10000, 3)
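The lemmatisation above is the most expensive step because .apply calls nlp once per row. A hedged sketch of a likely speed-up using spaCy's nlp.pipe batching (not used in this notebook; the helper name lemmatize_series is hypothetical):
def lemmatize_series(texts, batch_size=256):
    # Stream the texts through the pipeline in batches instead of one by one
    results = []
    for doc in nlp.pipe((text.lower() for text in texts), batch_size=batch_size):
        lemmas = [token.lemma_ for token in doc if is_valid_token(token, exceptions=["no"])]
        results.append(" ".join(lemmas))
    return results

# Usage sketch, equivalent in output to the .apply(lemmatize_text) calls above:
# reviews_train_data.review_body = lemmatize_series(reviews_train_data.review_body)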
For positive reviews (grouping ratings 4 and 5) and negative reviews (ratings 1 and 2), the following can be observed:
common_words_count(df=sampled_reviews_train_data, stars=[4,5], ngram_range=(2,3), max_count=20, fontsize=fontsize, figsize=(20, 5))
common_words_count(df=sampled_reviews_train_data, stars=[1,2], ngram_range=(2,3), max_count=20, fontsize=fontsize, figsize=(20, 5))
It can be noted that some n-grams, such as `no gustar` and `preciar no`, are shared between positive and negative reviews (in different proportions), but this is outside the scope of this study.
Benchmark Model¶
As a benchmark model we will use:
- The bag-of-words method, counting the words in the review bodies.
- An `SVC` model.
- The One vs Rest method for multi-class problems.
The strategy to be followed is described below:
- Only the text in the body of the review will be considered.
- An instance of `CountVectorizer` will be created to process and vectorize the information from both `train` and `dev`.
- The `SVC` and `RandomForestClassifier` models will be trained with the sample of the training data.
- Once training is finished, the `dev` data will be used to obtain the `accuracy` of the model.
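To make the bag-of-words idea concrete, here is a minimal, self-contained sketch of what CountVectorizer produces (the toy sentences are invented for illustration):
toy_corpus = ["producto perfecto", "producto malo muy malo"]
toy_vectorizer = CountVectorizer()
toy_counts = toy_vectorizer.fit_transform(toy_corpus)
print(toy_vectorizer.get_feature_names())  # ['malo', 'muy', 'perfecto', 'producto']
print(toy_counts.toarray())
# [[0 0 1 1]
#  [2 1 0 1]]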
# Create counter
benchmark_count_vectorizer = CountVectorizer(max_features = max_features, ngram_range=(1, 1))
# Transform
print("Vectorizing process:", end=" ")
%time matrix_body_train = benchmark_count_vectorizer.fit_transform(sampled_reviews_train_data.review_body)
X_train_bench = matrix_body_train.toarray()
y_train_bench = sampled_reviews_train_data.stars
Vectorizing process: Wall time: 176 ms
reviews_dev_data = pd.read_json("./data/dataset_es_dev.json", lines=True)
# Lemmatize
print("Body process:", end=" ")
%time reviews_dev_data.review_body = reviews_dev_data.review_body.apply(lemmatize_text)
print("Title process:", end=" ")
%time reviews_dev_data.review_title = reviews_dev_data.review_title.apply(lemmatize_text)
reviews_dev_data = reviews_dev_data[["review_title", "review_body", "stars"]]
Body process: Wall time: 51.9 s
Title process: Wall time: 33.4 s
print("Vectorizing process:", end=" ")
%time matrix_body_dev = benchmark_count_vectorizer.transform(reviews_dev_data.review_body)
X_dev_bench = matrix_body_dev.toarray()
y_dev_bench = reviews_dev_data.stars
Vectorizing process: Wall time: 85 ms
Below are two functions that will be useful to visualize the results and train the models:
# Plot confusion matrix
def confusion_multi(ytest, y_pred):
    names = ["1", "2", "3", "4", "5"]
    cm = confusion_matrix(ytest, y_pred)
    f, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(cm, annot=True, linewidth=.5, linecolor="r", fmt=".0f", ax=ax)
    plt.xlabel("y_pred")
    plt.ylabel("y_true")
    ax.set_xticklabels(names)
    ax.set_yticklabels(names)
    plt.show()
# Train model
def train_benchmark_model(model, traindata, devdata):
    X_train, y_train = traindata
    X_dev, y_dev = devdata
    # Training classifier
    print("Training", end=" ")
    start_time = time()
    model.fit(X_train, y_train)
    print_elapsed_minutes(start_time)
    # Predict
    print("Predicting", end=" ")
    start_time = time()
    predictions = model.predict(X_dev)
    print_elapsed_minutes(start_time)
    # Accuracy on dev
    accuracy = accuracy_score(y_dev, predictions)
    print(f"Accuracy for dev set: {100 * accuracy:.2f}%")
    confusion_multi(y_dev, predictions)
We execute the function to train and obtain the results of the model:
# One vs Rest + SVC
train_benchmark_model(OneVsRestClassifier(SVC(random_state=random_state), n_jobs=-1), (X_train_bench, y_train_bench), (X_dev_bench, y_dev_bench))
Training 4.04 min
Predicting 3.69 min
Accuracy for dev set: 41.76%
# One vs Rest + Random Forest
train_benchmark_model(OneVsRestClassifier(RandomForestClassifier(random_state=random_state, n_jobs=-1), n_jobs=-1), (X_train_bench, y_train_bench), (X_dev_bench, y_dev_bench))
Training 1.02 min
Predicting 0.02 min
Accuracy for dev set: 39.30%
As a benchmark we will take:
- Model: `SVC` with default values.
- Accuracy obtained: `41%`.
Model Creation and Hyperparameter Optimization¶
Considering the hardware resources available at the time of this analysis, the following steps will be followed:
- The training data will be the sample of 10,000 records obtained previously; it will be used for the hyperparameter optimization.
- Both the `SVC` and `RandomForestClassifier` models will be studied.
- For vectorization, the `CountVectorizer` and `TfidfVectorizer` classes will be used.
Due to the large volume of data, and to monitor the training time of each combination during the hyperparameter optimization phase, instead of utilities like `GridSearchCV` we will use:
- `for` loops.
- Hyperparameters optimized by training with `train` and validating with `dev`.
- The actual `accuracy` of the model estimated against the `test` data.
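As a side note, the nested for loops used below to build the hyperparameter grid could be generated more compactly with itertools.product; a minimal equivalent sketch (not the notebook's code):
from itertools import product

combinations = [
    {"ngram_range": n, "C": C, "kernel": k, "degree": d, "vectorizer_strategy": v}
    for n, C, k, d, v in product(
        [(1, 1), (1, 2)], [1.0, 2.0], ["linear", "rbf"], [2, 3], ["count", "tf_idf"]
    )
]
print(len(combinations))  # 32, matching the loop count below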
Here are two functions:
- `confusion_binary`, which will help us at the end of the analysis.
- `train_model`, an adaptation of `train_benchmark_model` that covers the cases mentioned in the list above.
def confusion_binary(ytest, y_pred):
    names = ["0", "1"]
    cm = confusion_matrix(ytest, y_pred)
    f, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(cm, annot=True, linewidth=.5, linecolor="r", fmt=".0f", ax=ax)
    plt.xlabel("y_pred")
    plt.ylabel("y_true")
    ax.set_xticklabels(names)
    ax.set_yticklabels(names)
    plt.show()
def train_model(data, classifier, title_vectorizer, body_vectorizer, return_results=True, print_confusion_matrix=False, binary=False):
    start_time = time()
    train_data, dev_data = data
    # Vectorize title and body separately, then stack the sparse feature matrices
    title_words_train = title_vectorizer.fit_transform(train_data.review_title)
    body_words_train = body_vectorizer.fit_transform(train_data.review_body)
    X_train = sparse.hstack((title_words_train, body_words_train))
    y_train = train_data.stars
    # Model generation
    classifier.fit(X_train, y_train)
    title_words_dev = title_vectorizer.transform(dev_data.review_title)
    body_words_dev = body_vectorizer.transform(dev_data.review_body)
    X_dev = sparse.hstack((title_words_dev, body_words_dev))
    y_dev = dev_data.stars
    # Predict
    predictions = classifier.predict(X_dev)
    print_elapsed_minutes(start_time)
    # Accuracy on dev
    accuracy = accuracy_score(y_dev, predictions)
    print(f"Accuracy for dev set: {100 * accuracy:.2f}%\n")
    if print_confusion_matrix:
        confusion_binary(y_dev, predictions) if binary else confusion_multi(y_dev, predictions)
    if return_results:
        return accuracy
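Note that Pipeline was imported at the top but train_model applies the vectorizers by hand. A hedged sketch of an equivalent single estimator built with ColumnTransformer (a hypothetical refactor, not used in this notebook; make_review_pipeline is an invented name):
from sklearn.compose import ColumnTransformer

def make_review_pipeline(classifier, title_vectorizer, body_vectorizer):
    # A string column selector hands the 1-D text series to each vectorizer;
    # ColumnTransformer stacks the resulting sparse matrices, like sparse.hstack
    features = ColumnTransformer([
        ("title", title_vectorizer, "review_title"),
        ("body", body_vectorizer, "review_body"),
    ])
    return Pipeline([("features", features), ("classifier", classifier)])

# Usage sketch:
# model = make_review_pipeline(OneVsRestClassifier(SVC()), TfidfVectorizer(), TfidfVectorizer())
# model.fit(sampled_reviews_train_data, sampled_reviews_train_data.stars)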
Starting the analysis for the SVC model:
ngram_ranges = [(1, 1), (1, 2)]
Cs = [1.0, 2.0]
kernels = ["linear", "rbf"]
degrees = [2, 3]
vectorizer_strategies = ["count", "tf_idf"]
combinations = []
best_params_svc = {
    "accuracy": 0
}
for ngram_range in ngram_ranges:
    for C in Cs:
        for kernel in kernels:
            for degree in degrees:
                for vectorizer_strategy in vectorizer_strategies:
                    combinations.append({
                        "ngram_range": ngram_range,
                        "C": C,
                        "kernel": kernel,
                        "degree": degree,
                        "vectorizer_strategy": vectorizer_strategy
                    })
total_combinations = len(combinations)
for loop_number, combination in enumerate(combinations):
    print(f"Loop {loop_number + 1}/{total_combinations} ({combination}) =>", end=" ")
    if combination["vectorizer_strategy"] == "tf_idf":
        body_vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=combination["ngram_range"])
        title_vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=combination["ngram_range"])
    else:
        body_vectorizer = CountVectorizer(max_features=max_features, ngram_range=combination["ngram_range"])
        title_vectorizer = CountVectorizer(max_features=max_features, ngram_range=combination["ngram_range"])
    classifier = OneVsRestClassifier(SVC(C=combination["C"], kernel=combination["kernel"], degree=combination["degree"], random_state=random_state), n_jobs=-1)
    model_accuracy = train_model((sampled_reviews_train_data, reviews_dev_data), classifier, title_vectorizer, body_vectorizer)
    if best_params_svc["accuracy"] < model_accuracy:
        best_params_svc["accuracy"] = model_accuracy
        best_params_svc["params"] = combination
        best_params_svc["classifier"] = classifier
        best_params_svc["body_vectorizer"] = body_vectorizer
        best_params_svc["title_vectorizer"] = title_vectorizer
print("\nBest Params:")
best_params_svc
Loop 1/32 ({'ngram_range': (1, 1), 'C': 1.0, 'kernel': 'linear', 'degree': 2, 'vectorizer_strategy': 'count'}) => 0.41 min
Accuracy for dev set: 43.38%
Loop 2/32 ({'ngram_range': (1, 1), 'C': 1.0, 'kernel': 'linear', 'degree': 2, 'vectorizer_strategy': 'tf_idf'}) => 0.31 min
Accuracy for dev set: 44.62%
Loop 3/32 ({'ngram_range': (1, 1), 'C': 1.0, 'kernel': 'linear', 'degree': 3, 'vectorizer_strategy': 'count'}) => 0.40 min
Accuracy for dev set: 43.38%
Loop 4/32 ({'ngram_range': (1, 1), 'C': 1.0, 'kernel': 'linear', 'degree': 3, 'vectorizer_strategy': 'tf_idf'}) => 0.31 min
Accuracy for dev set: 44.62%
Loop 5/32 ({'ngram_range': (1, 1), 'C': 1.0, 'kernel': 'rbf', 'degree': 2, 'vectorizer_strategy': 'count'}) => 0.55 min
Accuracy for dev set: 44.34%
Loop 6/32 ({'ngram_range': (1, 1), 'C': 1.0, 'kernel': 'rbf', 'degree': 2, 'vectorizer_strategy': 'tf_idf'}) => 0.56 min
Accuracy for dev set: 45.66%
Loop 7/32 ({'ngram_range': (1, 1), 'C': 1.0, 'kernel': 'rbf', 'degree': 3, 'vectorizer_strategy': 'count'}) => 0.55 min
Accuracy for dev set: 44.34%
Loop 8/32 ({'ngram_range': (1, 1), 'C': 1.0, 'kernel': 'rbf', 'degree': 3, 'vectorizer_strategy': 'tf_idf'}) => 0.56 min
Accuracy for dev set: 45.66%
Loop 9/32 ({'ngram_range': (1, 1), 'C': 2.0, 'kernel': 'linear', 'degree': 2, 'vectorizer_strategy': 'count'}) => 0.51 min
Accuracy for dev set: 42.78%
Loop 10/32 ({'ngram_range': (1, 1), 'C': 2.0, 'kernel': 'linear', 'degree': 2, 'vectorizer_strategy': 'tf_idf'}) => 0.32 min
Accuracy for dev set: 43.72%
Loop 11/32 ({'ngram_range': (1, 1), 'C': 2.0, 'kernel': 'linear', 'degree': 3, 'vectorizer_strategy': 'count'}) => 0.52 min
Accuracy for dev set: 42.78%
Loop 12/32 ({'ngram_range': (1, 1), 'C': 2.0, 'kernel': 'linear', 'degree': 3, 'vectorizer_strategy': 'tf_idf'}) => 0.32 min
Accuracy for dev set: 43.72%
Loop 13/32 ({'ngram_range': (1, 1), 'C': 2.0, 'kernel': 'rbf', 'degree': 2, 'vectorizer_strategy': 'count'}) => 0.61 min
Accuracy for dev set: 44.26%
Loop 14/32 ({'ngram_range': (1, 1), 'C': 2.0, 'kernel': 'rbf', 'degree': 2, 'vectorizer_strategy': 'tf_idf'}) => 0.71 min
Accuracy for dev set: 45.38%
Loop 15/32 ({'ngram_range': (1, 1), 'C': 2.0, 'kernel': 'rbf', 'degree': 3, 'vectorizer_strategy': 'count'}) => 0.61 min
Accuracy for dev set: 44.26%
Loop 16/32 ({'ngram_range': (1, 1), 'C': 2.0, 'kernel': 'rbf', 'degree': 3, 'vectorizer_strategy': 'tf_idf'}) => 0.69 min
Accuracy for dev set: 45.38%
Loop 17/32 ({'ngram_range': (1, 2), 'C': 1.0, 'kernel': 'linear', 'degree': 2, 'vectorizer_strategy': 'count'}) => 0.40 min
Accuracy for dev set: 42.06%
Loop 18/32 ({'ngram_range': (1, 2), 'C': 1.0, 'kernel': 'linear', 'degree': 2, 'vectorizer_strategy': 'tf_idf'}) => 0.35 min
Accuracy for dev set: 44.30%
Loop 19/32 ({'ngram_range': (1, 2), 'C': 1.0, 'kernel': 'linear', 'degree': 3, 'vectorizer_strategy': 'count'}) => 0.41 min
Accuracy for dev set: 42.06%
Loop 20/32 ({'ngram_range': (1, 2), 'C': 1.0, 'kernel': 'linear', 'degree': 3, 'vectorizer_strategy': 'tf_idf'}) => 0.34 min
Accuracy for dev set: 44.30%
Loop 21/32 ({'ngram_range': (1, 2), 'C': 1.0, 'kernel': 'rbf', 'degree': 2, 'vectorizer_strategy': 'count'}) => 0.58 min
Accuracy for dev set: 44.72%
Loop 22/32 ({'ngram_range': (1, 2), 'C': 1.0, 'kernel': 'rbf', 'degree': 2, 'vectorizer_strategy': 'tf_idf'}) => 0.61 min
Accuracy for dev set: 45.86%
Loop 23/32 ({'ngram_range': (1, 2), 'C': 1.0, 'kernel': 'rbf', 'degree': 3, 'vectorizer_strategy': 'count'}) => 0.58 min
Accuracy for dev set: 44.72%
Loop 24/32 ({'ngram_range': (1, 2), 'C': 1.0, 'kernel': 'rbf', 'degree': 3, 'vectorizer_strategy': 'tf_idf'}) => 0.61 min
Accuracy for dev set: 45.86%
Loop 25/32 ({'ngram_range': (1, 2), 'C': 2.0, 'kernel': 'linear', 'degree': 2, 'vectorizer_strategy': 'count'}) => 0.53 min
Accuracy for dev set: 41.06%
Loop 26/32 ({'ngram_range': (1, 2), 'C': 2.0, 'kernel': 'linear', 'degree': 2, 'vectorizer_strategy': 'tf_idf'}) => 0.35 min
Accuracy for dev set: 43.44%
Loop 27/32 ({'ngram_range': (1, 2), 'C': 2.0, 'kernel': 'linear', 'degree': 3, 'vectorizer_strategy': 'count'}) => 0.53 min
Accuracy for dev set: 41.06%
Loop 28/32 ({'ngram_range': (1, 2), 'C': 2.0, 'kernel': 'linear', 'degree': 3, 'vectorizer_strategy': 'tf_idf'}) => 0.35 min
Accuracy for dev set: 43.44%
Loop 29/32 ({'ngram_range': (1, 2), 'C': 2.0, 'kernel': 'rbf', 'degree': 2, 'vectorizer_strategy': 'count'}) => 0.67 min
Accuracy for dev set: 44.20%
Loop 30/32 ({'ngram_range': (1, 2), 'C': 2.0, 'kernel': 'rbf', 'degree': 2, 'vectorizer_strategy': 'tf_idf'}) => 0.76 min
Accuracy for dev set: 45.54%
Loop 31/32 ({'ngram_range': (1, 2), 'C': 2.0, 'kernel': 'rbf', 'degree': 3, 'vectorizer_strategy': 'count'}) => 0.66 min
Accuracy for dev set: 44.20%
Loop 32/32 ({'ngram_range': (1, 2), 'C': 2.0, 'kernel': 'rbf', 'degree': 3, 'vectorizer_strategy': 'tf_idf'}) => 0.76 min
Accuracy for dev set: 45.54%
Best Params:
{'accuracy': 0.4586,
'params': {'ngram_range': (1, 2),
'C': 1.0,
'kernel': 'rbf',
'degree': 2,
'vectorizer_strategy': 'tf_idf'},
'classifier': OneVsRestClassifier(estimator=SVC(degree=2, random_state=1990), n_jobs=-1),
'body_vectorizer': TfidfVectorizer(max_features=2000, ngram_range=(1, 2)),
'title_vectorizer': TfidfVectorizer(max_features=2000, ngram_range=(1, 2))}
Continuing with RandomForestClassifier:
ngram_ranges = [(1, 1), (1, 2)]
n_estimators_array = [100, 200]
criterions = ["gini", "entropy"]
max_depths = [10, 50, 100]
vectorizer_strategies = ["count", "tf_idf"]
combinations = []
best_params_random_forest = {
    "accuracy": 0
}
for n_estimators in n_estimators_array:
    for criterion in criterions:
        for max_depth in max_depths:
            for ngram_range in ngram_ranges:
                for vectorizer_strategy in vectorizer_strategies:
                    combinations.append({
                        "n_estimators": n_estimators,
                        "criterion": criterion,
                        "max_depth": max_depth,
                        "ngram_range": ngram_range,
                        "vectorizer_strategy": vectorizer_strategy
                    })
total_combinations = len(combinations)
for loop_number, combination in enumerate(combinations):
    print(f"Loop {loop_number + 1}/{total_combinations} ({combination}) =>", end=" ")
    if combination["vectorizer_strategy"] == "tf_idf":
        body_vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=combination["ngram_range"])
        title_vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=combination["ngram_range"])
    else:
        body_vectorizer = CountVectorizer(max_features=max_features, ngram_range=combination["ngram_range"])
        title_vectorizer = CountVectorizer(max_features=max_features, ngram_range=combination["ngram_range"])
    classifier = OneVsRestClassifier(RandomForestClassifier(max_depth=combination["max_depth"], criterion=combination["criterion"], n_estimators=combination["n_estimators"], n_jobs=-1, random_state=random_state), n_jobs=-1)
    model_accuracy = train_model((sampled_reviews_train_data, reviews_dev_data), classifier, title_vectorizer, body_vectorizer)
    if best_params_random_forest["accuracy"] < model_accuracy:
        best_params_random_forest["accuracy"] = model_accuracy
        best_params_random_forest["params"] = combination
        best_params_random_forest["classifier"] = classifier
        best_params_random_forest["body_vectorizer"] = body_vectorizer
        best_params_random_forest["title_vectorizer"] = title_vectorizer
print("\nBest Params for Random Forest:")
best_params_random_forest
Loop 1/48 ({'n_estimators': 100, 'criterion': 'gini', 'max_depth': 10, 'ngram_range': (1, 1), 'vectorizer_strategy': 'count'}) => 0.03 min
Accuracy for dev set: 42.32%
Loop 2/48 ({'n_estimators': 100, 'criterion': 'gini', 'max_depth': 10, 'ngram_range': (1, 1), 'vectorizer_strategy': 'tf_idf'}) => 0.03 min
Accuracy for dev set: 42.76%
Loop 3/48 ({'n_estimators': 100, 'criterion': 'gini', 'max_depth': 10, 'ngram_range': (1, 2), 'vectorizer_strategy': 'count'}) => 0.03 min
Accuracy for dev set: 42.62%
Loop 4/48 ({'n_estimators': 100, 'criterion': 'gini', 'max_depth': 10, 'ngram_range': (1, 2), 'vectorizer_strategy': 'tf_idf'}) => 0.03 min
Accuracy for dev set: 42.08%
Loop 5/48 ({'n_estimators': 100, 'criterion': 'gini', 'max_depth': 50, 'ngram_range': (1, 1), 'vectorizer_strategy': 'count'}) => 0.05 min
Accuracy for dev set: 44.58%
Loop 6/48 ({'n_estimators': 100, 'criterion': 'gini', 'max_depth': 50, 'ngram_range': (1, 1), 'vectorizer_strategy': 'tf_idf'}) => 0.05 min
Accuracy for dev set: 44.16%
Loop 7/48 ({'n_estimators': 100, 'criterion': 'gini', 'max_depth': 50, 'ngram_range': (1, 2), 'vectorizer_strategy': 'count'}) => 0.05 min
Accuracy for dev set: 44.24%
Loop 8/48 ({'n_estimators': 100, 'criterion': 'gini', 'max_depth': 50, 'ngram_range': (1, 2), 'vectorizer_strategy': 'tf_idf'}) => 0.06 min
Accuracy for dev set: 44.22%
Loop 9/48 ({'n_estimators': 100, 'criterion': 'gini', 'max_depth': 100, 'ngram_range': (1, 1), 'vectorizer_strategy': 'count'}) => 0.07 min
Accuracy for dev set: 44.10%
Loop 10/48 ({'n_estimators': 100, 'criterion': 'gini', 'max_depth': 100, 'ngram_range': (1, 1), 'vectorizer_strategy': 'tf_idf'}) => 0.07 min
Accuracy for dev set: 43.40%
Loop 11/48 ({'n_estimators': 100, 'criterion': 'gini', 'max_depth': 100, 'ngram_range': (1, 2), 'vectorizer_strategy': 'count'}) => 0.08 min
Accuracy for dev set: 44.30%
Loop 12/48 ({'n_estimators': 100, 'criterion': 'gini', 'max_depth': 100, 'ngram_range': (1, 2), 'vectorizer_strategy': 'tf_idf'}) => 0.08 min
Accuracy for dev set: 43.94%
Loop 13/48 ({'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 10, 'ngram_range': (1, 1), 'vectorizer_strategy': 'count'}) => 0.03 min
Accuracy for dev set: 42.56%
Loop 14/48 ({'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 10, 'ngram_range': (1, 1), 'vectorizer_strategy': 'tf_idf'}) => 0.03 min
Accuracy for dev set: 43.24%
Loop 15/48 ({'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 10, 'ngram_range': (1, 2), 'vectorizer_strategy': 'count'}) => 0.03 min
Accuracy for dev set: 42.86%
Loop 16/48 ({'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 10, 'ngram_range': (1, 2), 'vectorizer_strategy': 'tf_idf'}) => 0.04 min
Accuracy for dev set: 42.84%
Loop 17/48 ({'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 50, 'ngram_range': (1, 1), 'vectorizer_strategy': 'count'}) => 0.05 min
Accuracy for dev set: 44.64%
Loop 18/48 ({'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 50, 'ngram_range': (1, 1), 'vectorizer_strategy': 'tf_idf'}) => 0.05 min
Accuracy for dev set: 44.14%
Loop 19/48 ({'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 50, 'ngram_range': (1, 2), 'vectorizer_strategy': 'count'}) => 0.05 min
Accuracy for dev set: 44.24%
Loop 20/48 ({'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 50, 'ngram_range': (1, 2), 'vectorizer_strategy': 'tf_idf'}) => 0.06 min
Accuracy for dev set: 44.00%
Loop 21/48 ({'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 100, 'ngram_range': (1, 1), 'vectorizer_strategy': 'count'}) => 0.07 min
Accuracy for dev set: 43.96%
Loop 22/48 ({'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 100, 'ngram_range': (1, 1), 'vectorizer_strategy': 'tf_idf'}) => 0.08 min
Accuracy for dev set: 44.34%
Loop 23/48 ({'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 100, 'ngram_range': (1, 2), 'vectorizer_strategy': 'count'}) => 0.08 min
Accuracy for dev set: 44.22%
Loop 24/48 ({'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 100, 'ngram_range': (1, 2), 'vectorizer_strategy': 'tf_idf'}) => 0.09 min
Accuracy for dev set: 44.16%
Loop 25/48 ({'n_estimators': 200, 'criterion': 'gini', 'max_depth': 10, 'ngram_range': (1, 1), 'vectorizer_strategy': 'count'}) => 0.04 min
Accuracy for dev set: 43.06%
Loop 26/48 ({'n_estimators': 200, 'criterion': 'gini', 'max_depth': 10, 'ngram_range': (1, 1), 'vectorizer_strategy': 'tf_idf'}) => 0.04 min
Accuracy for dev set: 42.96%
Loop 27/48 ({'n_estimators': 200, 'criterion': 'gini', 'max_depth': 10, 'ngram_range': (1, 2), 'vectorizer_strategy': 'count'}) => 0.04 min
Accuracy for dev set: 42.90%
Loop 28/48 ({'n_estimators': 200, 'criterion': 'gini', 'max_depth': 10, 'ngram_range': (1, 2), 'vectorizer_strategy': 'tf_idf'}) => 0.05 min
Accuracy for dev set: 42.96%
Loop 29/48 ({'n_estimators': 200, 'criterion': 'gini', 'max_depth': 50, 'ngram_range': (1, 1), 'vectorizer_strategy': 'count'}) => 0.08 min
Accuracy for dev set: 44.82%
Loop 30/48 ({'n_estimators': 200, 'criterion': 'gini', 'max_depth': 50, 'ngram_range': (1, 1), 'vectorizer_strategy': 'tf_idf'}) => 0.08 min
Accuracy for dev set: 44.02%
Loop 31/48 ({'n_estimators': 200, 'criterion': 'gini', 'max_depth': 50, 'ngram_range': (1, 2), 'vectorizer_strategy': 'count'}) => 0.09 min
Accuracy for dev set: 44.38%
Loop 32/48 ({'n_estimators': 200, 'criterion': 'gini', 'max_depth': 50, 'ngram_range': (1, 2), 'vectorizer_strategy': 'tf_idf'}) => 0.09 min
Accuracy for dev set: 44.36%
Loop 33/48 ({'n_estimators': 200, 'criterion': 'gini', 'max_depth': 100, 'ngram_range': (1, 1), 'vectorizer_strategy': 'count'}) => 0.14 min
Accuracy for dev set: 43.96%
Loop 34/48 ({'n_estimators': 200, 'criterion': 'gini', 'max_depth': 100, 'ngram_range': (1, 1), 'vectorizer_strategy': 'tf_idf'}) => 0.14 min
Accuracy for dev set: 43.70%
Loop 35/48 ({'n_estimators': 200, 'criterion': 'gini', 'max_depth': 100, 'ngram_range': (1, 2), 'vectorizer_strategy': 'count'}) => 0.15 min
Accuracy for dev set: 44.74%
Loop 36/48 ({'n_estimators': 200, 'criterion': 'gini', 'max_depth': 100, 'ngram_range': (1, 2), 'vectorizer_strategy': 'tf_idf'}) => 0.15 min
Accuracy for dev set: 43.82%
Loop 37/48 ({'n_estimators': 200, 'criterion': 'entropy', 'max_depth': 10, 'ngram_range': (1, 1), 'vectorizer_strategy': 'count'}) => 0.04 min
Accuracy for dev set: 42.98%
Loop 38/48 ({'n_estimators': 200, 'criterion': 'entropy', 'max_depth': 10, 'ngram_range': (1, 1), 'vectorizer_strategy': 'tf_idf'}) => 0.04 min
Accuracy for dev set: 43.46%
Loop 39/48 ({'n_estimators': 200, 'criterion': 'entropy', 'max_depth': 10, 'ngram_range': (1, 2), 'vectorizer_strategy': 'count'}) => 0.04 min
Accuracy for dev set: 43.02%
Loop 40/48 ({'n_estimators': 200, 'criterion': 'entropy', 'max_depth': 10, 'ngram_range': (1, 2), 'vectorizer_strategy': 'tf_idf'}) => 0.05 min
Accuracy for dev set: 43.32%
Loop 41/48 ({'n_estimators': 200, 'criterion': 'entropy', 'max_depth': 50, 'ngram_range': (1, 1), 'vectorizer_strategy': 'count'}) => 0.08 min
Accuracy for dev set: 44.98%
Loop 42/48 ({'n_estimators': 200, 'criterion': 'entropy', 'max_depth': 50, 'ngram_range': (1, 1), 'vectorizer_strategy': 'tf_idf'}) => 0.09 min
Accuracy for dev set: 44.50%
Loop 43/48 ({'n_estimators': 200, 'criterion': 'entropy', 'max_depth': 50, 'ngram_range': (1, 2), 'vectorizer_strategy': 'count'}) => 0.09 min
Accuracy for dev set: 44.64%
Loop 44/48 ({'n_estimators': 200, 'criterion': 'entropy', 'max_depth': 50, 'ngram_range': (1, 2), 'vectorizer_strategy': 'tf_idf'}) => 0.10 min
Accuracy for dev set: 44.32%
Loop 45/48 ({'n_estimators': 200, 'criterion': 'entropy', 'max_depth': 100, 'ngram_range': (1, 1), 'vectorizer_strategy': 'count'}) => 0.14 min
Accuracy for dev set: 44.14%
Loop 46/48 ({'n_estimators': 200, 'criterion': 'entropy', 'max_depth': 100, 'ngram_range': (1, 1), 'vectorizer_strategy': 'tf_idf'}) => 0.16 min
Accuracy for dev set: 44.08%
Loop 47/48 ({'n_estimators': 200, 'criterion': 'entropy', 'max_depth': 100, 'ngram_range': (1, 2), 'vectorizer_strategy': 'count'}) => 0.15 min
Accuracy for dev set: 44.74%
Loop 48/48 ({'n_estimators': 200, 'criterion': 'entropy', 'max_depth': 100, 'ngram_range': (1, 2), 'vectorizer_strategy': 'tf_idf'}) => 0.17 min
Accuracy for dev set: 44.30%
Best Params for Random Forest:
{'accuracy': 0.4498,
'params': {'n_estimators': 200,
'criterion': 'entropy',
'max_depth': 50,
'ngram_range': (1, 1),
'vectorizer_strategy': 'count'},
'classifier': OneVsRestClassifier(estimator=RandomForestClassifier(criterion='entropy',
max_depth=50,
n_estimators=200,
n_jobs=-1,
random_state=1990),
n_jobs=-1),
'body_vectorizer': CountVectorizer(max_features=2000),
'title_vectorizer': CountVectorizer(max_features=2000)}
It can be observed that:
- The performance of the `SVC` model (45.86%) was better than `RandomForestClassifier` (44.98%); however, the difference is less than one percentage point.
- The training and validation execution time with the `SVC` model is approximately 14 times greater than with `RandomForestClassifier`.
Taking the above into account, `RandomForestClassifier` will be the model trained with the complete training data, due to its speed-performance trade-off (in production, less computing time means lower cost).
Training with the complete dataset using the chosen hyperparameters:
best_params_random_forest
{'accuracy': 0.4498,
'params': {'n_estimators': 200,
'criterion': 'entropy',
'max_depth': 50,
'ngram_range': (1, 1),
'vectorizer_strategy': 'count'},
'classifier': OneVsRestClassifier(estimator=RandomForestClassifier(criterion='entropy',
max_depth=50,
n_estimators=200,
n_jobs=-1,
random_state=1990),
n_jobs=-1),
'body_vectorizer': CountVectorizer(max_features=2000),
'title_vectorizer': CountVectorizer(max_features=2000)}
rf_best_params = best_params_random_forest["params"]
rf_body_vectorizer = CountVectorizer(max_features = max_features, ngram_range=rf_best_params["ngram_range"])
rf_title_vectorizer = CountVectorizer(max_features = max_features, ngram_range=rf_best_params["ngram_range"])
rf_classifier = OneVsRestClassifier(RandomForestClassifier(max_depth=rf_best_params["max_depth"], criterion=rf_best_params["criterion"], n_estimators=rf_best_params["n_estimators"], n_jobs=-1, random_state=random_state), n_jobs=-1)
print("Training time:", end=" ")
train_model((reviews_train_data, reviews_dev_data), rf_classifier, rf_title_vectorizer, rf_body_vectorizer, return_results=False, print_confusion_matrix=True)
Training time: 9.05 min
Accuracy for dev set: 48.18%
Estimating the actual accuracy of the model with the test data set:
# Import
reviews_test_data = pd.read_json("./data/dataset_es_test.json", lines=True)
# Process
print("Body process:", end=" ")
%time reviews_test_data.review_body = reviews_test_data.review_body.apply(lemmatize_text)
print("Title process:", end=" ")
%time reviews_test_data.review_title = reviews_test_data.review_title.apply(lemmatize_text)
reviews_test_data = reviews_test_data[["review_title", "review_body", "stars"]]
title_words_test = rf_title_vectorizer.transform(reviews_test_data.review_title)
body_words_test = rf_body_vectorizer.transform(reviews_test_data.review_body)
X_test = sparse.hstack((title_words_test, body_words_test))
y_test = reviews_test_data.stars
# Predict
print("Predict:", end=" ")
start_time = time()
predictions = rf_classifier.predict(X_test)
print_elapsed_minutes(start_time)
# Evaluation
accuracy_test = accuracy_score(y_test, predictions)
print(f"Accuracy for test set: {100 * accuracy_test:.2f}%\n")
confusion_multi(y_test, predictions)
Body process: Wall time: 1min 8s
Title process: Wall time: 39.3 s
Predict: 0.02 min
Accuracy for test set: 48.28%
In this case we obtain an accuracy of approximately 48% for the prediction model.
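For context, a quick sanity check (not part of the original analysis): the accuracy of always predicting the most frequent class. Since the test split is balanced across the five ratings, this baseline should sit around 20%, well below the 48% obtained:
# Hypothetical baseline: always predict the most frequent class in the test set
majority_class = y_test.mode()[0]
baseline_predictions = np.full(len(y_test), majority_class)
print(f"Majority-class baseline: {100 * accuracy_score(y_test, baseline_predictions):.2f}%")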
Using the properties of the classifier, we can obtain the words that are most useful for making predictions. Below are the most relevant words for predicting the 1-star and 5-star cases, respectively:
all_body_words = [f"{word} (body)" for word in rf_body_vectorizer.get_feature_names()]
all_title_words = [f"{word} (title)" for word in rf_title_vectorizer.get_feature_names()]
all_words = [*all_title_words, *all_body_words]
def plot_feature_importance(all_words, importances, use_case):
    indices = np.argsort(importances)[::-1]
    indices = indices[:30]
    selected_words = [all_words[i] for i in indices]
    selected_importances = importances[indices]
    plt.figure(figsize=(15, 8))
    g = sns.barplot(x=selected_words, y=selected_importances)
    g.set_title(f"Most relevant words for predicting {use_case}", fontsize=fontsize)
    g.set_xlabel("Words", fontsize=fontsize)
    g.set_ylabel("Importance", fontsize=fontsize)
    g.tick_params(axis="x", labelsize=fontsize, labelrotation=80)
    plt.show()
# In One vs Rest, estimators_[0] is the model for the first class (1 star)
importances = rf_classifier.estimators_[0].feature_importances_
plot_feature_importance(all_words, importances, "1 star")
# ...and estimators_[4] is the model for the last class (5 stars)
importances = rf_classifier.estimators_[4].feature_importances_
plot_feature_importance(all_words, importances, "5 stars")
Indeed, the first plot focuses on negative words (1-star reviews) and the second on words that could be considered positive (5-star reviews). However, both plots share some words, which helps explain the confusion seen in the matrix.
Something interesting can be observed in the confusion matrix of the test set: the algorithm mostly "confuses" ratings 4 with 5, and 1 with 2.
This suggests it may make sense to change focus and, instead of treating the problem as multi-class, treat it as binary (positive reviews vs negative reviews).
To do this, reviews with 3 stars will be removed, reviews with 4 and 5 stars will be grouped as positive, and reviews with 1 and 2 stars as negative.
# Keep only 1-, 2-, 4- and 5-star reviews and map the stars to binary labels:
# 1 and 2 stars -> 1 (negative), 4 and 5 stars -> 0 (positive)
binary_train_data = reviews_train_data[reviews_train_data.stars.isin([1, 2, 4, 5])].copy()
binary_train_data.stars.replace([1, 2, 4, 5], [1, 1, 0, 0], inplace=True)
binary_dev_data = reviews_dev_data[reviews_dev_data.stars.isin([1, 2, 4, 5])].copy()
binary_dev_data.stars.replace([1, 2, 4, 5], [1, 1, 0, 0], inplace=True)
fig, axes = plt.subplots(1, 2, figsize=(15,5))
fig.suptitle("Binary classes count for train and dev datasets", fontsize=fontsize)
g = sns.countplot(x="stars", data=binary_train_data, ax=axes[0])
g.set_xlabel("Classes", fontsize=fontsize)
g.set_ylabel("Count", fontsize=fontsize)
axes[0].set_title("Train dataset")
g = sns.countplot(x="stars", data=binary_dev_data, ax=axes[1])
g.set_xlabel("Classes", fontsize=fontsize)
g.set_ylabel("Count", fontsize=fontsize)
axes[1].set_title("Dev dataset")
plt.show()
Starting the training:
binary_body_vectorizer = CountVectorizer(max_features = max_features, ngram_range=rf_best_params["ngram_range"])
binary_title_vectorizer = CountVectorizer(max_features = max_features, ngram_range=rf_best_params["ngram_range"])
binary_random_forest = RandomForestClassifier(max_depth=rf_best_params["max_depth"], criterion=rf_best_params["criterion"], n_estimators=rf_best_params["n_estimators"], n_jobs=-1, random_state=random_state)
print("Training time:", end=" ")
train_model((binary_train_data, binary_dev_data), binary_random_forest, binary_title_vectorizer, binary_body_vectorizer, return_results=False, print_confusion_matrix=True, binary=True)
Training time: 1.48 min
Accuracy for dev set: 86.40%
Inspecting the most important words for the binary model:
all_body_words_binary = [f"{word} (body)" for word in binary_body_vectorizer.get_feature_names()]
all_title_words_binary = [f"{word} (title)" for word in binary_title_vectorizer.get_feature_names()]
all_words_binary = [*all_title_words_binary, *all_body_words_binary]
# Unlike the One vs Rest case, the plain RandomForestClassifier exposes a single
# feature_importances_ attribute (its estimators_ are individual trees, not
# per-class models), so one plot covers the whole binary problem
importances = binary_random_forest.feature_importances_
plot_feature_importance(all_words_binary, importances, "the binary problem")
Results Analysis¶
A brief summary may help to compare the results obtained with what was expected:
- The problem: predict the star rating of a review from its content.
- Expected results: obtain a model that can predict the number of stars a user would assign to a purchase from their review. To do this, words with a positive connotation (perfect, wonderful, delivered, etc.) are expected to be associated with high ratings such as 4 and 5, and negative words (bad, lost, faulty, etc.) with low ratings such as 1 and 2.
The Random Forest model and the One vs Rest strategy provide 5 models, each dedicated to one rating. In the plots Most relevant words for predicting 1 star and Most relevant words for predicting 5 stars, we can see the predominance of words with negative connotations (return, cheat or fatal) and positive ones (perfect, great or excellent), respectively. However, considering that the comparison involves the two extreme scores, the differences were expected to be much more marked. There are unexpected words, like "perfect" among the negatives and "bad" among the positives. These coincidences between the models help explain the accuracy obtained (48%).
One might also expect the body of the review to serve as a better guide for classification, which agrees with the plotted results. Still, it should not be forgotten that some words carry considerable weight when they appear in the title, such as 'perfect' or 'no'.
Another expected result: if the perspective of the problem is changed and it is treated as binary (positive vs negative reviews), accuracy should increase. The confusion matrix of the multi-class problem hints at this, since the confusions are mostly concentrated between ratings 4 and 5 (positive) and between 1 and 2 (negative). Converting the problem to a binary one gives a significant increase in accuracy, to 86%, confirming the hypothesis.
Conclusions¶
- The best model obtained was `RandomForestClassifier` with the following hyperparameters: `n_estimators: 200`, `criterion: 'entropy'`, `max_depth: 50`.
- For vectorization, the `Bag of Words` strategy performed better than `TF-IDF` with Random Forest, in this case with unigrams.
- The estimated performance of the model is `48%` accuracy.
- An article published on this dataset indicates that using an algorithm called `BERT` and approximately `10` hours of training, an `accuracy` of `58%` can be obtained, so the analysis presented in this notebook can be improved by using more advanced models.
- The confusion matrix gives clues that the problem can be treated as binary: positive (4 and 5) and negative (1 and 2) reviews. Doing so yields a significant improvement in `accuracy`, from `48%` to `86%`.
- The Random Forest model makes the results much easier to interpret. In the plots of the most important words for making predictions, words associated with negative feelings predominate for the negative cases, and the same can be said for the positive reviews.
- An additional point to consider in those plots is the overlap (although in different proportions) between positive and negative reviews of words such as `devolucion`, `normal` or `perfecto`, which may be causing the confusion of the model.
- Possible improvements to this analysis:
  - Use the `One vs One` multi-class strategy.
  - Investigate the effect on model performance of filtering words more rigorously (for example, removing the word `no` from positive reviews).
  - Include the product categories of the reviews in the model.
  - Investigate and apply more advanced models (such as neural networks).