15.05.2020
Anılcan Atik
The Amazon Movie Dataset contains product reviews and metadata from Amazon, including 142.8 million reviews spanning May 1996 - July 2014.
This dataset includes reviews (ratings, text, helpfulness votes) and product metadata (descriptions, category information, price, brand, and image features).
{
"reviewerID": "A2SUAM1J3GNN3B",
"asin": "0000013714",
"reviewerName": "J. McDonald",
"helpful": [2, 3],
"reviewText": "I bought this for my husband who plays the piano. He is having a wonderful time playing these old hymns. The music is at times hard to read because we think the book was published for singing from more than playing from. Great purchase though!",
"overall": 5.0,
"summary": "Heavenly Highway Hymns",
"unixReviewTime": 1252800000,
"reviewTime": "09 13, 2009"
}
import csv
import json

# download link = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Movies_and_TV_5.json.gz"
input_file = "C:/Users/Joahn/Desktop/ML Intro Term Project/Movies_and_TV_5.json"
output_file = "C:/Users/Joahn/Desktop/ML Intro Term Project/Movies_and_TV_5.csv"

# Convert the line-delimited JSON reviews into a flat CSV.
# Fixes vs. the original:
#  - the input file is opened in a `with` block (it was never closed);
#  - newline="" stops the csv module emitting blank rows on Windows;
#  - rows are indexed by the header instead of written positionally from
#    dic.values(), so records whose keys arrive in a different order (or
#    with a field missing) no longer shift into the wrong columns;
#  - the file is streamed line by line instead of readlines(), which
#    loaded the whole multi-GB dump into memory.
with open(input_file, "r", encoding="utf-8") as input_json, \
     open(output_file, "w", encoding="utf-8", newline="") as output_csv:
    csv_writer = csv.writer(output_csv)
    header = None
    for line in input_json:
        dic = json.loads(line)
        if header is None:
            # The first record fixes the column order for the whole file.
            header = list(dic)
            csv_writer.writerow(header)
        csv_writer.writerow(dic.get(key, "") for key in header)
print("Done")
Since ratings range from 1 to 5, ratings of 1-2 indicate a negative response whereas ratings of 4-5 indicate a positive response.
Thus we aim to eliminate the neutral rating 3, and re-label positive scores as +1 and negative scores as -1.
import pandas as pd
import string

# Load the converted CSV; the model only needs the review text and rating.
input_data = pd.read_csv("C:/Users/Joahn/Desktop/ML Intro Term Project/Movies_and_TV_5.csv")
input_data['overall'] = input_data['overall'].astype(object)  # fix datatype error
input_data['reviewText'] = input_data['reviewText'].astype(object)  # fix datatype error
input_data.head(3)
input_data.shape

# Narrow down to the two columns of interest and drop incomplete rows.
dataset = pd.DataFrame(data={"reviewText": input_data["reviewText"],
                             "overall": input_data["overall"]})
dataset = dataset.dropna()
dataset.head(3)
dataset.shape
Eliminating the neutral reviews of "3":
the positive label value is +1 and it includes overall ratings of 4 and 5,
while the negative label value is -1 and it includes overall ratings of 1 and 2.
# Drop the neutral rating of 3.
# BUG FIX: the original compared the column against the string "3.0"; the
# values are floats (merely cast to object dtype), so float != "3.0" was
# always True and no neutral review was actually removed.
dataset = dataset[dataset["overall"].astype(float) != 3.0]
# BUG FIX: the original labelled with str(rating) > '3', a lexicographic
# string comparison under which '3.0' > '3' is True — rating 3 would have
# been tagged positive. Compare numerically: 4-5 -> +1, 1-2 -> -1.
dataset["label"] = dataset["overall"].apply(lambda rating: 1 if float(rating) > 3 else -1)
dataset.head(3)
dataset.shape
dataset.count()
print("Number of positive reviews are {}, while number of negative reviews are {} in the dataset".format((dataset.label == 1).sum(),(dataset.label == -1).sum()))
There are far fewer negative reviews than positive reviews in our data. That might create a problematic bias towards positive reviews.
I need to investigate further whether I should use a balanced learning model or an imbalanced learning model.
# --- imbalanced sample: a flat 3% of the labelled dataset ---
dataset_i = dataset.sample(frac=0.03, replace=False, random_state=42)
dataset_i.count()
print("Number of positive reviews are {}, while number of negative reviews are {} in the dataset.".format((dataset_i.label == 1).sum(),(dataset_i.label == -1).sum()))

# --- balanced sample: downsample each class separately so the two class
# fractions yield roughly equal counts, then stack them back together ---
dataset_neg = dataset[dataset["label"] == -1].sample(frac=0.1215, replace=False, random_state=42)
dataset_pos = dataset[dataset["label"] == 1].sample(frac=0.01936, replace=False, random_state=42)
print("dataset_neg: {}, dataset_pos: {}.".format(dataset_neg.count(),dataset_pos.count()))
dataset_b = pd.concat([dataset_neg, dataset_pos])
print("dataset_b: {}".format(dataset_b.count()))
dataset_b.head()
from nltk.corpus import wordnet
def get_wordnet_pos(pos_tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Unknown tags (and empty strings) fall back to NOUN, which is also the
    WordNet lemmatizer's default part of speech.
    """
    tag_map = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Treebank tags encode the coarse class in their first character.
    return tag_map.get(pos_tag[:1], wordnet.NOUN)
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
def clean_text(text):
    """Normalize a raw review string for bag-of-words vectorization.

    Lowercases, strips surrounding punctuation from each token, drops
    tokens containing digits, removes English stop words, lemmatizes each
    token using its POS tag, and discards one-letter tokens. Returns the
    cleaned tokens joined by single spaces.
    """
    # lower text
    text = text.lower()
    # tokenize and remove punctuation; split() with no argument also breaks
    # on tabs/newlines, which the original split(" ") left glued to words
    tokens = [word.strip(string.punctuation) for word in text.split()]
    # remove words that contain numbers
    tokens = [word for word in tokens if not any(c.isdigit() for c in word)]
    # remove stop words — a set gives O(1) membership tests instead of
    # re-scanning the stopword list for every token
    stop = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop]
    # remove empty tokens left over from punctuation-only "words"
    tokens = [t for t in tokens if t]
    # pos tag text
    pos_tags = pos_tag(tokens)
    # lemmatize with a single lemmatizer instance (the original constructed
    # a fresh WordNetLemmatizer for every token)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(tok, get_wordnet_pos(tag)) for tok, tag in pos_tags]
    # remove words with only one letter
    tokens = [t for t in tokens if len(t) > 1]
    # join all
    return " ".join(tokens)
# Add a cleaned-text column to both the imbalanced and the balanced sample
# (the redundant lambda wrappers around clean_text are dropped).
dataset_i["review_clean"] = dataset_i["reviewText"].apply(clean_text)
dataset_b["review_clean"] = dataset_b["reviewText"].apply(clean_text)
dataset_i.head()
dataset_b.head()
from matplotlib import pyplot as plt
from wordcloud import WordCloud

# Word cloud over the cleaned NEGATIVE reviews of the balanced sample.
neg_reviews = dataset_b[dataset_b.label == -1]
neg_string = ' '.join(neg_reviews.review_clean)
wordcloud = WordCloud(width=1600, height=800, max_font_size=200).generate(neg_string)
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
from matplotlib import pyplot as plt
from wordcloud import WordCloud

# Word cloud over the cleaned POSITIVE reviews of the balanced sample.
pos_reviews = dataset_b[dataset_b.label == +1]
pos_string = []
for t in pos_reviews.review_clean:
    pos_string.append(t)
pos_string = pd.Series(pos_string).str.cat(sep=' ')
# BUG FIX: the original generated this cloud from neg_string, so the
# "positive" plot silently re-rendered the negative vocabulary.
wordcloud = WordCloud(width=1600, height=800, max_font_size=200).generate(pos_string)
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
from sklearn.model_selection import train_test_split

# Split both samples 70/30 into train and test sets.
# random_state=42 added for reproducibility, matching the seed used for
# every sampling step above (the original splits were unseeded, so every
# run produced different train/test partitions and different scores).
# NOTE(review): these splits use the raw reviewText column — the
# review_clean column computed earlier is never fed to the model; confirm
# whether the cleaned text was meant to be used here.
Xb = pd.DataFrame(dataset_b, columns=["reviewText"])
yb = pd.DataFrame(dataset_b, columns=["label"])
train_Xb, test_Xb, train_yb, test_yb = train_test_split(Xb, yb, test_size=0.30, random_state=42)

Xi = pd.DataFrame(dataset_i, columns=["reviewText"])
yi = pd.DataFrame(dataset_i, columns=["label"])
train_Xi, test_Xi, train_yi, test_yi = train_test_split(Xi, yi, test_size=0.30, random_state=42)

train_Xb.head(3)
train_Xi.head(3)
print("Shape of the data data : train_Xb={} train_yb={} test_Xb={} test_yb={}".format(str(train_Xb.shape),str(train_yb.shape),str(test_Xb.shape),str(test_yb.shape)))
print("Shape of the data data : train_Xi={} train_yi={} test_Xi={} test_yi={}".format(str(train_Xi.shape),str(train_yi.shape),str(test_Xi.shape),str(test_yi.shape)))
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features. token_pattern r'\b\w+\b' keeps single-character
# tokens as well (the sklearn default pattern requires 2+ word characters).
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# --- balanced sample: fit the vocabulary on the training split only,
# then transform the test split with that same vocabulary ---
train_vectorb = vectorizer.fit_transform(train_Xb["reviewText"])
test_vectorb = vectorizer.transform(test_Xb["reviewText"])
print("Vocabulary size: {}".format(len(vectorizer.vocabulary_)))
print("X_train:\n{}".format(repr(train_Xb)))
print("X_test: \n{}".format(repr(test_Xb)))
# NOTE(review): get_feature_names() is deprecated and removed in
# scikit-learn >= 1.2; newer versions require get_feature_names_out().
feature_names = vectorizer.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
# --- imbalanced sample: the SAME vectorizer object is re-fitted here, so
# from this point vectorizer.vocabulary_ / feature_names describe the
# imbalanced sample only. The balanced matrices built above stay valid
# because they were materialized before the refit — keep this ordering.
train_vectori = vectorizer.fit_transform(train_Xi["reviewText"])
test_vectori = vectorizer.transform(test_Xi["reviewText"])
print("Vocabulary size: {}".format(len(vectorizer.vocabulary_)))
print("X_train:\n{}".format(repr(train_Xi)))
print("X_test: \n{}".format(repr(test_Xi)))
feature_names = vectorizer.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
We will use LogisticRegression for model development, as it works well for high-dimensional sparse data like ours.
Grid Search: for parameter tuning of LogisticRegression. We want to determine which value of the regularization coefficient 'C' provides better accuracy.
from sklearn.linear_model import LogisticRegression

# One classifier object, fitted twice: first on the balanced sample, then
# re-fitted on the imbalanced one after the balanced predictions are stored.
clr = LogisticRegression()

# balanced sample
clr.fit(train_vectorb, train_yb.values.ravel())
pred_yb = clr.predict(test_vectorb)
scores = clr.score(test_vectorb, test_yb)  # accuracy
print(scores)

# imbalanced sample
clr.fit(train_vectori, train_yi.values.ravel())
pred_yi = clr.predict(test_vectori)
scores = clr.score(test_vectori, test_yi)  # accuracy
print(scores)
#Xb = pd.DataFrame(dataset_b, columns = ["reviewText"])
#yb = pd.DataFrame(dataset_b, columns = ["label"])
#Balanced:
#train_Xb, test_Xb, train_yb, test_yb = train_test_split(Xb, yb,test_size=0.30)
#pred_yb = clr.predict(test_vectorb)
#Imbalanced:
#train_Xi, test_Xi, train_yi, test_yi = train_test_split(Xi, yi,test_size=0.30)
#pred_yi = clr.predict(test_vectori)
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# --- balanced sample ---
print("Accuracy: {:.2f}%".format(accuracy_score(test_yb, pred_yb) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(test_yb, pred_yb) * 100))
# Typo fixed in the printed label: "COnfusion" -> "Confusion".
print("\nConfusion Matrix:\n", confusion_matrix(test_yb, pred_yb))
# --- imbalanced sample ---
print("Accuracy: {:.2f}%".format(accuracy_score(test_yi, pred_yi) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(test_yi, pred_yi) * 100))
print("\nConfusion Matrix:\n", confusion_matrix(test_yi, pred_yi))
In this notebook, we have built an ML model that is able to predict whether the sentiment of a review is negative or positive, by training on the review texts and their corresponding positive (+1) or negative (-1) labels, which were determined by whether the review rating was 4-5 or 1-2. The model was able to predict with an accuracy of 92.04%.