# /NLPSolverAI/nlp.py
# Python | 241 lines | 228 code | 7 blank | 6 comment | 5 complexity | cc463dc5d8ebcdf8038d6096e0beb3f2 MD5 | raw file
# Paths to the raw IMDB training data and the pre-cleaned test csv.
train_path = "/home/lancel/Documents/AI/AI/Proj5/aclImdb/train/"  # use terminal to ls files under this directory
test_path = "imdb_te.csv"  # test data for grade evaluation

# Standard library
import os
import re
from copy import deepcopy

# Third-party
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import (
    CountVectorizer,
    HashingVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split

# nltk.download('stopwords')

# Extra (already-stemmed) tokens to drop in addition to the stopword list;
# 'movi'/'becaus' etc. are the Snowball-stemmed forms of common review words.
words_to_go = ['one', 'two', 'three', 'man', 'woman', 'men', 'women', 'br', 'four', 'five',
               'also', 'still', 'yet', 'would', 'might', 'movi', 'film', 'boy', 'girl',
               'think', 'seem', 'becaus', 'see', 'saw', 'may', 'watch', 'before']

# Hyper-parameter grid searched by GridSearchCV for the SGD classifier.
grid_parameters = [{'loss': ['hinge', 'log', 'perceptron'],
                    'penalty': ['l1', 'l2'],
                    'alpha': [0.0001, 0.001, 0.01, 0.1]}]
# Build the stemmer once at import time; the original constructed a new
# SnowballStemmer on every call, which is wasteful when cleaning thousands
# of reviews.
_stemmer = SnowballStemmer('english')

def text_cleanup(text, stopwords):
    """Normalize one review: strip non-letters, lowercase, stem, drop noise words.

    Parameters
    ----------
    text : str
        Raw review text.
    stopwords : collection of str
        Words to discard; a set is fastest since membership is tested per token.

    Returns
    -------
    str
        Space-joined cleaned tokens: stemmed, longer than 2 characters, and
        not in `stopwords` or the module-level `words_to_go` list.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    stemmed = (_stemmer.stem(word) for word in tokens)
    kept = [word for word in stemmed
            if word not in stopwords and word not in words_to_go and len(word) > 2]
    return ' '.join(kept)
def _load_stopwords(path='stopwords.en.txt'):
    """Read one stopword per line from *path* and return them as a set."""
    with open(path, 'r', encoding='utf-8') as fh:
        return {line.strip() for line in fh}

def imdb_data_preprocess(inpath, outpath="./", name="imdb_tr.csv", mix=False):
    '''Implement this module to extract
    and combine text files under train_path directory into
    imdb_tr.csv. Each text file in train_path should be stored
    as a row in imdb_tr.csv. And imdb_tr.csv should have two
    columns, "text" and label.

    Files under ``inpath/neg`` are labelled 0, files under ``inpath/pos``
    are labelled 1. The cleaned frame is written tab-separated to
    ``outpath/name`` and returned.
    '''
    print("preprocessing ...")
    stop = _load_stopwords()
    reviews = []
    # (sub-directory, sentiment label) pairs; neg first, matching the
    # original processing order.
    for subdir, label in (('neg', 0), ('pos', 1)):
        folder = os.path.join(inpath, subdir)
        for fname in os.listdir(folder):
            fpath = os.path.join(folder, fname)
            if not os.path.isfile(fpath):
                continue
            # Explicit encoding so the result does not depend on the locale.
            with open(fpath, 'r', encoding='utf-8') as review:
                reviews.append([text_cleanup(review.read(), stop), label])
    df = pd.DataFrame(reviews, columns=['text', 'label'])
    df.to_csv(os.path.join(outpath, name), sep='\t', encoding='utf-8')
    return df
-
def get_y(data):
    """Return the label column (second column) of *data* as a numpy array."""
    labels = data.iloc[:, 1]
    return labels.values
def imdb_data_cleanup(data):
    '''
    removing stop words; punctations etc

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'text' column of raw review strings.

    Returns
    -------
    list of str
        One cleaned review per row, in row order.
    '''
    # A set gives O(1) membership tests inside text_cleanup; the original
    # passed a list, making every token lookup O(n).
    with open('stopwords.en.txt', 'r') as fh:
        stop = {line.strip() for line in fh}
    # Iterate the column values directly: the original `data['text'][i]`
    # indexes by label and raises KeyError when the frame's index is not
    # a contiguous 0..n-1 range (e.g. after filtering or concatenation).
    return [text_cleanup(review, stop) for review in data['text']]
def grid_search_init(model, parameters, X_train, y_train, cv=5):
    """Run an accuracy-scored grid search over *parameters* for *model*.

    Fits on (X_train, y_train) with *cv*-fold cross-validation, prints the
    winning parameter combination, and returns it as a dict.
    """
    searcher = GridSearchCV(estimator=model,
                            param_grid=parameters,
                            scoring='accuracy',
                            cv=cv)
    fitted = searcher.fit(X_train, y_train)
    best = fitted.best_params_
    print(best)
    return best
-
def SGD_builder(X, y, X_test, param):
    '''
    construct the SGD model

    Grid-searches an SGDClassifier over *param*, refits the best
    configuration on (X, y), and returns (predictions for X_test, model).
    '''
    base = SGDClassifier(random_state=0)
    best = grid_search_init(base, param, X, y)
    tuned = SGDClassifier(loss=best['loss'],
                          penalty=best['penalty'],
                          alpha=best['alpha'],
                          random_state=0)
    tuned.fit(X, y)
    predictions = tuned.predict(X_test)
    return predictions, tuned
-
- if __name__ == "__main__":
- if not os.path.isfile('imdb_tr.csv'):
- train_data = imdb_data_preprocess(inpath=train_path)
- else:
- train_data = pd.read_csv('imdb_tr.csv', sep = '\t', index_col=0)
- # if not os.path.isfile('imdb_test.csv'):
- # test_data = imdb_data_preprocess(inpath = testing_path, name="imdb_test.csv")
- # else:
- # test_data = pd.read_csv('imdb_test.csv', sep = '\t', index_col=0)
- final_test_data = pd.read_csv(test_path, sep = ',', index_col=0)
-
- # getting y and corpus for each category
- X = train_data.iloc[:, 0].values
- y = get_y(train_data)
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 0)
- corpus_train, corpus_test = [review for review in X_train], [review for review in X_test]
- corpus_final_test = imdb_data_cleanup(final_test_data)
-
-
- '''train a SGD classifier using unigram representation,
- predict sentiments on imdb_te.csv, and write output to
- unigram.output.txt'''
- print ("processing unigram")
- try:
- unigram = CountVectorizer(ngram_range = (1,1), stop_words = 'english', max_features=2000)
- X_train_unigram = unigram.fit_transform(corpus_train).toarray()
- X_test_unigram = unigram.transform(corpus_test).toarray()
- y_pred_unigram, model = SGD_builder(X_train_unigram, y_train, X_test_unigram, grid_parameters)
- accuracy_unigram = accuracy_score(y_pred_unigram, y_test)
- print(accuracy_unigram)
- X_final_test_unigram = unigram.transform(corpus_final_test).toarray()
- y_final_test_unigram = model.predict(X_final_test_unigram)
- f = open('unigram.output.txt', "w")
- for y_pred in y_final_test_unigram:
- f.write(str(y_pred) + '\n')
- f.close()
- except Exception:
- print("Error occurs.")
- pass
- # TODO: output to txt
- '''train a SGD classifier using bigram representation,
- predict sentiments on imdb_te.csv, and write output to
- unigram.output.txt'''
- print ("processing bigram")
- try:
- bigram = CountVectorizer(ngram_range = (2,2), stop_words = 'english', max_features=2000)
- X_train_bigram = bigram.fit_transform(corpus_train).toarray()
- X_test_bigram = bigram.transform(corpus_test).toarray()
- y_pred_bigram, model = SGD_builder(X_train_bigram, y_train, X_test_bigram, grid_parameters)
- accuracy_bigram = accuracy_score(y_pred_bigram, y_test)
- print(accuracy_bigram)
- try:
- X_final_test_bigram = bigram.transform(corpus_final_test).toarray()
- y_final_test_bigram = model.predict(X_final_test_bigram)
- except Exception:
- pass
- f = open('bigram.output.txt', "w")
- for y_pred in y_final_test_unigram:
- f.write(str(y_pred) + '\n')
- f.close()
- except Exception:
- print("Error occurs.")
- pass
- # TODO: output to txt
-
- '''train a SGD classifier using unigram representation
- with tf-idf, predict sentiments on imdb_te.csv, and write
- output to unigram.output.txt'''
- print ("processing unigram-tfidf")
- try:
- unigram_tfidf = TfidfTransformer()
- X_train_unigram_tfidf = unigram_tfidf.fit_transform(X_train_unigram)
- X_test_unigram_tfidf = unigram_tfidf.transform(X_test_unigram)
- y_pred_unigram_tfidf, model = SGD_builder(X_train_unigram_tfidf, y_train, X_test_unigram_tfidf, grid_parameters)
- accuracy_unigram_tfidf = accuracy_score(y_pred_unigram_tfidf, y_test)
- print(accuracy_unigram_tfidf)
- X_final_test_unigram_tfidf = unigram_tfidf.transform(X_final_test_unigram)
- y_final_test_unigram_tfidf = model.predict(X_final_test_unigram_tfidf)
- f = open('unigramtfidf.output.txt', "w")
- for y_pred in y_final_test_unigram_tfidf:
- f.write(str(y_pred) + '\n')
- f.close()
- except Exception:
- print("Error occurs.")
- pass
- # TODO: output to txt
-
- '''train a SGD classifier using bigram representation
- with tf-idf, predict sentiments on imdb_te.csv, and write
- output to unigram.output.txt'''
- try:
- print ("processing bigram-tfidf")
- bigram_tfidf = TfidfTransformer()
- X_train_bigram_tfidf = bigram_tfidf.fit_transform(X_train_bigram)
- X_test_bigram_tfidf = bigram_tfidf.transform(X_test_bigram)
- y_pred_bigram_tfidf, model = SGD_builder(X_train_bigram_tfidf, y_train, X_test_bigram_tfidf, grid_parameters)
- accuracy_bigram_tfidf = accuracy_score(y_pred_bigram_tfidf, y_test)
- print(accuracy_bigram_tfidf)
- try:
- X_final_test_bigram_tfidf = bigram_tfidf.transform(X_final_test_bigram)
- y_final_test_bigram_tfidf = model.predict(X_final_test_bigram_tfidf)
- except Exception:
- pass
- f = open('bigramtfidf.output.txt', "w")
- for y_pred in y_final_test_unigram_tfidf:
- f.write(str(y_pred) + '\n')
- f.close() # TODO: output to txt
- except Exception:
- print("Error occurs.")
- pass