nlp.py | searchcode

/NLPSolverAI/nlp.py

https://gitlab.com/lancezlin/ai · Python · 241 lines · 228 code · 7 blank · 6 comment · 5 complexity · cc463dc5d8ebcdf8038d6096e0beb3f2 MD5 · raw file

train_path = "/home/lancel/Documents/AI/AI/Proj5/aclImdb/train/" # use terminal to ls files under this directory
test_path = "imdb_te.csv" # test data for grade evaluation


import pandas as pd
from copy import deepcopy
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
import re
import os
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
import numpy as np
from sklearn.metrics import confusion_matrix
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#nltk.download('stopwords')

words_to_go = ['one', 'two', 'three', 'man', 'woman', 'men', 'women', 'br', 'four', 'five', \
				'also', 'still', 'yet', 'would', 'might', 'movi', 'film', 'boy', 'girl', \
				'think', 'seem', 'becaus', 'see', 'saw', 'may', 'watch', 'before']

grid_parameters = [{'loss':['hinge', 'log', 'perceptron'], 'penalty': ['l1', 'l2'], \
					'alpha':[0.0001, 0.001, 0.01, 0.1]}]

def text_cleanup(text, stopwords):
	review = re.sub('[^a-zA-Z]', ' ', text)
	review = review.lower()
	review = review.split()
	#ps = PorterStemmer()
	ps = SnowballStemmer('english')
	review = [ps.stem(word) for word in review]
	review = [word for word in review if not word in stopwords and not word in words_to_go and len(word)>2]
	review = ' '.join(review)

	return review

def imdb_data_preprocess(inpath, outpath="./", name="imdb_tr.csv", mix=False):
 	'''Implement this module to extract
	and combine text files under train_path directory into 
    imdb_tr.csv. Each text file in train_path should be stored 
    as a row in imdb_tr.csv. And imdb_tr.csv should have two 
    columns, "text" and label'''
	print ("preprocessing ...")
	path_neg = os.path.join(inpath, 'neg')
	path_pos = os.path.join(inpath, 'pos')

	stopwords = []
	with open('stopwords.en.txt', 'r') as stopword:
		lines = stopword.readlines()
		for line in lines:
			stopwords.append(line.strip())
	stopwords = set(stopwords)

	reviews = []
	files_neg = [f for f in os.listdir(path_neg) if os.path.isfile(os.path.join(path_neg,f))]
	files_pos = [f for f in os.listdir(path_pos) if os.path.isfile(os.path.join(path_pos,f))]
	for f in files_neg:
	    f_full_path = os.path.join(path_neg, f)
	    with open (f_full_path, "r") as review:

	        reviews.append([text_cleanup(review.read(-1), stopwords),0])
	for f in files_pos:
		f_full_path = os.path.join(path_pos, f)
		with open (f_full_path, "r") as review:
			reviews.append([text_cleanup(review.read(-1), stopwords), 1])

	df = pd.DataFrame(reviews)
	df.columns = ['text', 'label']
	df.to_csv(os.path.join(outpath, name), sep = '\t', encoding='utf-8')
	return df


	  
def get_y(data):
	return data.iloc[:,1].values

def imdb_data_cleanup(data):
	''' 
		removing stop words; punctations etc
		return: corpus as a list
	'''
	stopwords = []
	with open('stopwords.en.txt', 'r') as stopword:
		lines = stopword.readlines()
		for line in lines:
			stopwords.append(line.strip())
	corpus = []
	row, col = data.shape
	for i in range(row):
		review = text_cleanup(data['text'][i], stopwords)

		corpus.append(review)
		
	return corpus


def grid_search_init(model, parameters, X_train, y_train, cv=5):
	grid_search_cv = GridSearchCV(estimator = model,
								param_grid = parameters,
								scoring = 'accuracy',
								cv = cv)
	grid_search = grid_search_cv.fit(X_train, y_train)
	print(grid_search.best_params_)
	return grid_search.best_params_
	 


def SGD_builder(X, y, X_test, param):
	'''
		construct the SGD model
	'''
	sgd = SGDClassifier(random_state=0)
	bp = grid_search_init(sgd, param, X, y)
	best_model = SGDClassifier(loss=bp['loss'], penalty=bp['penalty'], alpha=bp['alpha'],random_state=0)
	best_model.fit(X, y)
	y_pred = best_model.predict(X_test)

	return y_pred, best_model
   
if __name__ == "__main__":


	if not os.path.isfile('imdb_tr.csv'):
		train_data = imdb_data_preprocess(inpath=train_path)
	else:
		train_data = pd.read_csv('imdb_tr.csv', sep = '\t', index_col=0)
#	if not os.path.isfile('imdb_test.csv'):
#		test_data = imdb_data_preprocess(inpath = testing_path, name="imdb_test.csv")
#	else:
#		test_data = pd.read_csv('imdb_test.csv', sep = '\t', index_col=0)
	final_test_data = pd.read_csv(test_path, sep = ',', index_col=0)
	
	# getting y and corpus for each category
	X = train_data.iloc[:, 0].values
	y = get_y(train_data)
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 0)
	corpus_train, corpus_test = [review for review in X_train], [review for review in X_test]
	corpus_final_test = imdb_data_cleanup(final_test_data)
	
 	
	'''train a SGD classifier using unigram representation,
	predict sentiments on imdb_te.csv, and write output to
	unigram.output.txt'''
	print ("processing unigram")
	try:
		unigram = CountVectorizer(ngram_range = (1,1), stop_words = 'english', max_features=2000)
		X_train_unigram = unigram.fit_transform(corpus_train).toarray()
		X_test_unigram = unigram.transform(corpus_test).toarray()
		y_pred_unigram, model = SGD_builder(X_train_unigram, y_train, X_test_unigram, grid_parameters)
		accuracy_unigram = accuracy_score(y_pred_unigram, y_test)
		print(accuracy_unigram)

		X_final_test_unigram = unigram.transform(corpus_final_test).toarray()
		y_final_test_unigram = model.predict(X_final_test_unigram)
		f = open('unigram.output.txt', "w")
		for y_pred in y_final_test_unigram:
			f.write(str(y_pred) + '\n')
		f.close()
	except Exception:
		print("Error occurs.")
		pass
  	# TODO: output to txt


	'''train a SGD classifier using bigram representation,
	predict sentiments on imdb_te.csv, and write output to
	unigram.output.txt'''
	print ("processing bigram")
	try:
		bigram = CountVectorizer(ngram_range = (2,2), stop_words = 'english', max_features=2000)
		X_train_bigram = bigram.fit_transform(corpus_train).toarray()
		X_test_bigram = bigram.transform(corpus_test).toarray()
		y_pred_bigram, model = SGD_builder(X_train_bigram, y_train, X_test_bigram, grid_parameters)
		accuracy_bigram = accuracy_score(y_pred_bigram, y_test)
		print(accuracy_bigram)
		try:
			X_final_test_bigram = bigram.transform(corpus_final_test).toarray()
			y_final_test_bigram = model.predict(X_final_test_bigram)
		except Exception:
			pass
		f = open('bigram.output.txt', "w")
		for y_pred in y_final_test_unigram:
			f.write(str(y_pred) + '\n')
		f.close()
	except Exception:
		print("Error occurs.")
		pass
  	# TODO: output to txt
     
	'''train a SGD classifier using unigram representation
	with tf-idf, predict sentiments on imdb_te.csv, and write 
	output to unigram.output.txt'''
	print ("processing unigram-tfidf")
	try:
		unigram_tfidf = TfidfTransformer()
		X_train_unigram_tfidf = unigram_tfidf.fit_transform(X_train_unigram)
		X_test_unigram_tfidf = unigram_tfidf.transform(X_test_unigram)
		y_pred_unigram_tfidf, model = SGD_builder(X_train_unigram_tfidf, y_train, X_test_unigram_tfidf, grid_parameters)
		accuracy_unigram_tfidf = accuracy_score(y_pred_unigram_tfidf, y_test)
		print(accuracy_unigram_tfidf)

		X_final_test_unigram_tfidf = unigram_tfidf.transform(X_final_test_unigram)
		y_final_test_unigram_tfidf = model.predict(X_final_test_unigram_tfidf)
		f = open('unigramtfidf.output.txt', "w")
		for y_pred in y_final_test_unigram_tfidf:
			f.write(str(y_pred) + '\n')
		f.close()
	except Exception:
		print("Error occurs.")
		pass
	# TODO: output to txt
  	
	'''train a SGD classifier using bigram representation
	with tf-idf, predict sentiments on imdb_te.csv, and write 
	output to unigram.output.txt'''
	try:
		print ("processing bigram-tfidf")
		bigram_tfidf = TfidfTransformer()
		X_train_bigram_tfidf = bigram_tfidf.fit_transform(X_train_bigram)
		X_test_bigram_tfidf = bigram_tfidf.transform(X_test_bigram)
		y_pred_bigram_tfidf, model = SGD_builder(X_train_bigram_tfidf, y_train, X_test_bigram_tfidf, grid_parameters)
		accuracy_bigram_tfidf = accuracy_score(y_pred_bigram_tfidf, y_test)
		print(accuracy_bigram_tfidf)
		try:
			X_final_test_bigram_tfidf = bigram_tfidf.transform(X_final_test_bigram)
			y_final_test_bigram_tfidf = model.predict(X_final_test_bigram_tfidf)
		except Exception:
			pass
		f = open('bigramtfidf.output.txt', "w")
		for y_pred in y_final_test_unigram_tfidf:
			f.write(str(y_pred) + '\n')
		f.close() 	# TODO: output to txt
	except Exception:
		print("Error occurs.")
		pass
Tech Fingerprint

Alerts (21)

'def' Ensure functions have docstrings for documentation
31 80 103
Complexity hotspot; lines 37 to 38 (total complexity: 5)
37 38
'print(' Use logging module for better control and configurability
109 157 166 181 192 206 215 229 240
'open(' Use 'with open()' to ensure Files are properly closed
161
'except Exception:' Catch specific exceptions instead of Exception to avoid masking bugs
165 185 191 214 233 239