
/NLPSolverAI/nlp.py

https://gitlab.com/lancezlin/ai
train_path = "/home/lancel/Documents/AI/AI/Proj5/aclImdb/train/"  # aclImdb training data (pos/ and neg/ subdirectories)
test_path = "imdb_te.csv"  # test data for grade evaluation

import os
import re

import nltk
import pandas as pd
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer  # alternative stemmer for the commented-out option in text_cleanup
from sklearn.feature_extraction.text import (CountVectorizer,
                                             TfidfTransformer,
                                             TfidfVectorizer)
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split

# nltk.download('stopwords')  # one-time download, only needed for NLTK's stop word list

# domain-specific words to drop in addition to the stop word list
# (stemmed forms such as 'movi' and 'becaus', plus the HTML artifact 'br')
words_to_go = ['one', 'two', 'three', 'man', 'woman', 'men', 'women', 'br',
               'four', 'five', 'also', 'still', 'yet', 'would', 'might',
               'movi', 'film', 'boy', 'girl', 'think', 'seem', 'becaus',
               'see', 'saw', 'may', 'watch', 'before']

# search grid for SGDClassifier; note that in newer scikit-learn versions
# (>= 1.1) the 'log' loss is spelled 'log_loss'
grid_parameters = [{'loss': ['hinge', 'log', 'perceptron'],
                    'penalty': ['l1', 'l2'],
                    'alpha': [0.0001, 0.001, 0.01, 0.1]}]

def text_cleanup(text, stopwords):
    '''Lowercase, strip non-letters, stem, and remove stop words from one review.'''
    review = re.sub('[^a-zA-Z]', ' ', text)  # keep letters only
    review = review.lower()
    review = review.split()
    # ps = PorterStemmer()                   # alternative stemmer
    ps = SnowballStemmer('english')
    review = [ps.stem(word) for word in review]
    review = [word for word in review
              if word not in stopwords and word not in words_to_go and len(word) > 2]
    review = ' '.join(review)
    return review
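
# Illustrative example (assumes NLTK's English Snowball stemmer):
#
#     text_cleanup("The plot was AMAZING!!", {"the", "was"})
#     # -> roughly "plot amaz": non-letters stripped, text lowercased,
#     #    tokens stemmed, then stop words, words_to_go entries, and
#     #    tokens of length <= 2 dropped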

def imdb_data_preprocess(inpath, outpath="./", name="imdb_tr.csv", mix=False):
    '''Extract and combine the text files under the train_path directory
    into imdb_tr.csv. Each text file becomes one row of imdb_tr.csv,
    which has two columns, "text" and "label".'''
    print("preprocessing ...")
    path_neg = os.path.join(inpath, 'neg')
    path_pos = os.path.join(inpath, 'pos')
    with open('stopwords.en.txt', 'r') as stopword_file:
        stopwords = set(line.strip() for line in stopword_file)
    reviews = []
    files_neg = [f for f in os.listdir(path_neg) if os.path.isfile(os.path.join(path_neg, f))]
    files_pos = [f for f in os.listdir(path_pos) if os.path.isfile(os.path.join(path_pos, f))]
    for f in files_neg:  # negative reviews -> label 0
        with open(os.path.join(path_neg, f), "r") as review:
            reviews.append([text_cleanup(review.read(), stopwords), 0])
    for f in files_pos:  # positive reviews -> label 1
        with open(os.path.join(path_pos, f), "r") as review:
            reviews.append([text_cleanup(review.read(), stopwords), 1])
    df = pd.DataFrame(reviews, columns=['text', 'label'])
    df.to_csv(os.path.join(outpath, name), sep='\t', encoding='utf-8')
    return df
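
# Illustrative usage (assumes the aclImdb layout with pos/ and neg/
# subdirectories under train_path and a stopwords.en.txt file in the
# working directory):
#
#     df = imdb_data_preprocess(inpath=train_path)
#     df.head()  # columns: 'text' (cleaned review), 'label' (0=neg, 1=pos)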

def get_y(data):
    '''Return the label column as a numpy array.'''
    return data.iloc[:, 1].values

def imdb_data_cleanup(data):
    '''Remove stop words, punctuation, etc. from every review.
    Returns the cleaned corpus as a list of strings.'''
    with open('stopwords.en.txt', 'r') as stopword_file:
        stopwords = set(line.strip() for line in stopword_file)
    corpus = []
    row, col = data.shape
    for i in range(row):
        corpus.append(text_cleanup(data['text'][i], stopwords))
    return corpus

def grid_search_init(model, parameters, X_train, y_train, cv=5):
    '''Run an exhaustive grid search with cv-fold cross-validation and
    return the best parameter combination found.'''
    grid_search_cv = GridSearchCV(estimator=model,
                                  param_grid=parameters,
                                  scoring='accuracy',
                                  cv=cv)
    grid_search = grid_search_cv.fit(X_train, y_train)
    print(grid_search.best_params_)
    return grid_search.best_params_
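
# With grid_parameters above, GridSearchCV evaluates 3 losses x 2 penalties
# x 4 alphas = 24 candidate settings; at cv=5 that is 24 * 5 = 120 model
# fits per call, plus one final refit of the best setting on all of X_train.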

def SGD_builder(X, y, X_test, param):
    '''Grid-search an SGD classifier, refit the best configuration on the
    full training data, and predict on X_test.'''
    sgd = SGDClassifier(random_state=0)
    bp = grid_search_init(sgd, param, X, y)
    best_model = SGDClassifier(loss=bp['loss'], penalty=bp['penalty'],
                               alpha=bp['alpha'], random_state=0)
    best_model.fit(X, y)
    y_pred = best_model.predict(X_test)
    return y_pred, best_model
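
# Design note: since GridSearchCV refits the best configuration on the full
# training data by default (refit=True), grid_search_init could return
# grid_search.best_estimator_ directly; the explicit rebuild from
# best_params_ above is equivalent and kept for readability.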

if __name__ == "__main__":
    if not os.path.isfile('imdb_tr.csv'):
        train_data = imdb_data_preprocess(inpath=train_path)
    else:
        train_data = pd.read_csv('imdb_tr.csv', sep='\t', index_col=0)
    # if not os.path.isfile('imdb_test.csv'):
    #     test_data = imdb_data_preprocess(inpath=testing_path, name="imdb_test.csv")
    # else:
    #     test_data = pd.read_csv('imdb_test.csv', sep='\t', index_col=0)
    final_test_data = pd.read_csv(test_path, sep=',', index_col=0)
    # split features/labels and hold out 40% of the training data for validation
    X = train_data.iloc[:, 0].values
    y = get_y(train_data)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    corpus_train, corpus_test = list(X_train), list(X_test)
    corpus_final_test = imdb_data_cleanup(final_test_data)

    '''train an SGD classifier using the unigram representation,
    predict sentiments on imdb_te.csv, and write the output to
    unigram.output.txt'''
    print("processing unigram")
    try:
        unigram = CountVectorizer(ngram_range=(1, 1), stop_words='english', max_features=2000)
        X_train_unigram = unigram.fit_transform(corpus_train).toarray()
        X_test_unigram = unigram.transform(corpus_test).toarray()
        y_pred_unigram, model = SGD_builder(X_train_unigram, y_train, X_test_unigram, grid_parameters)
        accuracy_unigram = accuracy_score(y_test, y_pred_unigram)
        print(accuracy_unigram)
        X_final_test_unigram = unigram.transform(corpus_final_test).toarray()
        y_final_test_unigram = model.predict(X_final_test_unigram)
        with open('unigram.output.txt', "w") as f:
            for y_pred in y_final_test_unigram:
                f.write(str(y_pred) + '\n')
    except Exception as e:
        print("unigram step failed:", e)

    '''train an SGD classifier using the bigram representation,
    predict sentiments on imdb_te.csv, and write the output to
    bigram.output.txt'''
    print("processing bigram")
    try:
        bigram = CountVectorizer(ngram_range=(2, 2), stop_words='english', max_features=2000)
        X_train_bigram = bigram.fit_transform(corpus_train).toarray()
        X_test_bigram = bigram.transform(corpus_test).toarray()
        y_pred_bigram, model = SGD_builder(X_train_bigram, y_train, X_test_bigram, grid_parameters)
        accuracy_bigram = accuracy_score(y_test, y_pred_bigram)
        print(accuracy_bigram)
        X_final_test_bigram = bigram.transform(corpus_final_test).toarray()
        y_final_test_bigram = model.predict(X_final_test_bigram)
        with open('bigram.output.txt', "w") as f:
            for y_pred in y_final_test_bigram:
                f.write(str(y_pred) + '\n')
    except Exception as e:
        print("bigram step failed:", e)

    '''train an SGD classifier using the unigram representation
    with tf-idf, predict sentiments on imdb_te.csv, and write the
    output to unigramtfidf.output.txt'''
    print("processing unigram-tfidf")
    try:
        unigram_tfidf = TfidfTransformer()
        X_train_unigram_tfidf = unigram_tfidf.fit_transform(X_train_unigram)
        X_test_unigram_tfidf = unigram_tfidf.transform(X_test_unigram)
        y_pred_unigram_tfidf, model = SGD_builder(X_train_unigram_tfidf, y_train, X_test_unigram_tfidf, grid_parameters)
        accuracy_unigram_tfidf = accuracy_score(y_test, y_pred_unigram_tfidf)
        print(accuracy_unigram_tfidf)
        X_final_test_unigram_tfidf = unigram_tfidf.transform(X_final_test_unigram)
        y_final_test_unigram_tfidf = model.predict(X_final_test_unigram_tfidf)
        with open('unigramtfidf.output.txt', "w") as f:
            for y_pred in y_final_test_unigram_tfidf:
                f.write(str(y_pred) + '\n')
    except Exception as e:
        print("unigram-tfidf step failed:", e)
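
    # Note (illustrative, not part of the original pipeline): TfidfTransformer
    # is applied here on top of the counts already produced by CountVectorizer.
    # A one-step alternative would be TfidfVectorizer, which combines counting
    # and tf-idf weighting, e.g.:
    #
    #     tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english',
    #                             max_features=2000)
    #     X_train_tfidf = tfidf.fit_transform(corpus_train)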

    '''train an SGD classifier using the bigram representation
    with tf-idf, predict sentiments on imdb_te.csv, and write the
    output to bigramtfidf.output.txt'''
    print("processing bigram-tfidf")
    try:
        bigram_tfidf = TfidfTransformer()
        X_train_bigram_tfidf = bigram_tfidf.fit_transform(X_train_bigram)
        X_test_bigram_tfidf = bigram_tfidf.transform(X_test_bigram)
        y_pred_bigram_tfidf, model = SGD_builder(X_train_bigram_tfidf, y_train, X_test_bigram_tfidf, grid_parameters)
        accuracy_bigram_tfidf = accuracy_score(y_test, y_pred_bigram_tfidf)
        print(accuracy_bigram_tfidf)
        X_final_test_bigram_tfidf = bigram_tfidf.transform(X_final_test_bigram)
        y_final_test_bigram_tfidf = model.predict(X_final_test_bigram_tfidf)
        with open('bigramtfidf.output.txt', "w") as f:
            for y_pred in y_final_test_bigram_tfidf:
                f.write(str(y_pred) + '\n')
    except Exception as e:
        print("bigram-tfidf step failed:", e)