
/examples/frames/utils/4forums/generate_lexicon.py

https://gitlab.com/purdueNlp/DRaiL
from __future__ import unicode_literals
import json
import os
import math
import re
import sys
import string
import pickle
import operator
from tqdm import tqdm
from inflection import singularize
import nltk
from nltk.stem import LancasterStemmer
from nltk.util import ngrams
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
class LexiconGenerator():

    def __init__(self, data_folder, issues, lower_bound, upper_bound):
        self.data_folder = data_folder
        self.issues = issues
        self.lower_bound = float(lower_bound)
        self.upper_bound = float(upper_bound)
        self.lancaster = LancasterStemmer()
        # Tag every post with its most likely frames using the seed lexicon.
        self.post2frames, self.frame2posts = self.annotate_posts()
        # Collect bigram/trigram counts per issue and per frame.
        self.ngram_issue_counts, self.ngram_frame_counts, self.ngram_in_post = self.get_all_ngrams()
        # Cached counts can be reloaded instead of recomputed:
        # with open(os.getcwd() + "/ngram_issue_counts.json", "r") as f:
        #     self.ngram_issue_counts = json.load(f)
        # with open(os.getcwd() + "/ngram_frame_counts.json", "r") as f:
        #     self.ngram_frame_counts = json.load(f)
        # ngram_in_post is reloaded from the pickle written by get_all_ngrams().
        self.ngram_in_post = pickle.load(open(os.getcwd() + "/ngram_in_post.pkl", "rb"))
        self.prune_ngrams(self.lower_bound, self.upper_bound)
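
    # Pipeline overview (descriptive summary of the steps below):
    #   1. annotate_posts()   - tag each post with its two best-matching frames
    #                           using the seed lexicon in frame_subframe_lexicon.json.
    #   2. get_all_ngrams()   - count stemmed bigrams/trigrams per issue and per frame.
    #   3. prune_ngrams()     - drop ngrams that occur in too few or too many posts.
    #   4. generate_lexicon() - rank ngrams by PMI and keep the top two per frame.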
    def annotate_posts(self):
        post2frames = {}
        frames2posts = {}
        with open("../../data/frame_subframe_lexicon.json") as f:
            indicator_list = json.load(f)
        frame_lexicon = indicator_list[0]
        for issue in self.issues:
            post2frames[issue] = {}
            frames2posts[issue] = {}
            subdir = self.data_folder + issue
            n = len(os.listdir(subdir))
            print("Annotating Posts for issue {}: ".format(issue))
            pbar = tqdm(total=n)
            for filename in os.listdir(subdir):
                post = ""
                with open(os.path.join(subdir, filename), 'r') as fp:
                    lines = fp.readlines()
                # The post body precedes the "ID:" line in each file.
                counter = 0
                while not lines[counter].startswith("ID:"):
                    post += lines[counter]
                    counter += 1
                post_id = lines[counter].split(":")[1].replace("\n", "")
                frames = self.annotate_post(post, frame_lexicon)
                post2frames[issue][post_id] = frames
                for frame in frames:
                    if frame not in frames2posts[issue]:
                        frames2posts[issue][frame] = set()
                    frames2posts[issue][frame].add(post_id)
                pbar.update(1)
            pbar.close()
        return post2frames, frames2posts
    def annotate_post(self, post, frame_lexicon):
        post_rankings = {}
        tokens = word_tokenize(post.lower())
        stemmed_tokens = []
        for token in tokens:
            stemmed_tokens.append(self.lancaster.stem(token))
        # Join the stems with underscores so multi-word indicators match as substrings.
        stemmed_tokens = "_" + "_".join(stemmed_tokens) + "_"
        for frame in frame_lexicon:
            post_rankings[frame] = 0
            indicators = frame_lexicon[frame]
            for indicator in indicators:
                if indicator in stemmed_tokens:
                    post_rankings[frame] += 1
        frames = list(frame_lexicon.keys())
        sorted_frames = sorted(frames, key=lambda x: post_rankings[x], reverse=True)
        # Keep the two frames with the most indicator hits.
        return set(sorted_frames[:2])
    def preprocess(self, post):
        # Strip URLs and normalize common unicode punctuation before tokenizing.
        post = re.sub(r'http\S+', '', post)
        post = post.replace('\u00a0', ' ')  # non-breaking space
        post = post.replace('’', '\'')
        post = post.replace("“", "\"")
        post = post.replace("”", "\"")
        post = post.replace("—", "-")
        post = post.replace("\n", " ")
        post = post.replace("\t", " ")
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        post = regex.sub(' ', post)
        post = post.lower()
        return post
    def generate_post_ngrams(self, post):
        def is_ascii(s):
            return all(ord(c) < 128 for c in s)
        # stop_words = ['hundred', 'thousand', 'news', 'daily', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'per', 'a', 'an',
        #               'the', 'and', 'but', 'or', 'should', 'would', 'might', 'could', 'of', 'with', 'at', 'from', 'into', 'during', 'including', 'until', 'against',
        #               'among', 'throughout', 'despite', 'towards', 'upon', 'concerning', 'to', 'in', 'for', 'on', 'by', 'about', 'like', 'through', 'over', 'before',
        #               'between', 'after', 'since', 'without', 'under', 'within', 'along', 'following', 'across', 'behind', 'beyond', 'plus', 'except', 'but', 'up',
        #               'out', 'around', 'down', 'off', 'above', 'near']
        post = self.preprocess(post)
        tokens = nltk.word_tokenize(post)
        initial_bigrams = ngrams(tokens, 2)
        initial_trigrams = ngrams(tokens, 3)
        bigrams = []
        trigrams = []
        english_stopwords = set(stopwords.words('english'))
        # Keep a bigram only if every token is ASCII, non-numeric, and not a stopword;
        # tokens are stemmed before the ngram is stored.
        for bigram in initial_bigrams:
            flag = 0
            modified = []
            for w in bigram:
                if w == "" or not is_ascii(w) or w.isdigit() or w in english_stopwords:
                    flag = 1
                    continue
                w_singularized = str(singularize(w))
                if not w_singularized.isdigit() and w not in english_stopwords:
                    w = self.lancaster.stem(w)
                else:
                    flag = 1
                    continue
                modified.append(w)
            if flag == 0:
                bigrams.append(modified)
        # Trigrams go through the same filter, except that stopwords are allowed.
        for trigram in initial_trigrams:
            flag = 0
            modified = []
            for w in trigram:
                # if w == "" or not is_ascii(w) or w.isdigit() or w in english_stopwords or w in stop_words:
                if w == "" or not is_ascii(w) or w.isdigit():
                    flag = 1
                    continue
                w_singularized = str(singularize(w))
                if not w_singularized.isdigit():
                    # if not w_singularized.isdigit() and w not in english_stopwords and w not in stop_words:
                    w = self.lancaster.stem(w)
                else:
                    flag = 1
                    continue
                modified.append(w)
            if flag == 0:
                trigrams.append(modified)
        return bigrams, trigrams
    def get_pmi(self):
        '''
        PMI: {
            abortion: {
                frame_1: {
                    ngram_1: pmi(ngram),
                    ...
                },
                frame_2: {},
                ...
                frame_n: {}
            },
            guns: {
            }
        }
        '''
        # All issues are assumed to share the same frame inventory, so the frame
        # names are read from the "abortion" counts.
        frames = list(self.ngram_frame_counts["abortion"].keys())
        PMI = {}
        print("Calculating PMI Scores...............")
        for issue in self.issues:
            PMI[issue] = {}
            for frame in frames:
                PMI[issue][frame] = {}
                n = len(self.ngram_frame_counts[issue][frame])
                pbar = tqdm(total=n)
                print("Calculating PMI for issue: {} and frame: {}".format(issue, frame))
                for ngram in self.ngram_frame_counts[issue][frame]:
                    total_issue_count = len(self.ngram_issue_counts[issue])                          # number of distinct ngrams in the issue
                    denominator = self.ngram_issue_counts[issue][ngram] / float(total_issue_count)   # P(g)
                    ngram_frame_count = self.ngram_frame_counts[issue][frame][ngram]                 # count(ngram in frame)
                    total_frame_count = len(self.ngram_frame_counts[issue][frame])                   # number of distinct ngrams in the frame
                    numerator = ngram_frame_count / float(total_frame_count)                         # P(g | frame)
                    pmi = math.log10(numerator / denominator)
                    PMI[issue][frame][ngram] = pmi
                    pbar.update(1)
                pbar.close()
        return PMI
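
    # Note on the score above: as implemented, the value for an ngram g under frame f in
    # issue i is log10( P(g | f) / P(g | i) ), where both probabilities are relative
    # frequencies normalized by the number of *distinct* ngrams in the frame and in the
    # issue respectively (not by total token counts). A tiny worked example with
    # hypothetical counts: if "gun control" appears 4 times among 200 distinct ngrams for
    # a frame and 10 times among 5000 distinct ngrams for the issue, the score is
    # log10((4/200) / (10/5000)) = log10(10.0) = 1.0.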
    def prune_ngrams(self, lower_threshold, upper_threshold):
        # Thresholds are given as percentages of the number of posts for the issue.
        lower_threshold = lower_threshold / 100.0
        upper_threshold = upper_threshold / 100.0
        print("Pruning ngrams..............")
        for issue in self.issues:
            subdir = self.data_folder + issue
            num_issue_posts = len(os.listdir(subdir))
            low_bound = lower_threshold * num_issue_posts
            upp_bound = upper_threshold * num_issue_posts
            n = len(self.ngram_in_post[issue])
            pbar = tqdm(total=n)
            print("Pruning ngrams for issue: {}".format(issue))
            for ngram in self.ngram_in_post[issue]:
                num_posts = len(self.ngram_in_post[issue][ngram])
                # Drop ngrams that occur in too few or too many posts.
                if num_posts < low_bound or num_posts > upp_bound:
                    del self.ngram_issue_counts[issue][ngram]
                    for frame in self.ngram_frame_counts[issue].keys():
                        if ngram in self.ngram_frame_counts[issue][frame]:
                            del self.ngram_frame_counts[issue][frame][ngram]
                pbar.update(1)
            pbar.close()
        # with open("ngram_issue_counts.json", "w") as f:
        #     json.dump(self.ngram_issue_counts, f)
        # with open("ngram_frame_counts.json", "w") as f:
        #     json.dump(self.ngram_frame_counts, f)
    def generate_lexicon(self):
        pmi_scores = self.get_pmi()
        sorted_pmi_scores = {}
        # Sort PMI scores for each issue/frame, highest first.
        for issue in self.issues:
            sorted_pmi_scores[issue] = {}
            for frame in pmi_scores[issue]:
                sorted_pmi_scores[issue][frame] = sorted(pmi_scores[issue][frame].items(), key=operator.itemgetter(1), reverse=True)
        with open("../../data/4forums/pmi_scores.json", "w") as f:
            json.dump(sorted_pmi_scores, f)
        lexicon = {}
        frames = list(self.ngram_frame_counts["abortion"].keys())
        for issue in self.issues:
            lexicon[issue] = {}
            for frame in frames:
                # Keep the two ngrams with the highest PMI for this issue/frame pair.
                lexicon[issue][frame] = []
                lexicon[issue][frame].append(sorted_pmi_scores[issue][frame][0][0])
                lexicon[issue][frame].append(sorted_pmi_scores[issue][frame][1][0])
        return lexicon
    def get_all_ngrams(self):
        ngram_frame_counts = {}
        ngram_issue_counts = {}
        ngram_in_post = {}
        all_ngrams = set()
        for issue in self.issues:
            ngram_issue_counts[issue] = {}
            ngram_frame_counts[issue] = {}
            ngram_in_post[issue] = {}
            subdir = self.data_folder + issue
            n = len(os.listdir(subdir))
            pbar = tqdm(total=n)
            print("Generating all ngrams for issue: {}".format(issue))
            for filename in os.listdir(subdir):
                post = ""
                with open(os.path.join(subdir, filename), 'r') as fp:
                    lines = fp.readlines()
                # The post body precedes the "ID:" line in each file.
                counter = 0
                while not lines[counter].startswith("ID:"):
                    post += lines[counter]
                    counter += 1
                post_id = lines[counter].split(":")[1].replace("\n", "")
                frames = self.post2frames[issue][post_id]
                bigrams, trigrams = self.generate_post_ngrams(post)
                # Count each bigram per issue and per frame, and remember which posts contain it.
                for bigram in bigrams:
                    bgram_str = " ".join(bigram)
                    if bgram_str in ngram_issue_counts[issue]:
                        ngram_issue_counts[issue][bgram_str] += 1
                    else:
                        ngram_issue_counts[issue][bgram_str] = 1
                    for frame in frames:
                        if frame not in ngram_frame_counts[issue]:
                            ngram_frame_counts[issue][frame] = {}
                        if bgram_str in ngram_frame_counts[issue][frame]:
                            ngram_frame_counts[issue][frame][bgram_str] += 1
                        else:
                            ngram_frame_counts[issue][frame][bgram_str] = 1
                    if bgram_str not in ngram_in_post[issue]:
                        ngram_in_post[issue][bgram_str] = set()
                    ngram_in_post[issue][bgram_str].add(post_id)
                # Same bookkeeping for trigrams.
                for trigram in trigrams:
                    tgram_str = " ".join(trigram)
                    if tgram_str in ngram_issue_counts[issue]:
                        ngram_issue_counts[issue][tgram_str] += 1
                    else:
                        ngram_issue_counts[issue][tgram_str] = 1
                    for frame in frames:
                        if frame not in ngram_frame_counts[issue]:
                            ngram_frame_counts[issue][frame] = {}
                        if tgram_str in ngram_frame_counts[issue][frame]:
                            ngram_frame_counts[issue][frame][tgram_str] += 1
                        else:
                            ngram_frame_counts[issue][frame][tgram_str] = 1
                    if tgram_str not in ngram_in_post[issue]:
                        ngram_in_post[issue][tgram_str] = set()
                    ngram_in_post[issue][tgram_str].add(post_id)
                pbar.update(1)
            pbar.close()
        # Cache the counts so later runs can skip this pass.
        with open("ngram_issue_counts.json", "w") as f:
            json.dump(ngram_issue_counts, f)
        with open("ngram_frame_counts.json", "w") as f:
            json.dump(ngram_frame_counts, f)
        # ngram_in_post maps to sets of post ids, so it is pickled rather than written as JSON.
        with open("ngram_in_post.pkl", "wb") as f:
            pickle.dump(ngram_in_post, f)
        return ngram_issue_counts, ngram_frame_counts, ngram_in_post
# if __name__ == "__main__":
#     lexicon_generator = LexiconGenerator(sys.argv[1], sys.argv[2:4], sys.argv[4], sys.argv[5])
#     lexicon = lexicon_generator.generate_lexicon()
#     with open("../../data/4forums/4forums_lexicon.json", "w") as outfile:
#         json.dump(lexicon, outfile)
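
# A hypothetical invocation, assuming the __main__ block above is re-enabled and that the
# per-issue post files live under the data folder passed as the first argument
# (the folder layout and example values below are assumptions, not confirmed by this file):
#   python generate_lexicon.py ../../data/4forums/ abortion guns 1 80
# i.e. data folder, two issue names, then the lower and upper pruning bounds as percentages.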