# /examples/frames/utils/4forums/generate_lexicon.py
# (Python, 369 lines — viewer metadata header converted to a comment)
from __future__ import unicode_literals

import json
import math
import operator
import os
import pickle
import re
import string
import sys

import nltk
from inflection import singularize
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import LancasterStemmer
from nltk.util import ngrams
from tqdm import tqdm

nltk.download('stopwords')  # ensure the stopwords corpus is present before use
from nltk.corpus import stopwords
class LexiconGenerator():
    """Builds a frame lexicon for 4forums issues by mining high-PMI ngrams.

    Pipeline: annotate each post with its top frames, collect bigram/trigram
    counts per issue and per frame, prune ngrams by document frequency, then
    rank the survivors by PMI (see generate_lexicon()).
    """

    def __init__(self, data_folder, issues, lower_bound, upper_bound):
        """
        Args:
            data_folder: directory containing one sub-directory per issue.
            issues: iterable of issue names (sub-directory names under
                data_folder).
            lower_bound, upper_bound: document-frequency pruning thresholds,
                expressed as percentages of the posts in an issue.
        """
        self.data_folder = data_folder
        self.issues = issues
        self.lower_bound = float(lower_bound)
        self.upper_bound = float(upper_bound)
        self.lancaster = LancasterStemmer()
        # First pass: frame-annotate every post using the seed lexicon.
        self.post2frames, self.frame2posts = self.annotate_posts()
        # Second pass: collect ngram counts per issue / per frame.
        # get_all_ngrams() also caches its three results to disk; the
        # previous code re-loaded ngram_in_post.pkl right after computing it,
        # which was redundant I/O and has been removed.
        self.ngram_issue_counts, self.ngram_frame_counts, self.ngram_in_post = self.get_all_ngrams()
        # Drop ngrams whose document frequency is outside the given bounds.
        self.prune_ngrams(self.lower_bound, self.upper_bound)
- def annotate_posts(self):
- post2frames = {}
- frames2posts = {}
- with open("../../data/frame_subframe_lexicon.json") as f:
- indicator_list = json.load(f)
- frame_lexicon = indicator_list[0]
- for issue in self.issues:
- post2frames[issue] = {}
- frames2posts[issue] = {}
- subdir = self.data_folder + issue
- n = len(os.listdir(subdir))
- print("Annotating Posts for issue {}: ".format(issue))
- pbar = tqdm(total=n)
- for filename in os.listdir(subdir):
- post = ""
- with open(os.path.join(subdir, filename), 'r') as fp:
- lines = fp.readlines()
- counter = 0
- while lines[counter].startswith("ID:") == False:
- post += lines[counter]
- counter += 1
- post_id = lines[counter].split(":")[1].replace("\n", "")
- frames = self.annotate_post(post, frame_lexicon)
- post2frames[issue][post_id] = frames
- for frame in frames:
- if frame not in frames2posts[issue]:
- frames2posts[issue][frame] = set()
- frames2posts[issue][frame].add(post_id)
-
- pbar.update(1)
- pbar.close()
- return post2frames, frames2posts
- def annotate_post(self, post, frame_lexicon):
- post_rankings = {}
- tokens = word_tokenize(post.lower())
- stemmed_tokens = []
- for token in tokens:
- stemmed_tokens.append(self.lancaster.stem(token))
- stemmed_tokens = "_" + "_".join(stemmed_tokens) + "_"
- for frame in frame_lexicon:
- post_rankings[frame] = 0
- indicators = frame_lexicon[frame]
- for indicator in indicators:
- if indicator in stemmed_tokens:
- post_rankings[frame] += 1
- frames = list(frame_lexicon.keys())
- sorted_frames = sorted(frames, key=lambda x: post_rankings[x], reverse=True)
-
- return set(sorted_frames[:2])
-
- def preprocess(self, post):
- post = re.sub(r'http\S+', '', post)
- post = post.replace(' ', ' ')
- post = post.replace('’', '\'')
- post = post.replace("“", "\"")
- post = post.replace("”", "\"")
- post = post.replace("—", "-")
- post = post.replace("\n", " ")
- post = post.replace("\t", " ")
- regex = re.compile('[%s]' % re.escape(string.punctuation))
- post = regex.sub(' ', post)
- post=post.lower()
- return post
- def generate_post_ngrams(self, post):
- def is_ascii(s):
- return all(ord(c) < 128 for c in s)
- # stop_words=['hundred', 'thousand', 'news', 'daily', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'per', 'a', 'an',\
- # 'the', 'and', 'but', 'or', 'should', 'would', 'might', 'could', 'of', 'with', 'at', 'from', 'into', 'during', 'including', 'until', 'against',\
- # 'among', 'throughout', 'despite', 'towards', 'upon', 'concerning', 'to', 'in', 'for', 'on', 'by', 'about', 'like', 'through', 'over', 'before',\
- # 'between', 'after', 'since', 'without', 'under', 'within', 'along', 'following', 'across', 'behind', 'beyond', 'plus', 'except', 'but', 'up',\
- # 'out', 'around', 'down', 'off', 'above', 'near']
- post = self.preprocess(post)
- tokens = nltk.word_tokenize(post)
- initial_bigrams=ngrams(tokens, 2)
- initial_trigrams=ngrams(tokens, 3)
- bigrams = []
- trigrams = []
- for bigram in initial_bigrams:
- flag=0
- modified=[]
- for w in bigram:
- if w == "" or is_ascii(w) == False or w.isdigit() == True or w in stopwords.words('english'):
- flag=1
- continue
- w_singularized = str(singularize(w))
- if w_singularized.isdigit() == False and w not in stopwords.words('english'):
- w = self.lancaster.stem(w)
- else:
- flag=1
- continue
- modified.append(w)
- if flag==0:
- bigrams.append(modified)
- for trigram in initial_trigrams:
- flag=0
- modified=[]
- for w in trigram:
- #if w == "" or is_ascii(w) == False or w.isdigit() == True or w in stopwords.words('english') or w in stop_words:
- if w == "" or is_ascii(w) == False or w.isdigit() == True:
- flag=1
- continue
- w_singularized = str(singularize(w))
- if w_singularized.isdigit() == False:
- #if w_singularized.isdigit() == False and w not in stopwords.words('english') and w not in stop_words:
- w = self.lancaster.stem(w)
- else:
- flag=1
- continue
- modified.append(w)
- if flag==0:
- trigrams.append(modified)
- return bigrams, trigrams
- def get_pmi(self):
- '''
- PMI: {
- abortion: {
- frame_1: {
- ngram_1: pmi(ngram)
- .
- .
- .
- },
- frame_2: {},
- .
- .
- .
- frame_n: {}
- },
- guns: {
- }
- }
- '''
- frames = list(self.ngram_frame_counts["abortion"].keys())
- PMI = {}
- print("Calculating PMI Scores...............")
- for issue in self.issues:
- PMI[issue] = {}
- for frame in frames:
- PMI[issue][frame] = {}
- n = len(self.ngram_issue_counts[issue])
- pbar = tqdm(total=n)
- print("Calculating PMI for issue: {} and frame: {}".format(issue, frame))
- for ngram in self.ngram_frame_counts[issue][frame]:
- total_issue_count = len(self.ngram_issue_counts[issue]) #Count of all ngrams in issue
- denominator = float(self.ngram_issue_counts[issue][ngram] / total_issue_count) #P(g)
- ngram_frame_count = self.ngram_frame_counts[issue][frame][ngram] #count(ngram in frame)
- total_frame_count = len(self.ngram_frame_counts[issue][frame]) #count (all ngrams in frame)
- numerator = float(ngram_frame_count / total_frame_count)
- pmi = math.log10(float(numerator/denominator))
- PMI[issue][frame][ngram] = pmi
- pbar.update(1)
- pbar.close()
- return PMI
- def prune_ngrams(self, lower_threshold, upper_threshold):
- lower_threshold = float(lower_threshold / 100)
- upper_threshold = float(upper_threshold / 100)
- print("Pruning ngrams..............")
- for issue in self.issues:
- n = len(self.ngram_in_post[issue])
- pbar = tqdm(total=n)
- print("Pruning ngrams for issue: {}".format(issue))
- for ngram in self.ngram_in_post[issue]:
- num_posts = len(self.ngram_in_post[issue][ngram])
- subdir = self.data_folder + issue
- n = len(os.listdir(subdir))
- low_bound = float(lower_threshold * n)
- upp_bound = float(upper_threshold * n)
- if num_posts < low_bound or num_posts > upp_bound:
- del self.ngram_issue_counts[issue][ngram]
- for frame in self.ngram_frame_counts[issue].keys():
- if ngram in self.ngram_frame_counts[issue][frame]:
- del self.ngram_frame_counts[issue][frame][ngram]
- pbar.update(1)
- pbar.close()
- # with open("ngram_issue_counts.json", "w") as f:
- # json.dump(self.ngram_issue_counts, f)
- # with open("ngram_frame_counts.json", "w") as f:
- # json.dump(self.ngram_frame_counts, f)
- def generate_lexicon(self):
- pmi_scores = self.get_pmi()
- sorted_pmi_scores = {}
- #sorting pmi scores for each issue/frame
- for issue in self.issues:
- sorted_pmi_scores[issue] = {}
- for frame in pmi_scores[issue]:
- sorted_pmi_scores[issue][frame] = sorted(pmi_scores[issue][frame].items(), key = operator.itemgetter(1), reverse=True)
- f = open("../../data/4forums/pmi_scores.json", "w")
- json.dump(sorted_pmi_scores, f)
- lexicon = {}
- frames = list(self.ngram_frame_counts["abortion"].keys())
- for issue in self.issues:
- lexicon[issue] = {}
- for frame in frames:
- lexicon[issue][frame] = []
- lexicon[issue][frame].append(sorted_pmi_scores[issue][frame][0][0])
- lexicon[issue][frame].append(sorted_pmi_scores[issue][frame][1][0])
- return lexicon
- def get_all_ngrams(self):
- ngram_frame_counts = {}
- ngram_issue_counts = {}
- ngram_in_post = {}
- all_ngrams = set()
- for issue in self.issues:
- ngram_issue_counts[issue] = {}
- ngram_frame_counts[issue] = {}
- ngram_in_post[issue] = {}
- subdir = self.data_folder + issue
- n = len(os.listdir(subdir))
- pbar = tqdm(total=n)
- print("Generating all ngrams for issue: {}".format(issue))
- for filename in os.listdir(subdir):
- post = ""
- with open(os.path.join(subdir, filename), 'r') as fp:
- lines = fp.readlines()
- counter = 0
- while lines[counter].startswith("ID:") == False:
- post += lines[counter]
- counter += 1
- post_id = lines[counter].split(":")[1].replace("\n", "")
- #import pdb; pdb.set_trace()
- frames = self.post2frames[issue][post_id]
- bigrams, trigrams = self.generate_post_ngrams(post)
- for bigram in bigrams:
- bgram_str = " ".join(bigram)
- if bgram_str in ngram_issue_counts[issue]:
- ngram_issue_counts[issue][bgram_str] += 1
- else:
- ngram_issue_counts[issue][bgram_str] = 1
- for frame in frames:
- if frame not in ngram_frame_counts[issue]:
- ngram_frame_counts[issue][frame] = {}
-
- if bgram_str in ngram_frame_counts[issue][frame]:
- ngram_frame_counts[issue][frame][bgram_str] += 1
- else:
- ngram_frame_counts[issue][frame][bgram_str] = 1
- if bgram_str not in ngram_in_post[issue]:
- ngram_in_post[issue][bgram_str] = set()
- ngram_in_post[issue][bgram_str].add(post_id)
-
- for trigram in trigrams:
- tgram_str = " ".join(trigram)
- if tgram_str in ngram_issue_counts[issue]:
- ngram_issue_counts[issue][tgram_str] += 1
- else:
- ngram_issue_counts[issue][tgram_str] = 1
- for frame in frames:
- if frame not in ngram_frame_counts[issue]:
- ngram_frame_counts[issue][frame] = {}
- if tgram_str in ngram_frame_counts[issue][frame]:
- ngram_frame_counts[issue][frame][tgram_str] += 1
- else:
- ngram_frame_counts[issue][frame][tgram_str] = 1
- if tgram_str not in ngram_in_post[issue]:
- ngram_in_post[issue][tgram_str] = set()
- ngram_in_post[issue][tgram_str].add(post_id)
- pbar.update(1)
- pbar.close()
- with open("ngram_issue_counts.json", "w") as f:
- json.dump(ngram_issue_counts, f)
- with open("ngram_frame_counts.json", "w") as f:
- json.dump(ngram_frame_counts, f)
- with open("ngram_in_post.pkl", "wb") as f:
- pickle.dump(ngram_in_post, f)
-
- return ngram_issue_counts, ngram_frame_counts, ngram_in_post
# Entry point left commented out by the author; de-garbled from the scraped
# listing but deliberately NOT re-enabled.
# if __name__ == "__main__":
#     lexicon_generator = LexiconGenerator(sys.argv[1], sys.argv[2:4], sys.argv[4], sys.argv[5])
#     lexicon = lexicon_generator.generate_lexicon()
#     with open("../../data/4forums/4forums_lexicon.json", "w") as outfile:
#         json.dump(lexicon, outfile)