generate_lexicon.py | searchcode

/examples/frames/utils/4forums/generate_lexicon.py

https://gitlab.com/purdueNlp/DRaiL · Python · 369 lines · 259 code · 69 blank · 41 comment · 66 complexity · 2cec951434e5be97157f21613c0bb581 MD5 · raw file

from __future__ import unicode_literals
import json
import os
import math
from tqdm import tqdm 
import nltk
import string
from inflection import singularize
import re
import sys
import pickle
import json
import operator
from nltk.stem import LancasterStemmer
from nltk.util import ngrams
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize
import operator
from nltk.corpus import wordnet as wn




class LexiconGenerator():
    def __init__(self, data_folder, issues, lower_bound, upper_bound):
        self.data_folder = data_folder
        self.issues = issues
        self.lower_bound = float(lower_bound)
        self.upper_bound = float(upper_bound)
        self.lancaster = LancasterStemmer()
        self.post2frames, self.frame2posts = self.annotate_posts()
        self.ngram_issue_counts, self.ngram_frame_counts, self.ngram_in_post = self.get_all_ngrams()
        # with open(os.getcwd() + "/ngram_issue_counts.json", "r") as f:
        #     self.ngram_issue_counts = json.load(f)
        # with open(os.getcwd() + "/ngram_frame_counts.json", "r") as f:
        #     self.ngram_frame_counts = json.load(f)
        self.ngram_in_post = pickle.load(open(os.getcwd() + "/ngram_in_post.pkl", "rb"))
        self.prune_ngrams(self.lower_bound, self.upper_bound)        


    def annotate_posts(self):
        post2frames = {}
        frames2posts = {}
        with open("../../data/frame_subframe_lexicon.json") as f:
            indicator_list = json.load(f)
            frame_lexicon = indicator_list[0]

        for issue in self.issues:
            post2frames[issue] = {}
            frames2posts[issue] = {}
            subdir = self.data_folder + issue
            n = len(os.listdir(subdir))
            print("Annotating Posts for issue {}: ".format(issue))
            pbar = tqdm(total=n)
            for filename in os.listdir(subdir):
                post = ""
                with open(os.path.join(subdir, filename), 'r') as fp:
                    lines = fp.readlines()
                    counter = 0
                    while lines[counter].startswith("ID:") == False:
                        post += lines[counter]
                        counter += 1
                    post_id = lines[counter].split(":")[1].replace("\n", "")
                    frames = self.annotate_post(post, frame_lexicon)
                    post2frames[issue][post_id] = frames
                    for frame in frames:
                        if frame not in frames2posts[issue]:
                            frames2posts[issue][frame] = set()
                        frames2posts[issue][frame].add(post_id)
                        
                pbar.update(1)
            pbar.close()

        return post2frames, frames2posts

    def annotate_post(self, post, frame_lexicon):
        post_rankings = {}

        tokens = word_tokenize(post.lower())
        stemmed_tokens = []
        for token in tokens:
            stemmed_tokens.append(self.lancaster.stem(token))
        stemmed_tokens = "_" + "_".join(stemmed_tokens) + "_"

        for frame in frame_lexicon:
            post_rankings[frame] = 0
            indicators = frame_lexicon[frame]
            for indicator in indicators:
                if indicator in stemmed_tokens:
                    post_rankings[frame] += 1


        frames = list(frame_lexicon.keys())
        sorted_frames = sorted(frames, key=lambda x: post_rankings[x], reverse=True)
        
        return set(sorted_frames[:2])
            


    def preprocess(self, post):
        post = re.sub(r'http\S+', '', post)
        post = post.replace('&#160;', ' ')
        post = post.replace('’', '\'')
        post = post.replace("“", "\"")
        post = post.replace("”", "\"")
        post = post.replace("—", "-")

        post = post.replace("\n", " ")
        post = post.replace("\t", " ")
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        post = regex.sub(' ', post)
        post=post.lower()

        return post

    def generate_post_ngrams(self, post):

        def is_ascii(s):
            return all(ord(c) < 128 for c in s)

        # stop_words=['hundred', 'thousand', 'news', 'daily', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'per', 'a', 'an',\
        #            'the', 'and', 'but', 'or', 'should', 'would', 'might', 'could', 'of', 'with', 'at', 'from', 'into', 'during', 'including', 'until', 'against',\
        #            'among', 'throughout', 'despite', 'towards', 'upon', 'concerning', 'to', 'in', 'for', 'on', 'by', 'about', 'like', 'through', 'over', 'before',\
        #            'between', 'after', 'since', 'without', 'under', 'within', 'along', 'following', 'across', 'behind', 'beyond', 'plus', 'except', 'but', 'up',\
        #            'out', 'around', 'down', 'off', 'above', 'near']

        post = self.preprocess(post)
        tokens = nltk.word_tokenize(post)

        initial_bigrams=ngrams(tokens, 2)
        initial_trigrams=ngrams(tokens, 3)

        bigrams = []
        trigrams = []

        for bigram in initial_bigrams:
            flag=0
            modified=[]
            for w in bigram:
                if w == "" or is_ascii(w) == False or w.isdigit() == True or w in stopwords.words('english'):
                    flag=1
                    continue

                w_singularized = str(singularize(w))
                if w_singularized.isdigit() == False and w not in stopwords.words('english'):
                    w = self.lancaster.stem(w)
                else:
                    flag=1
                    continue
                modified.append(w)
            if flag==0:
                bigrams.append(modified)

        for trigram in initial_trigrams:
            flag=0
            modified=[]
            for w in trigram:
                #if w == "" or is_ascii(w) == False or w.isdigit() == True or w in stopwords.words('english') or w in stop_words:
                if w == "" or is_ascii(w) == False or w.isdigit() == True:
                    flag=1
                    continue

                w_singularized = str(singularize(w))
                if w_singularized.isdigit() == False:
                #if w_singularized.isdigit() == False and w not in stopwords.words('english') and w not in stop_words:
                    w = self.lancaster.stem(w)
                else:
                    flag=1
                    continue
                modified.append(w)
            if flag==0:
                trigrams.append(modified)

        return bigrams, trigrams

    def get_pmi(self):
        '''
        PMI: {
            abortion: {
                frame_1: {
                    ngram_1: pmi(ngram)
                    .
                    .
                    .
                },
                frame_2: {},
                .
                .
                .
                frame_n: {}
            },
            guns: {
            }
        }
        '''
        frames = list(self.ngram_frame_counts["abortion"].keys())
        PMI = {}
        print("Calculating PMI Scores...............")
        for issue in self.issues:
            PMI[issue] = {}
            for frame in frames:
                PMI[issue][frame] = {}
                n = len(self.ngram_issue_counts[issue])
                pbar = tqdm(total=n)
                print("Calculating PMI for issue: {} and frame: {}".format(issue, frame))
                for ngram in self.ngram_frame_counts[issue][frame]:

                    total_issue_count = len(self.ngram_issue_counts[issue]) #Count of all ngrams in issue
                    denominator = float(self.ngram_issue_counts[issue][ngram] / total_issue_count) #P(g)

                    ngram_frame_count = self.ngram_frame_counts[issue][frame][ngram] #count(ngram in frame)
                    total_frame_count = len(self.ngram_frame_counts[issue][frame]) #count (all ngrams in frame)
                    numerator = float(ngram_frame_count / total_frame_count)

                    pmi = math.log10(float(numerator/denominator))
                    PMI[issue][frame][ngram] = pmi
                    pbar.update(1)
                pbar.close()

        return PMI

    def prune_ngrams(self, lower_threshold, upper_threshold):
        lower_threshold = float(lower_threshold / 100)
        upper_threshold = float(upper_threshold / 100)
        print("Pruning ngrams..............")
        for issue in self.issues:
            n = len(self.ngram_in_post[issue])
            pbar = tqdm(total=n)
            print("Pruning ngrams for issue: {}".format(issue))
            for ngram in self.ngram_in_post[issue]:
                num_posts = len(self.ngram_in_post[issue][ngram])
                subdir = self.data_folder + issue
                n = len(os.listdir(subdir))

                low_bound = float(lower_threshold * n)
                upp_bound = float(upper_threshold * n)

                if num_posts < low_bound or num_posts > upp_bound:
                    del self.ngram_issue_counts[issue][ngram]
                    for frame in self.ngram_frame_counts[issue].keys():
                        if ngram in self.ngram_frame_counts[issue][frame]:
                            del self.ngram_frame_counts[issue][frame][ngram]

                pbar.update(1)
            pbar.close()

        # with open("ngram_issue_counts.json", "w") as f:
        #     json.dump(self.ngram_issue_counts, f)
        # with open("ngram_frame_counts.json", "w") as f:
        #     json.dump(self.ngram_frame_counts, f)


    def generate_lexicon(self):
        pmi_scores = self.get_pmi()
        sorted_pmi_scores = {}

        #sorting pmi scores for each issue/frame
        for issue in self.issues:
            sorted_pmi_scores[issue] = {}
            for frame in pmi_scores[issue]:
                sorted_pmi_scores[issue][frame] = sorted(pmi_scores[issue][frame].items(), key = operator.itemgetter(1), reverse=True)


        f = open("../../data/4forums/pmi_scores.json", "w")
        json.dump(sorted_pmi_scores, f)

        lexicon = {}
        frames = list(self.ngram_frame_counts["abortion"].keys())

        for issue in self.issues:
            lexicon[issue] = {}
            for frame in frames:
                lexicon[issue][frame] = []
                lexicon[issue][frame].append(sorted_pmi_scores[issue][frame][0][0])
                lexicon[issue][frame].append(sorted_pmi_scores[issue][frame][1][0])

        return lexicon


    def get_all_ngrams(self):
        ngram_frame_counts = {}
        ngram_issue_counts = {}
        ngram_in_post = {}
        all_ngrams = set()

        for issue in self.issues:
            ngram_issue_counts[issue] = {}
            ngram_frame_counts[issue] = {}
            ngram_in_post[issue] = {}
            subdir = self.data_folder + issue
            n = len(os.listdir(subdir))
            pbar = tqdm(total=n)
            print("Generating all ngrams for issue: {}".format(issue))
            for filename in os.listdir(subdir):
                post = ""
                with open(os.path.join(subdir, filename), 'r') as fp:
                    lines = fp.readlines()
                    counter = 0
                    while lines[counter].startswith("ID:") == False:
                        post += lines[counter]
                        counter += 1
                    post_id = lines[counter].split(":")[1].replace("\n", "")
                    #import pdb; pdb.set_trace()
                    frames = self.post2frames[issue][post_id]
                    bigrams, trigrams = self.generate_post_ngrams(post)

                    for bigram in bigrams:
                        bgram_str = " ".join(bigram)
                        if bgram_str in ngram_issue_counts[issue]:
                            ngram_issue_counts[issue][bgram_str] += 1
                        else:
                            ngram_issue_counts[issue][bgram_str] = 1


                        for frame in frames:
                            if frame not in ngram_frame_counts[issue]:
                                ngram_frame_counts[issue][frame] = {}
                    
                            if bgram_str in ngram_frame_counts[issue][frame]:
                                ngram_frame_counts[issue][frame][bgram_str] += 1
                            else:
                                ngram_frame_counts[issue][frame][bgram_str] = 1

                        if bgram_str not in ngram_in_post[issue]:
                            ngram_in_post[issue][bgram_str] = set()
                        ngram_in_post[issue][bgram_str].add(post_id)

                    
                    for trigram in trigrams:
                        tgram_str = " ".join(trigram)
                        if tgram_str in ngram_issue_counts[issue]:
                            ngram_issue_counts[issue][tgram_str] += 1
                        else:
                            ngram_issue_counts[issue][tgram_str] = 1


                        for frame in frames:
                            if frame not in ngram_frame_counts[issue]:
                                ngram_frame_counts[issue][frame] = {}

                            if tgram_str in ngram_frame_counts[issue][frame]:
                                ngram_frame_counts[issue][frame][tgram_str] += 1
                            else:
                                ngram_frame_counts[issue][frame][tgram_str] = 1

                        if tgram_str not in ngram_in_post[issue]:
                            ngram_in_post[issue][tgram_str] = set()
                        ngram_in_post[issue][tgram_str].add(post_id)

                pbar.update(1)
            pbar.close()

        with open("ngram_issue_counts.json", "w") as f:
            json.dump(ngram_issue_counts, f)
        with open("ngram_frame_counts.json", "w") as f:
            json.dump(ngram_frame_counts, f)
        with open("ngram_in_post.pkl", "wb") as f:
            pickle.dump(ngram_in_post, f)


        
        return ngram_issue_counts, ngram_frame_counts, ngram_in_post

# if __name__ == "__main__":
#     lexicon_generator = LexiconGenerator(sys.argv[1], sys.argv[2:4], sys.argv[4], sys.argv[5])
#     lexicon = lexicon_generator.generate_lexicon()
#     with open("../../data/4forums/4forums_lexicon.json", "w") as outfile:
#         json.dump(lexicon, outfile)
Tech Fingerprint

Alerts (21)

'def' Ensure functions have docstrings for documentation
42 77 101 117 119 223 254 281
'print(' Use logging module for better control and configurability
54 199 206 226 230 294
'list(' Avoid unnecessary list conversions; use generators where possible
94 197 269
Complexity hotspot; lines 140 to 141 (total complexity: 5)
140 141
'del' Avoid unless necessary; Python's garbage collector typically handles object deletion
240 243