
/DocumentSummarization/TextRank.py

https://bitbucket.org/arka7z/information-retrieval
# coding: utf-8
# In[ ]:
from __future__ import division
import os
import nltk
import time
import pickle
import io
import math
import numpy as np
from nltk.stem import WordNetLemmatizer
from functools import reduce
from bs4 import BeautifulSoup
from nltk.tokenize.punkt import PunktSentenceTokenizer
from scipy import spatial

word_to_id_map = dict()
id_to_word = dict()
word_to_doc_freq = dict()
sentence_to_id = dict()
id_to_sentence = dict()
sentence_count = 0
word_count = 0
total_docs = 0
sentence_tf_idf_vec = dict()
wordnet_lemmatizer = WordNetLemmatizer()
path = os.getcwd()
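# Note: nltk.sent_tokenize/word_tokenize need the NLTK "punkt" data package and
# WordNetLemmatizer needs "wordnet"; if they are missing, run nltk.download("punkt")
# and nltk.download("wordnet") once beforehand.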
# In[ ]:
# Reset all module-level state before processing a new topic
def init():
    global word_to_id_map
    global id_to_word
    global word_to_doc_freq
    global sentence_to_id
    global id_to_sentence
    global sentence_count
    global word_count
    global total_docs
    global sentence_tf_idf_vec
    word_to_id_map = dict()
    id_to_word = dict()
    word_to_doc_freq = dict()
    sentence_to_id = dict()
    id_to_sentence = dict()
    sentence_count = 0
    word_count = 0
    total_docs = 0
    sentence_tf_idf_vec = dict()
# In[ ]:
# Parse every file under the given topic directory and collect sentences,
# word ids and per-word document frequencies
def parse_file(topic_num):
    global word_to_id_map
    global id_to_word
    global word_to_doc_freq
    global sentence_to_id
    global id_to_sentence
    global sentence_count
    global word_count
    global total_docs
    global sentence_tf_idf_vec
    for file in os.listdir(path + "/Assignement2_IR/Topic" + str(topic_num)):
        total_docs += 1
        f = io.open(os.path.join(path + "/Assignement2_IR/Topic" + str(topic_num), file), 'r', encoding='utf-8')
        file_content = f.read()
        f.close()
        soup = BeautifulSoup(file_content, "lxml")
        text_group = soup.get_text()
        text_group = ' '.join(text_group.strip().split('\n'))
        sentences = nltk.sent_tokenize(text_group)
        doc_word_set = set()
        for sentence in sentences:
            tokens = nltk.word_tokenize(sentence)
            tokens = [token.lower() for token in tokens]
            wordset = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
            sentence_to_id[sentence] = sentence_count
            id_to_sentence[sentence_count] = sentence
            for word in wordset:
                doc_word_set.add(word)
                if word not in word_to_id_map:
                    word_to_id_map[word] = word_count
                    id_to_word[word_count] = word
                    word_count += 1
            sentence_count += 1
        for dist_word in doc_word_set:
            if dist_word in word_to_doc_freq:
                word_to_doc_freq[dist_word] += 1
            else:
                word_to_doc_freq[dist_word] = 1
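# The loop above assumes the corpus lives in ./Assignement2_IR/Topic<k>/ relative to the
# working directory, with one markup-formatted document per file; BeautifulSoup strips the
# tags before sentence splitting.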
# In[ ]:
# Build the TF-IDF vector representation of each sentence
def get_tf_idf_vec():
    global word_to_id_map
    global id_to_word
    global word_to_doc_freq
    global sentence_to_id
    global id_to_sentence
    global sentence_count
    global word_count
    global total_docs
    global sentence_tf_idf_vec
    for sentence in sentence_to_id:
        tokens = nltk.word_tokenize(sentence)
        tokens = [token.lower() for token in tokens]
        wordset = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
        sentence_tf_idf_vec[sentence] = [0.0] * word_count
        for word in wordset:
            sentence_tf_idf_vec[sentence][word_to_id_map[word]] += 1
        wordset = list(set(wordset))
        for dist_word in wordset:
            sentence_tf_idf_vec[sentence][word_to_id_map[dist_word]] *= math.log(total_docs / word_to_doc_freq[dist_word]) / math.log(2)
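# Weighting used above: for a word w in sentence s,
#     tfidf(w, s) = count(w in s) * log2(total_docs / doc_freq(w))
# i.e. raw term frequency scaled by an inverse document frequency over the topic's documents.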
# In[ ]:
# Returns the cosine similarity between the vector representations of the two sentences
def get_cosine_similarity(sentence1, sentence2):
    global word_to_id_map
    global id_to_word
    global word_to_doc_freq
    global sentence_to_id
    global id_to_sentence
    global sentence_count
    global word_count
    global total_docs
    global sentence_tf_idf_vec
    sim = 1 - spatial.distance.cosine(sentence_tf_idf_vec[sentence1], sentence_tf_idf_vec[sentence2])
    return sim
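# scipy.spatial.distance.cosine returns the cosine *distance* 1 - (u . v) / (||u|| ||v||),
# so subtracting it from 1 recovers the cosine similarity, which lies in [0, 1] for these
# non-negative TF-IDF vectors.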
# In[ ]:
# Construct the similarity graph between sentences, keeping only edges whose
# cosine similarity reaches the threshold passed as a parameter
def construct_graph(threshold):
    global word_to_id_map
    global id_to_word
    global word_to_doc_freq
    global sentence_to_id
    global id_to_sentence
    global sentence_count
    global word_count
    global total_docs
    global sentence_tf_idf_vec
    graph = dict()
    Degree = [0] * sentence_count
    for i in range(sentence_count):
        graph[i] = [0.0] * sentence_count
        for j in range(sentence_count):
            graph[i][j] = get_cosine_similarity(id_to_sentence[i], id_to_sentence[j])
            if graph[i][j] < threshold:
                graph[i][j] = 0
            else:
                graph[i][j] = 1
                Degree[i] += 1
    for i in range(sentence_count):
        for j in range(sentence_count):
            if Degree[i] != 0:
                graph[i][j] /= Degree[i]
    return graph, Degree
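# After thresholding, each surviving edge in row i is divided by Degree[i], so every
# non-isolated row of `graph` sums to 1 and the matrix can be read as the transition
# kernel of a random walk over sentences.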
# In[ ]:
# Power method to find the left eigenvector of the transition kernel
def power_method(graph, damping_factor, tolerance):
    d = damping_factor
    global sentence_count
    tmp = list()
    for key, value in graph.items():
        tmp.append(value)
    graph = np.asarray(tmp)
    p_old = [1.0 / sentence_count] * sentence_count
    U = [p_old] * len(p_old)
    U = np.asarray(U)
    p_old = np.asarray(p_old).T
    iter_count = 0
    while True:
        iter_count += 1
        p = (d * U + (1 - d) * graph).T.dot(p_old)
        if np.sqrt(np.sum((p_old - p) ** 2)) < tolerance:
            break
        p_old = p
    return p
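# Each iteration applies the damped update
#     p_new = [d * U + (1 - d) * G]^T . p_old
# where U is the uniform matrix with every entry 1/N and G is the row-normalized
# similarity graph; iteration stops once ||p_new - p_old||_2 < tolerance, and the
# resulting stationary vector p scores every sentence.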
# In[ ]:
# Select sentences on the basis of the TextRank scores and write them to a file
def write_summary_to_file(topic_num, threshold):
    global word_to_id_map
    global id_to_word
    global word_to_doc_freq
    global sentence_to_id
    global id_to_sentence
    global sentence_count
    global word_count
    global total_docs
    global sentence_tf_idf_vec
    init()
    parse_file(topic_num)
    get_tf_idf_vec()
    graph, Degree = construct_graph(threshold)
    p = power_method(graph, 0.15, 1e-5)
    words_added = 0
    summary = ""
    p = np.argsort(p)
    p = p[::-1]  # sentence ids sorted by descending score
    for highest_degree_id in p:
        sentence = id_to_sentence[highest_degree_id]
        summary += sentence + " "  # separate sentences with a space
        tokens = nltk.word_tokenize(sentence)
        words_added += len(tokens)
        if words_added > 250:
            break
    with open("TextRank_Summary_" + str(topic_num) + "_Threshold_" + str(threshold) + ".txt", 'a') as file:
        file.write(summary)
# In[ ]:
threshold_list = [0.1, 0.2, 0.3]
for topic in range(1, 6):
    for thresh in threshold_list:
        write_summary_to_file(topic, thresh)
# In[ ]:
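# Example: to regenerate a single summary instead of the full sweep above, one could
# call e.g. write_summary_to_file(1, 0.2), which (given the Assignement2_IR/Topic1
# directory) appends the result to "TextRank_Summary_1_Threshold_0.2.txt" in the
# working directory.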