/ver-2/main.py

https://github.com/yasulab/author-evaluator · Python · 234 lines · 185 code · 27 blank · 22 comment · 45 complexity · adbe8a53dc309360d56fbbcc01eda41c MD5 · raw file

  1. #!/usr/bin/python
  2. #! -*- coding: utf-8 -*-
  3. import sys
  4. import re
  5. import os
  6. import copy
  7. import math, random
  8. from optparse import OptionParser
  9. COUNTER = 0
  10. WORD = 1
  11. VFLAG = False
  12. START = "START"
  13. P_OF_UNKNOWN_WORD = 10**-8
  14. def read_file(filename):
  15. if os.path.exists('./'+filename) == False:
  16. print "No such a file."
  17. return
  18. input = open(filename, "r")
  19. return input.read()
  20. def get_num_of_words(given_text):
  21. return len(get_word_list(given_text))
  22. def sum_of_table(table):
  23. sum = 0.0
  24. for word in sorted(table.keys()):
  25. #print word, table[word]
  26. sum += float(table[word])
  27. return sum
  28. def pick_rand(ptable, last_word):
  29. next_word = ""
  30. if ptable.has_key(last_word):
  31. rand = random.random() * sum_of_table(ptable[last_word])
  32. #print sum_of_table(ptable[last_word])
  33. #print rand
  34. counter = 0
  35. for word in sorted(ptable[last_word].keys()):
  36. counter += ptable[last_word][word]
  37. if rand < counter:
  38. next_word = word
  39. break
  40. else:
  41. next_word = "There are no possible words."
  42. return next_word
  43. def get_ptable_len(ptable):
  44. ptable_len = 0.0
  45. for word in ptable:
  46. ptable_len += len(ptable[word])
  47. return ptable_len
  48. def show_all_ptable(ptable):
  49. ptable_len = get_ptable_len(ptable)
  50. sum_prob = 0
  51. for word in sorted(ptable.keys()):
  52. print "WORD |\tPROB | NEXT_WORD\t"
  53. print word
  54. for nw in sorted(ptable[word].keys()):
  55. print "\t%1f | %s" % (ptable[word][nw], nw)
  56. print
  57. print
  58. def get_word_list(text):
  59. p = re.compile(r'\W+')
  60. list = p.split(text)
  61. for i,word in enumerate(list):
  62. if not word:
  63. list.pop(i)
  64. return list
  65. def get_unigram(given_text, given_num_of_words):
  66. word_list = get_word_list(given_text)
  67. unigram = dict()
  68. num_of_words = 0.0
  69. for word in word_list:
  70. word = word.lower()
  71. #unigram[word] = float(unigram.get(word, 0) + 1.0) / len(word_list)
  72. unigram[word] = (unigram.get(word, 0) + 1.0)
  73. #print word, unigram[word]
  74. num_of_words += 1.0
  75. #print num_of_words, given_num_of_words
  76. if given_num_of_words == 0:
  77. continue
  78. elif num_of_words > float(given_num_of_words):
  79. break
  80. for word in unigram:
  81. unigram[word] = unigram[word] / len(unigram)
  82. #print word, unigram[word]
  83. return unigram
  84. def get_log_unigram(given_text, given_num_of_words):
  85. unigram = get_unigram(given_text, given_num_of_words)
  86. for word in unigram:
  87. unigram[word] = math.log(unigram[word])
  88. return unigram
  89. def get_bigram(given_text, given_num_of_words):
  90. word_list = get_word_list(given_text)
  91. bigram = dict()
  92. num_of_words = 0.0
  93. prev_word = START
  94. for word in word_list:
  95. word = word.lower()
  96. if not bigram.has_key(prev_word):
  97. #print word
  98. bigram[prev_word] = dict()
  99. #print "DUP: ", word
  100. bigram[prev_word][word] = bigram.get(prev_word, WORD).get(word, COUNTER)+1.0
  101. num_of_words += 1.0
  102. prev_word = word
  103. #print num_of_words, given_num_of_words
  104. if given_num_of_words == 0:
  105. continue
  106. elif num_of_words > float(given_num_of_words):
  107. break
  108. for pw in bigram:
  109. for w in bigram[pw]:
  110. bigram[pw][w] = bigram[pw][w] / get_ptable_len(bigram)
  111. #print pw, w, bigram[pw][w], get_ptable_len(bigram)
  112. return bigram
  113. def get_last_sentence(given_text, given_num_of_words):
  114. word_list = get_word_list(given_text)
  115. if not given_num_of_words == 0:
  116. word_list = word_list[:given_num_of_words]
  117. for i,word in enumerate(word_list):
  118. if not word:
  119. word_list.pop(i)
  120. word_list.reverse()
  121. sentence = []
  122. for word in word_list:
  123. sentence.append(word)
  124. if not word:
  125. continue
  126. if word[0].isupper():
  127. break
  128. sentence.reverse()
  129. return sentence
  130. def get_reversed_given_num(given_text, given_num_of_words):
  131. word_list = get_word_list(given_text)
  132. if given_num_of_words < 0:
  133. return len(word_list) + given_num_of_words
  134. else:
  135. return given_num_of_words
  136. def create_ptable(unigram, bigram):
  137. ptable = copy.deepcopy(bigram)
  138. for pw in bigram:
  139. for w in bigram[pw]:
  140. if pw == START:
  141. """
  142. P(w|START) = P(w)
  143. """
  144. ptable[pw][w] = unigram[w]
  145. else:
  146. """
  147. P(w|prev) = (P(prev|w)P(w)) / P(prev)
  148. """
  149. ptable[pw][w] = (bigram[pw][w] * unigram[w])/unigram[pw]
  150. return ptable
  151. def create_log_ptable(unigram, bigram):
  152. ptable = create_ptable(unigram, bigram)
  153. for pw in ptable:
  154. for w in ptable[pw]:
  155. ptable[pw][w] = math.log(ptable[pw][w])
  156. return ptable
  157. def create_log_ptable_from_filename(filename):
  158. given_text = read_file(filename)
  159. unigram = get_unigram(given_text, 0)
  160. bigram = get_bigram(given_text, 0)
  161. ptable = create_log_ptable(unigram, bigram)
  162. return ptable
  163. def get_score(ptable, text):
  164. word_list = get_word_list(text)
  165. pw = "START"
  166. result = 0.0
  167. for w in word_list:
  168. w = w.lower()
  169. if ptable.has_key(pw) and ptable[pw].has_key(w):
  170. result += ptable[pw][w]
  171. else:
  172. """
  173. The bigram is an unknown set.
  174. """
  175. result += math.log(P_OF_UNKNOWN_WORD)
  176. pw = w
  177. return result
  178. if __name__ == "__main__":
  179. argv_len = len(sys.argv)
  180. if not argv_len == 4:
  181. print "Usage: python main.py FILENAME_1 FILENAME_2 TEXT"
  182. exit()
  183. filename_1 = sys.argv[1]
  184. filename_2 = sys.argv[2]
  185. given_text = sys.argv[3]
  186. ptable_1 = create_log_ptable_from_filename(filename_1)
  187. ptable_2 = create_log_ptable_from_filename(filename_2)
  188. print "******************"
  189. print "*%s *" % filename_1
  190. print "******************"
  191. show_all_ptable(ptable_2)
  192. show_all_ptable(ptable_1)
  193. print "******************"
  194. print "*%s *" % filename_2
  195. print "******************"
  196. show_all_ptable(ptable_2)
  197. score_1 = get_score(ptable_1, given_text)
  198. score_2 = get_score(ptable_2, given_text)
  199. print "Given Text:", given_text
  200. print "Score for '%s': %d" % (filename_1, score_1)
  201. print "Score for '%s': %d" % (filename_2, score_2)
  202. if score_1 == score_2:
  203. print "Can't determine who is the author."
  204. elif score_1 > score_2:
  205. print "Given text's author will be the author of '%s'." % filename_1
  206. else:
  207. print "Given text's author will be the author of '%s'." % filename_2