/get_wiki_articles_run.py

https://github.com/dedan/runs-gensim
#!/usr/bin/env python
# encoding: utf-8
"""
Get the Wikipedia articles for lists of words provided in text files.

The text files have to be together in one folder, which must also contain a
reference.queries file with the human ratings. Output is an articles.pickle
file that can later be investigated with, e.g.,
* topic_clusters_task
* topic_noise_task
A further output is the file info.pickle, which contains information on
missing words, redirects, etc.
"""
from gensim.corpora import wikicorpus
from gensim.parsing.preprocessing import preprocess_string
import codecs
import glob
import mwclient
import os
import pickle
import re
import sys
import tools
import unicodedata as ud
import urllib

def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    logger = tools.get_logger('gensim', os.path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # initializations
    articles = {}
    all_missing = []
    redir_on = {}
    collisions = {}
    non_ascii = []
    site = mwclient.Site('en.wikipedia.org', '/w/api.php/')

    # get all txt files in a folder and iterate over them
    filelist = glob.glob(os.path.join(base_path,
                                      p['folder_path'],
                                      "*.txt"))
    for f in filelist:

        # get the word we are working on
        f_name = os.path.basename(f)
        k_word = os.path.splitext(f_name)[0]
        logger.info("working on file: %s" % f_name)

        # try to convert the word into ascii for the http query
        file_obj = codecs.open(f, "r", "utf-8")
        counter = 0
        words = []
        for w in file_obj.readlines():
            try:
                s = w.strip().decode('ascii')
                words.append(s)
            except Exception:
                counter += 1
                non_ascii.append(w.strip())
        logger.info("\t%d words containing non ascii are omitted" % counter)

        articles[k_word] = {}
        logger.info("\tfound %d words in file" % len(words))
        for word in words:
            data = {}
            page = site.Pages[word]

            # follow the redirect and check for collisions
            if page.redirect:
                res = re.search(r'\[\[(.+)\]\]', page.edit())
                redir_word = urllib.unquote(res.groups()[0])
                if redir_word in redir_on:
                    logger.warning("[%s AND %s] both redirect on --> %s" %
                                   (word, redir_on[redir_word], redir_word))
                    collisions[redir_word] = redir_on[redir_word]
                else:
                    logger.info("[%s] redir from [%s]" % (redir_word, word))
                    redir_on[redir_word] = word
                text = site.Pages[redir_word].edit()
                data['redirected'] = redir_word
            else:
                text = page.edit()

            # check for missing wikipedia articles
            if text == "":
                all_missing.append(word)
                continue

            # preprocess the received article
            data['text'] = wikicorpus.filter_wiki(text)
            in_ascii = ud.normalize('NFKD',
                                    data['text']).encode('ascii', 'ignore')
            data['text'] = preprocess_string(in_ascii)
            articles[k_word][word] = data

    logger.info('add human rating to the articles')

    # mapping from the numeric ids used in the ratings file to words
    id_word = {}
    sparql_path = os.path.join(base_path, p['sparql_path'])
    with open(os.path.join(sparql_path, 'id_word.txt')) as f:
        for line in f.readlines():
            idx, word = line.strip().split('\t')
            id_word[idx] = word

    # add human rating to the wikipedia data
    # (each line: column 0 holds the word id, column 3 the term,
    #  column 4 the numeric rating)
    not_found = []
    with open(os.path.join(base_path,
                           p['folder_path'],
                           p['human_file'])) as f:
        for line in f.readlines():
            arr = line.split()
            word = id_word[arr[0]]
            term = arr[3]
            try:
                articles[word][term]['rating'] = float(arr[4])
            except KeyError:
                not_found.append(term)
    logger.info("%d words from the ref queries not found" % len(not_found))

    # dump the collected articles and the bookkeeping info
    f = open(os.path.join(output_dir, "articles.pickle"), 'wb')
    pickle.dump(articles, f)
    f.close()

    info = {}
    info['missing'] = all_missing
    info['redirs'] = redir_on
    info['collisions'] = collisions
    info['not_found'] = not_found
    info['non_ascii'] = non_ascii

    f = open(os.path.join(output_dir, "info.pickle"), 'wb')
    pickle.dump(info, f)
    f.close()

    logger.info("%d redirecting collisions (see info.pickle)" % len(collisions))


if __name__ == '__main__':
    main()
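
Both outputs are plain pickled dictionaries, so they can be inspected directly once the run has finished. A minimal sketch, assuming the run directory created by tools.setup() is called output/ (adjust the paths to your own run):

import pickle

# articles.pickle: {list_name: {word: {'text': [tokens], 'rating': float,
#                                      'redirected': redirect_target}}}
# ('rating' and 'redirected' are only present where applicable)
with open('output/articles.pickle', 'rb') as f:   # path is an assumption
    articles = pickle.load(f)

# info.pickle: bookkeeping on missing pages, redirects, collisions, etc.
with open('output/info.pickle', 'rb') as f:
    info = pickle.load(f)

for list_name, words in articles.items():
    rated = [w for w in words if 'rating' in words[w]]
    print("%s: %d articles, %d with a human rating"
          % (list_name, len(words), len(rated)))

print("missing articles: %d" % len(info['missing']))
print("redirect collisions: %d" % len(info['collisions']))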