PageRenderTime 28ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/scripts/download_russian_contrast.py

https://bitbucket.org/Meister17/term-extraction
Python | 108 lines | 100 code | 6 blank | 2 comment | 11 complexity | 6e889c042c5f6e147d96f0dbc223cee6 MD5 | raw file
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. from BeautifulSoup import BeautifulSoup
  4. import optparse
  5. import os
  6. from time import sleep
  7. import urllib2
  8. URL_BEGIN = ('http://search.ruscorpora.ru/search.xml?env=alpha&mycorp=&mysent=' +
  9. '&mysize=&mysentsize=&mydocsize=&spd=&text=lexgramm&mode=main&sort' +
  10. '=gr_tagging&lang=ru&nodia=1&parent1=0&level1=0&lex1=')
  11. URL_MIDDLE = ('&gramm1=&sem1=&sem-mod1=sem&sem-mod1=sem2&flags1=&m1=&parent2=0&' +
  12. 'level2=0&min2=1&max2=1&lex2=')
  13. URL_END = '&gramm2=&sem2=&sem-mod2=sem&sem-mod2=sem2&flags2=&m2='
  14. def load_file(filename):
  15. terms = set()
  16. if os.path.exists(filename):
  17. with open(filename, 'r') as input:
  18. for line in input:
  19. tokens = line.strip().split('\t')
  20. if len(tokens) > 2:
  21. terms.add(tokens[0])
  22. return terms
  23. def download_frequency(term):
  24. try:
  25. words = term.split()
  26. if len(words) == 1:
  27. response = urllib2.urlopen(URL_BEGIN + urllib2.quote(term) + URL_MIDDLE + URL_END)
  28. elif len(words) == 2:
  29. response = urllib2.urlopen(URL_BEGIN + urllib2.quote(words[0]) + URL_MIDDLE + urllib2.quote(words[1]) + URL_END)
  30. else:
  31. raise Exception('Wrong number of words')
  32. html = response.read()
  33. soup = BeautifulSoup(html)
  34. zero_result = False;
  35. for p in soup.findAll('p'):
  36. if u'ничего не найдено' in p.text:
  37. zero_result = True
  38. break
  39. if zero_result:
  40. return 0, 0
  41. spans = soup.findAll('span', {'class': 'stat-number'})
  42. return int(''.join(spans[3].text.split())), int(''.join(spans[4].text.split()))
  43. except Exception, e:
  44. raise Exception(e)
  45. def download_frequencies(terms, filename):
  46. output = None
  47. try:
  48. if not os.path.exists(filename):
  49. output = open(filename, 'w')
  50. output.write('85996\t229968798\n')
  51. else:
  52. output = open(filename, 'a')
  53. for term in sorted(terms):
  54. df, tf = download_frequency(term)
  55. print term + '\t' + str(df) + '\t' + str(tf)
  56. output.write(term + '\t' + str(df) + '\t' + str(tf) + '\n')
  57. finally:
  58. if output is not None:
  59. output.close()
  60. def download_zero_frequencies(filename):
  61. lines = []
  62. with open(filename, 'r') as input:
  63. request_number = 0
  64. for line in input:
  65. tokens = line.strip().split('\t')
  66. if tokens[1] == '0' and tokens[2] == '0':
  67. if request_number > 4000 and request_number <= 5000:
  68. df, tf = download_frequency(tokens[0])
  69. print str(request_number) + '\t' + tokens[0] + '\t' + str(df) + '\t' + str(tf)
  70. lines.append(tokens[0] + '\t' + str(df) + '\t' + str(tf) + '\n')
  71. else:
  72. lines.append(line)
  73. request_number += 1
  74. else:
  75. lines.append(line)
  76. with open(filename, 'w') as output:
  77. for line in lines:
  78. output.write(line)
  79. if __name__ == '__main__':
  80. parser = optparse.OptionParser(usage='Usage: %prog [options] <input_file> <output_file>')
  81. parser.add_option('-p', '--proxy', dest='proxy', type=str,
  82. help='Proxy server to use')
  83. options, args = parser.parse_args()
  84. if len(args) != 2:
  85. parser.error('Incorrect usage')
  86. input_file = args[0]
  87. output_file = args[1]
  88. terms = load_file(output_file)
  89. new_terms = load_file(input_file)
  90. new_terms -= terms
  91. if options.proxy is not None:
  92. proxy_support = urllib2.ProxyHandler({"http": "http://" + options.proxy})
  93. opener = urllib2.build_opener(proxy_support)
  94. urllib2.install_opener(opener)
  95. download_frequencies(new_terms, output_file)
  96. download_zero_frequencies(output_file)