download_russian_contrast.py

/scripts/download_russian_contrast.py

https://bitbucket.org/Meister17/term-extraction
Python | 108 lines | 100 code | 6 blank | 2 comment | 11 complexity | 6e889c042c5f6e147d96f0dbc223cee6 MD5 | raw file

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import optparse
import os
from time import sleep
import urllib2


# Pieces of the search request URL. download_frequency() assembles a request
# as URL_BEGIN + <quoted first word> + URL_MIDDLE + <quoted second word, or
# nothing> + URL_END, so URL_BEGIN ends at `lex1=` and URL_MIDDLE at `lex2=`.
URL_BEGIN = (
    'http://search.ruscorpora.ru/search.xml?env=alpha&mycorp=&mysent='
    '&mysize=&mysentsize=&mydocsize=&spd=&text=lexgramm&mode=main&sort'
    '=gr_tagging&lang=ru&nodia=1&parent1=0&level1=0&lex1='
)
URL_MIDDLE = (
    '&gramm1=&sem1=&sem-mod1=sem&sem-mod1=sem2&flags1=&m1=&parent2=0&'
    'level2=0&min2=1&max2=1&lex2='
)
URL_END = '&gramm2=&sem2=&sem-mod2=sem&sem-mod2=sem2&flags2=&m2='


def load_file(filename):
  """Return the set of first-column values found in a tab-separated file.

  Each line is stripped of surrounding whitespace and split on tabs; only
  rows with at least three fields contribute their first field. A missing
  file yields an empty set instead of an error.

  Args:
    filename: path of the file to read.

  Returns:
    A set of strings (the first field of every qualifying row).
  """
  if not os.path.exists(filename):
    return set()
  with open(filename, 'r') as source:
    rows = (raw_line.strip().split('\t') for raw_line in source)
    # Consume the generator while the file is still open.
    return set(fields[0] for fields in rows if len(fields) >= 3)


def download_frequency(term):
  """Fetch the two frequency counts reported by the search page for a term.

  The request URL is URL_BEGIN + quoted first word + URL_MIDDLE + quoted
  second word (empty for a one-word term) + URL_END. The returned HTML is
  parsed with BeautifulSoup.

  Args:
    term: string holding one or two whitespace-separated words.

  Returns:
    A tuple of two ints read from the 4th and 5th `span.stat-number`
    elements of the result page (callers in this file name them `df` and
    `tf`), or (0, 0) when a paragraph of the page contains the
    "nothing found" notice matched below.

  Raises:
    ValueError: if `term` is not made of one or two words, or if the page
      has fewer than five `stat-number` spans.
    Any error raised while fetching or parsing propagates with its original
    type and traceback (it is no longer re-wrapped in a bare Exception).
  """
  words = term.split()
  if len(words) == 1:
    second_word = ''
  elif len(words) == 2:
    second_word = urllib2.quote(words[1])
  else:
    raise ValueError('Wrong number of words')
  # Quote the split word rather than the raw term so stray surrounding
  # whitespace never ends up in the query, matching the two-word branch.
  url = URL_BEGIN + urllib2.quote(words[0]) + URL_MIDDLE + second_word + URL_END

  response = urllib2.urlopen(url)
  try:
    html = response.read()
  finally:
    # Release the connection even when reading fails.
    response.close()

  soup = BeautifulSoup(html)
  if any(u'ничего не найдено' in p.text for p in soup.findAll('p')):
    return 0, 0

  spans = soup.findAll('span', {'class': 'stat-number'})
  if len(spans) < 5:
    raise ValueError('Unexpected result page layout for term %r' % (term,))
  # The numbers contain whitespace (presumably thousands separators), so
  # drop it before converting.
  df, tf = [int(''.join(span.text.split())) for span in spans[3:5]]
  return df, tf

def download_frequencies(terms, filename):
  """Download frequencies for every term and append them to `filename`.

  When `filename` does not exist yet it is created and a fixed first line
  is written before any record; otherwise records are appended. Each record
  is `term<TAB>df<TAB>tf` and is also echoed to stdout.

  Args:
    terms: iterable of term strings; processed in sorted order.
    filename: path of the output file.

  Raises:
    Whatever download_frequency() raises; the file is closed first, so the
    records written so far are kept.
  """
  is_new_file = not os.path.exists(filename)
  with open(filename, 'w' if is_new_file else 'a') as output:
    if is_new_file:
      # Fixed two-field first line (presumably corpus-wide totals -- TODO
      # confirm). load_file() skips it because it has fewer than 3 fields.
      output.write('85996\t229968798\n')
    for term in sorted(terms):
      df, tf = download_frequency(term)
      record = '\t'.join([term, str(df), str(tf)])
      print(record)
      output.write(record + '\n')


def download_zero_frequencies(filename, first_request=4001, last_request=5000):
  """Re-download the rows of `filename` whose two counts are both zero.

  Rows are `term<TAB>df<TAB>tf`. Zero rows (second and third fields both
  equal to '0') are numbered from 0 in file order; only those whose number
  lies in [first_request, last_request] are queried again and replaced with
  the fresh result. Every other line is kept verbatim, and the file is
  rewritten in place.

  Args:
    filename: path of the tab-separated frequency file to update.
    first_request: number of the first zero row to re-download (inclusive).
    last_request: number of the last zero row to re-download (inclusive).
      The defaults reproduce the previously hard-coded 4001..5000 window.

  Raises:
    Whatever download_frequency() raises; the rows updated before the
    failure are still written back to the file.
  """
  with open(filename, 'r') as source:
    lines = source.readlines()

  request_number = 0
  try:
    for index, line in enumerate(lines):
      tokens = line.strip().split('\t')
      # Lines with fewer than three fields cannot be zero rows; keep them
      # untouched instead of failing on a missing field.
      if len(tokens) < 3 or tokens[1] != '0' or tokens[2] != '0':
        continue
      if first_request <= request_number <= last_request:
        df, tf = download_frequency(tokens[0])
        print('\t'.join([str(request_number), tokens[0], str(df), str(tf)]))
        lines[index] = '\t'.join([tokens[0], str(df), str(tf)]) + '\n'
      request_number += 1
  finally:
    # Write back even when a download failed so finished work is not lost.
    with open(filename, 'w') as output:
      output.writelines(lines)


if __name__ == '__main__':
  # Script entry point: fetch frequencies for the terms of <input_file> that
  # are not yet present in <output_file>, append them to <output_file>, then
  # run the zero-frequency re-download pass over <output_file>.
  option_parser = optparse.OptionParser(
      usage='Usage: %prog [options] <input_file> <output_file>')
  option_parser.add_option(
      '-p', '--proxy', dest='proxy', type=str, help='Proxy server to use')
  options, args = option_parser.parse_args()
  if len(args) != 2:
    option_parser.error('Incorrect usage')
  input_file, output_file = args

  # Terms already recorded in the output file are not downloaded again.
  known_terms = load_file(output_file)
  pending_terms = load_file(input_file).difference(known_terms)

  if options.proxy is not None:
    # Send every subsequent urllib2 HTTP request through the given proxy.
    proxy_handler = urllib2.ProxyHandler({'http': 'http://' + options.proxy})
    urllib2.install_opener(urllib2.build_opener(proxy_handler))

  download_frequencies(pending_terms, output_file)
  download_zero_frequencies(output_file)