/scripts/download_russian_contrast.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
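"""Download term frequencies from the Russian National Corpus (ruscorpora.ru).

For every term in <input_file>, query the corpus search page and append a
tab-separated line "term<TAB>document_frequency<TAB>term_frequency" to
<output_file>. Terms already present in the output file are skipped, so an
interrupted run can simply be restarted.

Usage: download_russian_contrast.py [-p PROXY] <input_file> <output_file>
"""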
from BeautifulSoup import BeautifulSoup
import optparse
import os
from time import sleep
import urllib2
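
# Pieces of the lexico-grammatical search URL at search.ruscorpora.ru; the
# URL-quoted term (lex1) and, for two-word terms, the second word (lex2)
# are spliced in between these fragments.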
URL_BEGIN = ('http://search.ruscorpora.ru/search.xml?env=alpha&mycorp=&mysent=' +
             '&mysize=&mysentsize=&mydocsize=&spd=&text=lexgramm&mode=main&sort' +
             '=gr_tagging&lang=ru&nodia=1&parent1=0&level1=0&lex1=')
URL_MIDDLE = ('&gramm1=&sem1=&sem-mod1=sem&sem-mod1=sem2&flags1=&m1=&parent2=0&' +
              'level2=0&min2=1&max2=1&lex2=')
URL_END = '&gramm2=&sem2=&sem-mod2=sem&sem-mod2=sem2&flags2=&m2='


def load_file(filename):
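    """Return the set of terms (first column) found in a tab-separated
    frequency file with at least three columns, or an empty set if the
    file does not exist."""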
    terms = set()
    if os.path.exists(filename):
        with open(filename, 'r') as infile:
            for line in infile:
                tokens = line.strip().split('\t')
                if len(tokens) > 2:
                    terms.add(tokens[0])
    return terms


def download_frequency(term):
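    """Query the corpus for a one- or two-word term and return a
    (document_frequency, term_frequency) tuple, or (0, 0) if the term
    does not occur in the corpus."""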
    words = term.split()
    if len(words) == 1:
        response = urllib2.urlopen(URL_BEGIN + urllib2.quote(term) +
                                   URL_MIDDLE + URL_END)
    elif len(words) == 2:
        response = urllib2.urlopen(URL_BEGIN + urllib2.quote(words[0]) +
                                   URL_MIDDLE + urllib2.quote(words[1]) +
                                   URL_END)
    else:
        raise ValueError('Term must contain one or two words: %r' % term)
    html = response.read()
    soup = BeautifulSoup(html)
    # The result page shows 'ничего не найдено' ("nothing found") when the
    # term does not occur in the corpus.
    for p in soup.findAll('p'):
        if u'ничего не найдено' in p.text:
            return 0, 0
    # The fourth and fifth 'stat-number' spans hold the document and
    # occurrence counts; their digits are grouped with spaces, so join
    # the pieces before converting.
    spans = soup.findAll('span', {'class': 'stat-number'})
    return int(''.join(spans[3].text.split())), int(''.join(spans[4].text.split()))


def download_frequencies(terms, filename):
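    """Download frequencies for all terms and append them to filename.

    A freshly created file starts with a header line holding the corpus
    totals (document count and token count), so downstream code can
    compute relative frequencies.
    """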
    output = None
    try:
        if not os.path.exists(filename):
            output = open(filename, 'w')
            # Header line: corpus-wide totals (documents, tokens).
            output.write('85996\t229968798\n')
        else:
            output = open(filename, 'a')
        for term in sorted(terms):
            df, tf = download_frequency(term)
            print term + '\t' + str(df) + '\t' + str(tf)
            output.write(term + '\t' + str(df) + '\t' + str(tf) + '\n')
            sleep(1)  # be polite to the corpus server between requests
    finally:
        if output is not None:
            output.close()


def download_zero_frequencies(filename):
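    """Re-query terms whose recorded frequencies are both zero.

    Only one hard-coded batch of the zero entries is retried per run;
    the file is then rewritten in place with the updated counts.
    """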
    lines = []
    with open(filename, 'r') as infile:
        request_number = 0
        for line in infile:
            tokens = line.strip().split('\t')
            if tokens[1] == '0' and tokens[2] == '0':
                # Hard-coded batch window: only the 4001st to 5000th zero
                # entries are retried on this run; adjust as needed.
                if 4000 < request_number <= 5000:
                    df, tf = download_frequency(tokens[0])
                    print str(request_number) + '\t' + tokens[0] + '\t' + str(df) + '\t' + str(tf)
                    lines.append(tokens[0] + '\t' + str(df) + '\t' + str(tf) + '\n')
                else:
                    lines.append(line)
                request_number += 1
            else:
                lines.append(line)
    with open(filename, 'w') as output:
        output.writelines(lines)


if __name__ == '__main__':
    parser = optparse.OptionParser(usage='Usage: %prog [options] <input_file> <output_file>')
    parser.add_option('-p', '--proxy', dest='proxy', type=str,
                      help='Proxy server to use')
    options, args = parser.parse_args()
    if len(args) != 2:
        parser.error('Incorrect usage')
    input_file = args[0]
    output_file = args[1]
    # Skip terms whose frequencies were already downloaded on a previous run.
    terms = load_file(output_file)
    new_terms = load_file(input_file)
    new_terms -= terms
    if options.proxy is not None:
        proxy_support = urllib2.ProxyHandler({'http': 'http://' + options.proxy})
        opener = urllib2.build_opener(proxy_support)
        urllib2.install_opener(opener)
    download_frequencies(new_terms, output_file)
    download_zero_frequencies(output_file)