/website/judge/problems/tasks/crawlers/VALLA.py
https://bitbucket.org/zhangjiejun/sjtuonlinejudge · Python
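"""Crawler for ICPC Live Archive problem statements
(https://icpcarchive.ecs.baylor.edu), used by the judge's problem importer.

crawl(problem_id) walks from the contest-volume index to the problem's volume
page, recovers the judge-internal "real" id and the title, fetches the raw
statement HTML, inlines its stylesheets and rewrites image URLs to absolute
so the page renders standalone, and returns a dict with the keys 'title',
'text', 'real_id' and 'source'.
"""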
import re
import logging
import urlparse
import urllib2

import mechanize
from bs4 import BeautifulSoup
from django.utils.encoding import smart_unicode

logger = logging.getLogger(__name__)
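
# Note: this module targets Python 2 -- it relies on urlparse/urllib2,
# mechanize, and the print statement in the __main__ block below.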


def crawl(problem_id):
    result = {'title': '', 'text': '', 'source': 'UVA'}

    # The contest volume home page
    problem_id = int(problem_id)
    br = mechanize.Browser()
    logger.info('OPENING CONTEST VOLUME HOME PAGE')
    html = br.open('https://icpcarchive.ecs.baylor.edu/index.php?option=com_onlinejudge&Itemid=8&category=1').read()
    # Determine the volume: problems are numbered VVPP, where VV is the
    # volume and PP the problem's index within it; the archive's volumes
    # start at 20, so e.g. problem 4000 is entry 0 of volume 40, the 21st
    # volume listed on the index page.
    logger.info('GOTO THE VOLUME')
    volume = problem_id / 100      # Python 2 integer division
    volume_id = problem_id % 100
    r = re.compile(ur'<td><a href="(?P<url>[^"]*)">Volume[\s\d]+\(\d+-\d+\)</a></td>')
    target_url = r.findall(html)[volume - 20].replace('amp;', '')  # turn &amp; back into &
    target_url = urlparse.urljoin(br.geturl(), target_url)
    response = br.open(target_url)
    html = response.read()
    # Now we are inside the volume; find the problem's row to recover the
    # judge-internal id and the title
    r = re.compile(ur'<td><a href="(?P<url>index.php\?option=com_onlinejudge&Itemid=8&category=\d+&page=show_problem&problem=(?P<real_id>\d+))">\d* - (?P<title>[^<]+)')
    problems = r.findall(html)
    real_id = problems[volume_id][1]
    result['title'] = problems[volume_id][2].replace(u'\xa0', ' ')  # collapse non-breaking spaces
    # Got the problem; fetch the raw statement HTML
    logger.info('READING THE PROBLEM STATEMENT')
    target_url = 'https://icpcarchive.ecs.baylor.edu/external/{0}/{1}.html'.format(volume, problem_id)
    response = br.open(target_url)

    # Decode the body with the charset from the Content-Type header,
    # falling back to UTF-8
    header = response.info()
    r = re.compile(r'charset=(?P<encoding>\S+)')
    m = r.search(header['Content-type'])
    encoding = m.group('encoding') if m else 'utf-8'
    html = smart_unicode(response.read(), encoding=encoding, errors='replace')

    # Normalize bare <br> tags to the self-closing form before parsing
    r = re.compile('<br>', re.I)
    html = r.sub('<br/>', html)
    soup = BeautifulSoup(html, 'html.parser')

    # Inline external stylesheets so the statement renders standalone
    for link in soup.find_all('link'):
        if 'stylesheet' not in link.get('rel', []):
            continue
        try:
            url = urlparse.urljoin(br.geturl(), link['href'])
            logger.info('Inlining stylesheet: {}'.format(url))
            new_tag = soup.new_tag('style', type='text/css')
            new_tag.string = mechanize.urlopen(url).read()
            soup.head.append(new_tag)
        except urllib2.HTTPError:
            logger.error('Failed to inline stylesheet: HTTP Error')
        link.extract()  # drop the <link> whether or not inlining succeeded
    # Rewrite image URLs to absolute so they resolve outside the original site
    for img_tag in soup.find_all('img'):
        img_tag['src'] = urlparse.urljoin(br.geturl(), img_tag['src'])

    result['text'] = str(soup)
    result['real_id'] = real_id
    return result


if __name__ == '__main__':
    print crawl('4000')
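
# A quick standalone check (hypothetical usage; the judge's real importer
# lives elsewhere in the project): dump the inlined statement to disk for
# visual inspection.
#
#     result = crawl('4000')
#     with open('statement.html', 'w') as f:
#         f.write(result['text'])
#     print result['title'], result['real_id']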