VALLA.py - The contest volume home page Determine the volum…

/website/judge/problems/tasks/crawlers/VALLA.py

https://bitbucket.org/zhangjiejun/sjtuonlinejudge · Python · 75 lines · 59 code · 12 blank · 4 comment · 8 complexity · 13d4e3742b4686733b40b76010507101 MD5 · raw file


import re
import logging
import mechanize
import urlparse
import urllib2
from bs4 import BeautifulSoup
from django.utils.encoding import smart_unicode

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

def crawl(problem_id):
    """Fetch one problem statement from icpcarchive.ecs.baylor.edu.

    Three pages are visited in order: the contest volume index, the page of
    the volume containing the problem (to resolve the site's internal
    problem id and the title), and the static statement page.  Stylesheets
    referenced by the statement are inlined as ``<style>`` tags and image
    URLs are rewritten to absolute URLs.

    Args:
        problem_id: Problem number; anything accepted by ``int()``.
            ``problem_id // 100`` selects the volume and
            ``problem_id % 100`` the slot inside that volume.

    Returns:
        A dict with the keys ``title``, ``text`` (``str(soup)`` of the
        processed statement), ``source`` (always ``'UVA'``) and ``real_id``
        (the ``problem=`` value taken from the volume page link).

    Raises:
        ValueError: if ``problem_id`` is not a valid integer literal.
        IndexError: if the problem number maps to a volume or a slot that
            is not present on the scraped pages.

    Errors raised while opening the index, volume or statement page are not
    caught here.  Stylesheet download failures are logged and skipped.
    """
    result = {'title': '', 'text': '', 'source': 'UVA'}

    # The scraped volume list is indexed relative to volume 20, i.e. list
    # index 0 corresponds to problem numbers 2000-2099.
    first_volume = 20

    problem_id = int(problem_id)
    # Floor division keeps the volume an integer regardless of the
    # division semantics in effect.
    volume = problem_id // 100
    volume_id = problem_id % 100
    volume_index = volume - first_volume
    if volume_index < 0:
        # A negative index would silently wrap around and select a volume
        # from the end of the list, returning an unrelated problem.
        raise IndexError(
            'problem id {0} is below the first volume ({1})'.format(
                problem_id, first_volume))

    # The contest volume home page
    br = mechanize.Browser()
    logger.info("OPENING CONTEST VOLUME HOME PAGE")
    html = br.open('https://icpcarchive.ecs.baylor.edu/index.php?option=com_onlinejudge&Itemid=8&category=1').read()

    # Determine the volume
    logger.info("GOTO THE VOLUME")
    volume_link = re.compile(
        r'<td><a href="(?P<url>[^"]*)">Volume[\s\d]+\(\d+-\d+\)</a></td>')
    volume_urls = volume_link.findall(html)
    # The href is HTML-escaped in the page source; undo the ampersand entity.
    target_url = volume_urls[volume_index].replace('&amp;', '&')
    target_url = urlparse.urljoin(br.geturl(), target_url)
    html = br.open(target_url).read()

    # Now we are inside the volume, determine the problem
    problem_link = re.compile(
        r'<td><a href="(?P<url>index.php\?option=com_onlinejudge&amp;Itemid=8'
        r'&amp;category=\d+&amp;page=show_problem&amp;problem=(?P<real_id>\d+))">'
        r'\d*&nbsp;-&nbsp;(?P<title>[^<]+)')
    problems = problem_link.findall(html)
    # findall() yields one (url, real_id, title) tuple per listed problem.
    _url, real_id, title = problems[volume_id]
    result['title'] = title.replace('&nbsp;', ' ')

    # Got the problem, read the problem statement
    logger.info('GOT THE PROBLEM STATEMENT')
    target_url = 'https://icpcarchive.ecs.baylor.edu/external/{0}/{1}.html'.format(volume, problem_id)
    response = br.open(target_url)

    # Pick the charset from the Content-type header, tolerating a missing
    # header, optional quotes and trailing parameters; default to UTF-8.
    content_type = response.info().get('Content-type') or ''
    charset = re.search(r'charset=["\']?(?P<encoding>[^\s;"\']+)',
                        content_type, re.I)
    encoding = charset.group('encoding') if charset else 'utf-8'
    raw = response.read()
    try:
        html = smart_unicode(raw, encoding=encoding, errors='replace')
    except LookupError:
        # The server advertised a codec Python does not know.
        logger.warning('Unknown charset %r, falling back to utf-8', encoding)
        html = smart_unicode(raw, encoding='utf-8', errors='replace')
    html = re.compile('<br>', re.I).sub('<br/>', html)

    soup = BeautifulSoup(html)
    # mechanize.urlopen() below does not go through ``br``, so the base URL
    # for relative links stays the statement page throughout.
    page_url = br.geturl()

    # Replace every <link> with nothing; stylesheets are inlined first.
    for link in soup.find_all('link'):
        rel = link.get('rel') or []
        href = link.get('href')
        if 'stylesheet' in rel and href:
            url = urlparse.urljoin(page_url, href)
            logger.info('Inlining stylesheet: %s', url)
            try:
                css = mechanize.urlopen(url).read()
            except urllib2.URLError as err:
                # Best effort: HTTPError is a URLError subclass, so both
                # HTTP and connection-level failures only lose the styling.
                logger.error('Failed to inline stylesheet %s: %s', url, err)
            else:
                style_tag = soup.new_tag('style', type='text/css')
                style_tag.string = css
                container = soup.head if soup.head is not None else soup
                container.append(style_tag)
        link.extract()

    # Make image sources absolute so they resolve outside the original site.
    for img_tag in soup.find_all('img'):
        src = img_tag.get('src')
        if src is None:
            continue
        img_tag['src'] = urlparse.urljoin(page_url, src)

    result['text'] = str(soup)
    result['real_id'] = real_id
    return result

if __name__ == '__main__':
    # Manual smoke run: crawl problem 4000 and dump the resulting dict.
    print(crawl('4000'))

Tech Fingerprint

Alerts (5)

'def' Ensure functions have docstrings for documentation
11
Complexity hotspot; lines 52 to 54 (total complexity: 3)
52 53 54
'open(' Use 'with open()' to ensure files are properly closed
57