/website/judge/problems/tasks/crawlers/VALLA.py

https://bitbucket.org/zhangjiejun/sjtuonlinejudge · Python · 75 lines · 59 code · 12 blank · 4 comment · 8 complexity · 13d4e3742b4686733b40b76010507101 MD5 · raw file

  1. import re
  2. import logging
  3. import mechanize
  4. import urlparse
  5. import urllib2
  6. from bs4 import BeautifulSoup
  7. from django.utils.encoding import smart_unicode
  8. logger = logging.getLogger(__name__)
  9. def crawl(problem_id):
  10. result = {'title': '', 'text': '', 'source': 'UVA'}
  11. # The contest volume home page
  12. problem_id = int(problem_id)
  13. br = mechanize.Browser()
  14. logger.info("OPENING CONTEST VOLUME HOME PAGE")
  15. html = br.open('https://icpcarchive.ecs.baylor.edu/index.php?option=com_onlinejudge&Itemid=8&category=1').read()
  16. # Determine the volume
  17. logger.info("GOTO THE VOLUME")
  18. volume = problem_id / 100
  19. volume_id = problem_id % 100
  20. r = re.compile(ur'<td><a href="(?P<url>[^"]*)">Volume[\s\d]+\(\d+-\d+\)</a></td>')
  21. target_url = r.findall(html)[volume - 20].replace('amp;','')
  22. target_url = urlparse.urljoin(br.geturl(), target_url)
  23. response = br.open(target_url);
  24. html = response.read();
  25. # Now we are inside the volume, determine the problem
  26. r = re.compile(ur'<td><a href="(?P<url>index.php\?option=com_onlinejudge&amp;Itemid=8&amp;category=\d+&amp;page=show_problem&amp;problem=(?P<real_id>\d+))">\d*&nbsp;-&nbsp;(?P<title>[^<]+)')
  27. problems = r.findall(html)
  28. real_id = problems[volume_id][1]
  29. result['title'] = problems[volume_id][2].replace('&nbsp;', ' ');
  30. # Got the problem, read the problem statement
  31. logger.info('GOT THE PROBLEM STATEMENT')
  32. target_url = 'https://icpcarchive.ecs.baylor.edu/external/{0}/{1}.html'.format(volume,problem_id)
  33. response = br.open(target_url)
  34. header = response.info()
  35. r = re.compile('charset=(?P<encoding>\S+)')
  36. m = r.search(header['Content-type'])
  37. encoding = m.group('encoding') if m else 'utf-8'
  38. html = smart_unicode(response.read(), encoding=encoding, errors='replace')
  39. r = re.compile('<br>',re.I)
  40. html = re.sub(r,'<br/>',html)
  41. soup = BeautifulSoup(html)
  42. link_tags = soup.find_all('link')
  43. for link in link_tags:
  44. if 'stylesheet' in link['rel']:
  45. try:
  46. url = urlparse.urljoin(br.geturl(), link['href'])
  47. logger.info('Inlining stylesheet: {}'.format(url))
  48. r = mechanize.urlopen(url)
  49. new_tag = soup.new_tag('style', type='text/css')
  50. new_tag.string = r.read()
  51. soup.head.append(new_tag)
  52. except urllib2.HTTPError:
  53. logger.error('Failed to inline stylesheet: HTTP Error')
  54. link.extract()
  55. img_tags = soup.find_all('img')
  56. for img_tag in img_tags:
  57. src = urlparse.urljoin(br.geturl(), img_tag['src'])
  58. img_tag['src'] = src
  59. result['text'] = str(soup)
  60. result['real_id'] = real_id
  61. return result
  62. if __name__ == '__main__':
  63. print crawl('4000')