PageRenderTime 52ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/crawler/crawler.py

https://bitbucket.org/Meister17/wiki-posting-list
Python | 120 lines | 115 code | 4 blank | 1 comment | 4 complexity | c9b3d597b25238290958e47ae9321dd9 MD5 | raw file
  1. #!/usr/bin/python
  2. import Queue
  3. import multiprocessing as mp
  4. import optparse
  5. import os
  6. import requests
  7. import BeautifulSoup as BS
  8. import nltk
  9. import subprocess
  10. from urlparse import urljoin
  11. CONFIG_DOMAIN = 'http://simple.wikipedia.org'
  12. ARTICLE_PATTERN = CONFIG_DOMAIN + '/wiki/'
  13. BAD_PREFIXES = ['CAT:', 'WT:', 'Media:', 'File:', 'File_talk:', 'Talk:',
  14. 'User:', 'User_talk:', 'Wikipedia:', 'Wikipedia_talk:',
  15. 'MediaWiki:', 'MediaWiki_talk:', 'Template:', 'Template_talk:',
  16. 'Help:', 'Help_talk:', 'Category:', 'Category_talk:',
  17. 'Module:', 'Module_talk:', 'T:', 'Special:', 'mailto:']
  18. BAD_SUFFIXES = ['.jpg', '.mov', '.jpeg', '.tiff', '.bmp', '.ocr', '.png',
  19. '.css', '.js', '.tif', '.mid', '.mp2', '.mp3', '.mp4', '.wav',
  20. '.avi', '.mov', '.mpeg', '.ram', '.m4v', '.pdf', '.rm',
  21. '.smil', '.wmv', '.swf', '.wma', '.zip', '.rar', '.gz', '.svg']
  22. USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:16.0) Gecko/20120815 Firefox/16.0'
  23. HTTP_OK = 200
  24. CONTENT_DIVIDER = '#'
  25. def parse_web_page(url):
  26. success = False
  27. request_headers = {'User-Agent': USER_AGENT}
  28. while not success:
  29. try:
  30. request = requests.get(url, headers=request_headers)
  31. success = True
  32. except requests.exceptions.ConnectionError, error:
  33. print 'Error occurred', error
  34. if request.status_code != HTTP_OK:
  35. print 'Wrong result code on', url
  36. return '', '', []
  37. title = request.url[len(ARTICLE_PATTERN):]
  38. if (sum(title.endswith(suffix) for suffix in BAD_SUFFIXES) or
  39. sum(title.startswith(prefix) for prefix in BAD_PREFIXES)):
  40. return '', '', []
  41. title = title.replace('/', '..').replace(':', '..')
  42. links = []
  43. soup = BS.BeautifulSoup(request.text)
  44. content = nltk.clean_html(request.text.encode('utf8'))
  45. for link in soup.findAll('a', href=True):
  46. href = link['href']
  47. href = urljoin(CONFIG_DOMAIN, href)
  48. if not href.startswith(ARTICLE_PATTERN):
  49. continue
  50. title_link = href[len(ARTICLE_PATTERN):]
  51. divider_position = title_link.find(CONTENT_DIVIDER)
  52. if divider_position != -1:
  53. title_link = title_link[0:divider_position]
  54. href = href[0:len(ARTICLE_PATTERN) + divider_position]
  55. if (sum(title_link.endswith(suffix) for suffix in BAD_SUFFIXES) or
  56. sum(title_link.startswith(prefix) for prefix in BAD_PREFIXES)):
  57. continue
  58. if (href == CONFIG_DOMAIN or href == CONFIG_DOMAIN + '/' or
  59. href == CONFIG_DOMAIN + '/wiki' or href == ARTICLE_PATTERN):
  60. href = CONFIG_DOMAIN + '/wiki/Main_Page'
  61. links.append(href)
  62. return title, content, links
  63. def crawl_url(url, crawl_directory):
  64. title, content, links = parse_web_page(url)
  65. if len(content) > 0 and len(title) > 0:
  66. filename = os.path.join(crawl_directory, title)
  67. temp_filename = os.path.join(crawl_directory, title + '.tmp')
  68. with open(filename, 'w') as output:
  69. output.write(content)
  70. with open(filename, 'r') as input:
  71. with open(temp_filename, 'w') as output:
  72. args = ['./tokenizer.perl']
  73. subprocess.call(args, stdin=input, stdout=output, stderr=open('/dev/null', 'w'))
  74. with open(temp_filename, 'r') as input:
  75. with open(filename, 'w') as output:
  76. args = ['./lowercase.perl']
  77. subprocess.call(args, stdin=input, stdout=output, stderr=open('/dev/null', 'w'))
  78. os.remove(temp_filename)
  79. return url, links
  80. def crawl_web(crawl_directory, crawlers_number):
  81. start_url = CONFIG_DOMAIN + '/wiki/Main_Page'
  82. pool = mp.Pool(processes=crawlers_number)
  83. url_queue = Queue.Queue()
  84. visited_urls = set()
  85. url_queue.put(start_url)
  86. visited_urls.add(start_url)
  87. while not url_queue.empty():
  88. found_links = []
  89. while not url_queue.empty():
  90. start_url = url_queue.get_nowait()
  91. result = pool.apply_async(crawl_url, (start_url, crawl_directory))
  92. found_links.append(result)
  93. for element in found_links:
  94. (url, links) = element.get()
  95. for link in links:
  96. if link not in visited_urls:
  97. url_queue.put(link)
  98. visited_urls.add(link)
  99. if __name__ == '__main__':
  100. parser = optparse.OptionParser(
  101. usage='Usage: %prog [options] <crawl_directory>')
  102. parser.add_option('-n', '--crawlers', dest='crawlers', type=int,
  103. default=20, help='Number of crawlers')
  104. options, args = parser.parse_args()
  105. if len(args) != 1:
  106. parser.error('Incorrect usage')
  107. crawl_directory = args[0]
  108. if not os.path.exists(crawl_directory):
  109. os.makedirs(crawl_directory)
  110. crawl_web(crawl_directory, options.crawlers)