/crawler/crawler.py
#!/usr/bin/env python3
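"""Simple English Wikipedia crawler.

Starting from the main page, it follows article links breadth-first and
stores each article as tokenized, lowercased plain text in the crawl
directory given on the command line."""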
import multiprocessing as mp
import optparse
import os
import queue
import subprocess
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# HTTPS is required: the wiki redirects plain HTTP, which would break the
# prefix-stripping below that relies on the final request URL.
CONFIG_DOMAIN = 'https://simple.wikipedia.org'
ARTICLE_PATTERN = CONFIG_DOMAIN + '/wiki/'
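# Link titles starting with these prefixes point outside the article
# namespace; titles ending with these suffixes point to media/binary files.
# Both are skipped by the crawler.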
BAD_PREFIXES = ['CAT:', 'WT:', 'Media:', 'File:', 'File_talk:', 'Talk:',
                'User:', 'User_talk:', 'Wikipedia:', 'Wikipedia_talk:',
                'MediaWiki:', 'MediaWiki_talk:', 'Template:', 'Template_talk:',
                'Help:', 'Help_talk:', 'Category:', 'Category_talk:',
                'Module:', 'Module_talk:', 'T:', 'Special:', 'mailto:']
BAD_SUFFIXES = ['.jpg', '.mov', '.jpeg', '.tiff', '.bmp', '.ocr', '.png',
                '.css', '.js', '.tif', '.mid', '.mp2', '.mp3', '.mp4', '.wav',
                '.avi', '.mpeg', '.ram', '.m4v', '.pdf', '.rm',
                '.smil', '.wmv', '.swf', '.wma', '.zip', '.rar', '.gz', '.svg']
USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:16.0) Gecko/20120815 Firefox/16.0'
HTTP_OK = 200
CONTENT_DIVIDER = '#'  # URL fragment marker; the fragment is stripped from links
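# Fetch one page, retrying briefly on connection errors, and return its
# title, visible text and the outgoing article links found in it.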
def parse_web_page(url):
    request = None
    request_headers = {'User-Agent': USER_AGENT}
    # Retry a few times on connection errors instead of looping forever.
    for _ in range(5):
        try:
            request = requests.get(url, headers=request_headers)
            break
        except requests.exceptions.ConnectionError as error:
            print('Error occurred', error)
            time.sleep(1)
    if request is None or request.status_code != HTTP_OK:
        print('Wrong result code on', url)
        return '', '', []
    title = request.url[len(ARTICLE_PATTERN):]
    if (any(title.endswith(suffix) for suffix in BAD_SUFFIXES) or
            any(title.startswith(prefix) for prefix in BAD_PREFIXES)):
        return '', '', []
    # Make the title safe to use as a file name.
    title = title.replace('/', '..').replace(':', '..')
    links = []
    soup = BeautifulSoup(request.text, 'html.parser')
    # nltk.clean_html() was removed from NLTK; BeautifulSoup's get_text()
    # is used here as a rough equivalent for stripping markup.
    content = soup.get_text()
    for link in soup.find_all('a', href=True):
        href = urljoin(CONFIG_DOMAIN, link['href'])
        if not href.startswith(ARTICLE_PATTERN):
            continue
        title_link = href[len(ARTICLE_PATTERN):]
        # Strip the fragment part (everything after '#').
        divider_position = title_link.find(CONTENT_DIVIDER)
        if divider_position != -1:
            title_link = title_link[:divider_position]
            href = href[:len(ARTICLE_PATTERN) + divider_position]
        if (any(title_link.endswith(suffix) for suffix in BAD_SUFFIXES) or
                any(title_link.startswith(prefix) for prefix in BAD_PREFIXES)):
            continue
        # Normalize bare links to the wiki root onto the main page.
        if href in (CONFIG_DOMAIN, CONFIG_DOMAIN + '/',
                    CONFIG_DOMAIN + '/wiki', ARTICLE_PATTERN):
            href = CONFIG_DOMAIN + '/wiki/Main_Page'
        links.append(href)
    return title, content, links
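# Crawl a single article: save its text, then pipe it through the external
# tokenizer.perl and lowercase.perl scripts, which are assumed to be present
# in the working directory (presumably the Moses preprocessing scripts).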
def crawl_url(url, crawl_directory):
    title, content, links = parse_web_page(url)
    if len(content) > 0 and len(title) > 0:
        filename = os.path.join(crawl_directory, title)
        temp_filename = os.path.join(crawl_directory, title + '.tmp')
        with open(filename, 'w', encoding='utf-8') as output:
            output.write(content)
        # Tokenize into a temporary file, then lowercase back into place.
        with open(filename, 'r') as infile, open(temp_filename, 'w') as outfile:
            subprocess.call(['./tokenizer.perl'], stdin=infile, stdout=outfile,
                            stderr=subprocess.DEVNULL)
        with open(temp_filename, 'r') as infile, open(filename, 'w') as outfile:
            subprocess.call(['./lowercase.perl'], stdin=infile, stdout=outfile,
                            stderr=subprocess.DEVNULL)
        os.remove(temp_filename)
    return url, links
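# Breadth-first crawl: each round dispatches every queued URL to the worker
# pool, then enqueues any newly discovered, not-yet-visited article links.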
def crawl_web(crawl_directory, crawlers_number):
    start_url = CONFIG_DOMAIN + '/wiki/Main_Page'
    pool = mp.Pool(processes=crawlers_number)
    url_queue = queue.Queue()
    visited_urls = set()
    url_queue.put(start_url)
    visited_urls.add(start_url)
    while not url_queue.empty():
        found_links = []
        while not url_queue.empty():
            url = url_queue.get_nowait()
            result = pool.apply_async(crawl_url, (url, crawl_directory))
            found_links.append(result)
        for element in found_links:
            url, links = element.get()
            for link in links:
                if link not in visited_urls:
                    url_queue.put(link)
                    visited_urls.add(link)
    pool.close()
    pool.join()
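# Command-line entry point: crawl into <crawl_directory> using the requested
# number of worker processes.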
if __name__ == '__main__':
    parser = optparse.OptionParser(
        usage='Usage: %prog [options] <crawl_directory>')
    parser.add_option('-n', '--crawlers', dest='crawlers', type=int,
                      default=20, help='Number of crawlers')
    options, args = parser.parse_args()
    if len(args) != 1:
        parser.error('Incorrect usage')
    crawl_directory = args[0]
    if not os.path.exists(crawl_directory):
        os.makedirs(crawl_directory)
    crawl_web(crawl_directory, options.crawlers)