crawler.py | searchcode

/crawler/crawler.py

https://bitbucket.org/Meister17/wiki-posting-list · Python · 120 lines · 109 code · 10 blank · 1 comment · 31 complexity · c9b3d597b25238290958e47ae9321dd9 MD5 · raw file

#!/usr/bin/python
import Queue
import multiprocessing as mp
import optparse
import os
import requests
import BeautifulSoup as BS
import nltk
import subprocess
from urlparse import urljoin


# Root of the wiki being crawled; relative links are resolved against it.
CONFIG_DOMAIN = 'http://simple.wikipedia.org'
# Every article URL starts with this prefix; the rest of the URL is the title.
ARTICLE_PATTERN = CONFIG_DOMAIN + '/wiki/'
# Titles starting with one of these prefixes are skipped by the crawler.
BAD_PREFIXES = ['CAT:', 'WT:', 'Media:', 'File:', 'File_talk:', 'Talk:',
                'User:', 'User_talk:', 'Wikipedia:', 'Wikipedia_talk:',
                'MediaWiki:', 'MediaWiki_talk:', 'Template:', 'Template_talk:',
                'Help:', 'Help_talk:', 'Category:', 'Category_talk:',
                'Module:', 'Module_talk:', 'T:', 'Special:', 'mailto:']
# Titles ending with one of these suffixes are skipped by the crawler.
BAD_SUFFIXES = ['.jpg', '.mov', '.jpeg', '.tiff', '.bmp', '.ocr', '.png',
                '.css', '.js', '.tif', '.mid', '.mp2', '.mp3', '.mp4', '.wav',
                '.avi', '.mpeg', '.ram', '.m4v', '.pdf', '.rm',
                '.smil', '.wmv', '.swf', '.wma', '.zip', '.rar', '.gz', '.svg']
# Value sent in the User-Agent header of every request.
USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:16.0) Gecko/20120815 Firefox/16.0'
# The only HTTP status code for which a page is processed.
HTTP_OK = 200
# Separates a URL from its in-page fragment; the fragment is dropped.
CONTENT_DIVIDER = '#'


def parse_web_page(url):
    success = False
    request_headers = {'User-Agent': USER_AGENT}
    while not success:
        try:
            request = requests.get(url, headers=request_headers)
            success = True
        except requests.exceptions.ConnectionError, error:
            print 'Error occurred', error
    if request.status_code != HTTP_OK:
        print 'Wrong result code on', url
        return '', '', []
    title = request.url[len(ARTICLE_PATTERN):]
    if (sum(title.endswith(suffix) for suffix in BAD_SUFFIXES) or
            sum(title.startswith(prefix) for prefix in BAD_PREFIXES)):
        return '', '', []
    title = title.replace('/', '..').replace(':', '..')
    links = []
    soup = BS.BeautifulSoup(request.text)
    content = nltk.clean_html(request.text.encode('utf8'))
    for link in soup.findAll('a', href=True):
        href = link['href']
        href = urljoin(CONFIG_DOMAIN, href)
        if not href.startswith(ARTICLE_PATTERN):
            continue
        title_link = href[len(ARTICLE_PATTERN):]
        divider_position = title_link.find(CONTENT_DIVIDER)
        if divider_position != -1:
            title_link = title_link[0:divider_position]
            href = href[0:len(ARTICLE_PATTERN) + divider_position]
        if (sum(title_link.endswith(suffix) for suffix in BAD_SUFFIXES) or
                sum(title_link.startswith(prefix) for prefix in BAD_PREFIXES)):
            continue
        if (href == CONFIG_DOMAIN or href == CONFIG_DOMAIN + '/' or
                href == CONFIG_DOMAIN + '/wiki' or href == ARTICLE_PATTERN):
            href = CONFIG_DOMAIN + '/wiki/Main_Page'
        links.append(href)
    return title, content, links


def _run_filter(script, source_path, target_path, error_sink):
    """Run ``script`` with ``source_path`` as stdin and ``target_path`` as stdout.

    ``error_sink`` is an open file object receiving the script's stderr.
    The exit status of the script is ignored.
    """
    with open(source_path, 'r') as source:
        with open(target_path, 'w') as target:
            subprocess.call([script], stdin=source, stdout=target,
                            stderr=error_sink)


def crawl_url(url, crawl_directory):
    """Fetch ``url`` and store its processed text in ``crawl_directory``.

    When the page has both a title and content, the content is written to
    ``<crawl_directory>/<title>`` and then piped through ./tokenizer.perl
    and ./lowercase.perl (resolved against the current working directory),
    going through a temporary ``<title>.tmp`` file that is removed
    afterwards.

    Returns ``(url, links)``, where ``links`` are the article URLs that
    parse_web_page found on the page.
    """
    title, content, links = parse_web_page(url)
    if content and title:
        filename = os.path.join(crawl_directory, title)
        temp_filename = os.path.join(crawl_directory, title + '.tmp')
        with open(filename, 'w') as output:
            output.write(content)
        # One null-device handle serves both scripts and is closed on exit.
        with open(os.devnull, 'w') as devnull:
            _run_filter('./tokenizer.perl', filename, temp_filename, devnull)
            _run_filter('./lowercase.perl', temp_filename, filename, devnull)
        os.remove(temp_filename)
    return url, links


def crawl_web(crawl_directory, crawlers_number):
    """Crawl the wiki breadth-first, starting from its main page.

    Pages are processed in waves: every URL of the current wave is handed
    to a pool of ``crawlers_number`` worker processes running crawl_url,
    and the not yet visited links they report form the next wave. The
    crawl stops when a wave discovers no new URL.

    ``crawl_directory`` is passed to crawl_url as the output directory.
    An exception raised by a worker propagates to the caller; the pool is
    shut down in every case.
    """
    start_url = CONFIG_DOMAIN + '/wiki/Main_Page'
    pool = mp.Pool(processes=crawlers_number)
    finished = False
    try:
        frontier = [start_url]
        visited_urls = set(frontier)
        while frontier:
            pending = [pool.apply_async(crawl_url, (page_url, crawl_directory))
                       for page_url in frontier]
            frontier = []
            for result in pending:
                _, links = result.get()
                for link in links:
                    if link not in visited_urls:
                        visited_urls.add(link)
                        frontier.append(link)
        finished = True
    finally:
        # Stop outstanding work when a failure interrupts the crawl.
        if finished:
            pool.close()
        else:
            pool.terminate()
        pool.join()


if __name__ == '__main__':
    # Command line: [-n CRAWLERS] <crawl_directory>
    parser = optparse.OptionParser(
        usage='Usage: %prog [options] <crawl_directory>')
    parser.add_option('-n', '--crawlers', dest='crawlers', type=int,
                      default=20, help='Number of crawlers')
    options, args = parser.parse_args()
    if len(args) != 1:
        parser.error('Incorrect usage')
    # Reject a worker count the process pool cannot be created with.
    if options.crawlers < 1:
        parser.error('Number of crawlers must be at least 1')
    crawl_directory = args[0]
    # Create the output directory on first use.
    if not os.path.exists(crawl_directory):
        os.makedirs(crawl_directory)
    crawl_web(crawl_directory, options.crawlers)
Tech Fingerprint

Alerts (9)

'def' Ensure functions have docstrings for documentation
29 69 88
Complexity hotspot; lines 42 to 43 (total complexity: 3)
42 43
Complexity hotspot; lines 59 to 60 (total complexity: 3)
59 60
Complexity hotspot; lines 62 to 63 (total complexity: 3)
62 63