independent.py | searchcode

/crimespider/crimespider/spiders/independent.py

https://gitlab.com/harrigan/TPP · Python · 36 lines · 29 code · 5 blank · 2 comment · 4 complexity · 034c402bf7de1a6ab6eda29a9a279953 MD5 · raw file


#!/usr/bin/python3
# -*- coding: utf-8 -*-
import scrapy
import re
from crimespider.items import CrimeItem
from bs4 import BeautifulSoup


class IndependentSpider(scrapy.Spider):
    """Crawl the independent.ie crime listing and print each article's text.

    ``parse`` handles the listing page in ``start_urls`` and schedules one
    request per teaser link; ``parse_article`` handles each of those
    responses and prints the page's plain text to stdout.
    """

    name = "crime"
    allowed_domains = ["independent.ie"]
    start_urls = [ 'http://www.independent.ie/irish-news/crime/', ]

    # CSS selectors for the <article> teasers on the listing page, in the
    # order they are crawled (all matches of the first, then the second).
    # NOTE(review): the w111 / w29 class names presumably distinguish teaser
    # layouts on the site -- confirm against the live markup.
    _TEASER_SELECTORS = ("article.w111", "article.w29")

    def parse(self, response):
        """Yield a request for every teaser link found on the listing page.

        Args:
            response: The Scrapy response for a listing page.

        Yields:
            scrapy.Request: One request per teaser that has a direct ``<a>``
            child with an ``href``, with ``parse_article`` as its callback.
        """
        for selector in self._TEASER_SELECTORS:
            for teaser in response.css(selector):
                hrefs = teaser.xpath('a/@href').extract()
                if not hrefs:
                    # A teaser without a direct link would otherwise raise
                    # IndexError and abort the rest of this page's crawl.
                    continue
                # Only the first href is followed; urljoin resolves relative
                # links against the listing page's URL.
                url = response.urljoin(hrefs[0])
                yield scrapy.Request(url, callback=self.parse_article)

    def parse_article(self, response):
        """Print the text of an article page, followed by a separator.

        The markup of every ``<title>`` element and every ``div.ctx_content``
        element is concatenated (titles first), parsed with BeautifulSoup's
        ``lxml`` parser, and the resulting plain text is printed to stdout.

        Args:
            response: The Scrapy response for an article page.

        Returns:
            None. Nothing is passed on to the item pipeline.
        """
        # NOTE(review): div.ctx_content is presumably the article body
        # container -- confirm against the live markup.
        fragments = [title.extract() for title in response.css("title")]
        fragments.extend(
            content.extract() for content in response.css("div.ctx_content")
        )
        soup = BeautifulSoup("".join(fragments), 'lxml')
        print( soup.get_text() )
        print( "\n" )
        print( "****************************" )
        print( "\n" )
        return None

Tech Fingerprint

Alerts (6)

'def' Ensure functions have docstrings for documentation
15 25
'print(' Use logging module for better control and configurability
32 33 34 35