/crimespider/crimespider/spiders/independent.py
Python | 36 lines | 29 code | 5 blank | 2 comment | 2 complexity | 034c402bf7de1a6ab6eda29a9a279953 MD5 | raw file
- #!/usr/bin/python3
- # -*- coding: utf-8 -*-
- import scrapy
- import re
- from crimespider.items import CrimeItem
- from bs4 import BeautifulSoup
class IndependentSpider(scrapy.Spider):
    """Spider that starts at the independent.ie crime listing page.

    ``parse`` follows the link found in each matching ``<article>`` element
    of the listing page, and ``parse_article`` prints the plain text of
    every page it is called back with.
    """

    name = "crime"
    allowed_domains = ["independent.ie"]
    start_urls = ['http://www.independent.ie/irish-news/crime/', ]

    # CSS selectors of the <article> elements whose link is followed.
    # Order matters: requests are yielded selector by selector.
    _listing_selectors = ("article.w111", "article.w29")

    def parse(self, response):
        """Yield one request per linked article on the listing page.

        For every element matched by ``_listing_selectors`` the ``href`` of
        its first direct ``<a>`` child is resolved against the response URL
        and scheduled with ``parse_article`` as the callback.

        Args:
            response: The response for the listing page.

        Yields:
            scrapy.Request: A request for each article link found.
        """
        for selector in self._listing_selectors:
            for teaser in response.css(selector):
                hrefs = teaser.xpath('a/@href').extract()
                if not hrefs:
                    # No direct <a> child: skip this element instead of
                    # raising IndexError and losing the rest of the page.
                    continue
                url = response.urljoin(hrefs[0])
                yield scrapy.Request(url, callback=self.parse_article)

    def parse_article(self, response):
        """Print the text content of an article page to stdout.

        The markup of every ``<title>`` element and every
        ``div.ctx_content`` element is concatenated (titles first), parsed
        with BeautifulSoup, and its text is printed followed by a separator
        line of asterisks.

        Args:
            response: The response for an article page.

        Returns:
            None. Output is written to stdout only.
        """
        fragments = response.css("title").extract()
        fragments += response.css("div.ctx_content").extract()
        soup = BeautifulSoup("".join(fragments), 'lxml')

        print(soup.get_text())
        print("\n")
        print("****************************")
        print("\n")