PageRenderTime 72ms CodeModel.GetById 37ms RepoModel.GetById 1ms app.codeStats 0ms

/crimespider/crimespider/spiders/independent.py

https://gitlab.com/harrigan/TPP
Python | 36 lines | 29 code | 5 blank | 2 comment | 2 complexity | 034c402bf7de1a6ab6eda29a9a279953 MD5 | raw file
  1. #!/usr/bin/python3
  2. # -*- coding: utf-8 -*-
  3. import scrapy
  4. import re
  5. from crimespider.items import CrimeItem
  6. from bs4 import BeautifulSoup
  7. class IndependentSpider(scrapy.Spider):
  8. name = "crime"
  9. allowed_domains = ["independent.ie"]
  10. start_urls = [ 'http://www.independent.ie/irish-news/crime/', ]
  11. def parse(self, response):
  12. for a in response.css("article.w111"):
  13. link = a.xpath('a/@href').extract()
  14. url = response.urljoin(link[0])
  15. yield scrapy.Request(url, callback=self.parse_article)
  16. for a in response.css("article.w29"):
  17. link = a.xpath('a/@href').extract()
  18. url = response.urljoin(link[0])
  19. yield scrapy.Request(url, callback=self.parse_article)
  20. def parse_article(self, response):
  21. article = ""
  22. for t in response.css("title"):
  23. article += t.extract()
  24. for c in response.css("div.ctx_content"):
  25. article += c.extract()
  26. s = BeautifulSoup(article, 'lxml')
  27. print( s.get_text() )
  28. print( "\n" )
  29. print( "****************************" )
  30. print( "\n" )
  31. return None