
/news_scraper/scrape.py

https://gitlab.com/mkhouri/news_scraper
Python source file
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse


def parse(url, pageHtml, bodyLines):
    """Extract headline, author, and the first bodyLines paragraphs from an article page."""
    soup = BeautifulSoup(pageHtml, "lxml")
    host = urlparse(url).hostname
    if host == 'www.nydailynews.com':
        headline = soup.find(itemprop="headline").string
        try:
            author = soup.find(rel="author").string.strip()
        except AttributeError:
            author = soup.find(id="a-credits").string.strip()
        rawBody = soup.find_all('p', limit=bodyLines)
        body = ''
        for i in range(bodyLines):
            body += rawBody[i].text.strip() + ' '
    elif host == 'www.nytimes.com':
        headline = soup.find(itemprop="headline").string
        author = soup.find(attrs={"name": "author"})['content'].strip()  # Author is in the tag itself
        rawBody = soup.find_all(attrs={"class": "story-body-text story-content"}, limit=bodyLines)
        body = ''
        for i in range(bodyLines):
            body += rawBody[i].text + ' '
    elif host == 'www.dnainfo.com':
        headline = soup.find(attrs={"class": "social-group"})['data-title'].strip()  # Title is in the tag itself
        author = soup.find(attrs={"class": "name"}).string
        rawBody = soup.find_all('p', limit=bodyLines)
        body = ''
        for i in range(bodyLines):
            body += rawBody[i].text.strip() + ' '
    elif host == 'www.silive.com':
        headline = soup.find(attrs={"name": "title"})['content'].strip()  # Title is in the tag itself
        author = soup.find(attrs={"name": "article_author"})['content'].split('|')[0].strip()  # Author is in the tag itself; drop everything right of '|'
        rawBody = soup.find_all('p', limit=bodyLines + 1)  # Fetch one extra so we can skip the first <p>
        body = ''
        for i in range(bodyLines):
            body += rawBody[i + 1].text.strip() + ' '  # Skip the first <p>
    else:
        raise NameError("The specified 'source' is not valid")
    # Flag any mention of the mayor or of a city department anywhere on the page
    mayoralText = soup.find_all(text=re.compile('de ?blasio', re.IGNORECASE))
    departmentalText = soup.find_all(text=re.compile('department', re.IGNORECASE))
    return {
        'headline': headline,
        'author': author,
        'body': body,
        'mayoralMention': (mayoralText != []),
        'mayoralText': mayoralText,
        'departmentalMention': (departmentalText != []),
        'departmentalText': departmentalText,
    }


def fetch_page(url):
    """Download a page and return its HTML as text."""
    pageHtml = requests.get(url).text
    return pageHtml


def fetch_and_parse(url, bodyLines):
    """Takes a url, and returns a dictionary of data with 'bodyLines' lines"""
    pageHtml = fetch_page(url)
    return parse(url, pageHtml, bodyLines)
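
A minimal usage sketch, assuming the module is importable as news_scraper.scrape; the article URL and bodyLines value below are hypothetical placeholders, not taken from the repository:

    from news_scraper.scrape import fetch_and_parse

    # Hypothetical article URL; any of the four supported hosts would work
    article = fetch_and_parse('https://www.nytimes.com/2017/01/01/nyregion/example.html', bodyLines=3)
    print(article['headline'])
    print(article['author'])
    print(article['mayoralMention'])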