/feed/pipelines.py

https://github.com/richshaw2015/oh-my-rss

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
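#
# For reference only: registering these pipelines in ohmyrss/settings.py could
# look roughly like the sketch below; the priority numbers are illustrative
# assumptions, not values taken from this repository.
#
# ITEM_PIPELINES = {
#     'feed.pipelines.ValidPipeline': 100,
#     'feed.pipelines.DomPipeline': 200,
#     'feed.pipelines.InsertDBPipeline': 300,
# }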
import os

from feed.utils import *
from scrapy.exceptions import DropItem
import django
import urllib.parse
from bs4 import BeautifulSoup
import lxml.etree as etree
import re

# to use django models: configure settings and call django.setup() before
# importing anything from web.models
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "ohmyrss.settings")
django.setup()

from web.models import *


class ValidPipeline(object):
    """
    drop items that are incomplete or not allowed to be reposted
    """
    def process_item(self, item, spider):
        if item['title'] and item['content'] and item['url'] and item['name'] and item['req_url']:
            # ',禁止转载' marks content whose author forbids reposting
            if 'github' in item['url'] and ',禁止转载' in item['content']:
                # DropItem is an exception, so raise it instead of returning it
                raise DropItem(f"Data not allowed`{item['title']}")
            else:
                return item
        else:
            raise DropItem(f"Data not valid`{item['title']}")


class DomPipeline(object):
    """
    handle dom structure
    """
    def process_item(self, item, spider):
        content_soup = BeautifulSoup(item['content'], "html.parser")

        # to absolute external href
        for a in content_soup.find_all('a'):
            rel_href = a.attrs.get('href')
            abs_href = urllib.parse.urljoin(item['url'], rel_href)
            a.attrs['href'] = abs_href
            a.attrs['target'] = '_blank'

            # trim empty a tag
            if not a.contents:
                a.decompose()

        # to absolute src, preferring the real image over lazy-load placeholders
        for img in content_soup.find_all('img'):
            if img.attrs.get('file'):
                img.attrs['src'] = img.attrs['file']
            elif img.attrs.get('data-src'):
                img.attrs['src'] = img.attrs['data-src']
            elif img.attrs.get('data-original'):
                img.attrs['src'] = img.attrs['data-original']

            # drop responsive / lazy-load attributes if present so the absolute src is used
            for attr in ('srcset', 'data-srcset', 'data-s', 'data-w', 'data-type', 'data-ratio'):
                img.attrs.pop(attr, None)

            rel_src = img.attrs.get('src')
            abs_src = urllib.parse.urljoin(item['url'], rel_src)
            img.attrs['src'] = abs_src

        # code style: drop inline styles on <pre> blocks
        for pre in content_soup.find_all('pre'):
            pre.attrs.pop('style', None)

        # deny exec js
        for script in content_soup.find_all('script'):
            script.name = 'noscript'

        # for tencent crayon code theme, keep space symbols
        for s in content_soup.find_all('span', class_='crayon-h'):
            s.attrs['style'] = "white-space:pre;"

        # reset span/p font size
        for tag in (content_soup.find_all('span') + content_soup.find_all('p')):
            if tag.attrs.get('style'):
                tag.attrs['style'] = re.sub(r'font-size\s*:\s*[^;]+;', '', tag.attrs['style'])

            # trim empty tag
            if not tag.contents:
                tag.decompose()

        # trim inline styles from configured tags
        if item.get('trim_style_tags'):
            for tag in item['trim_style_tags']:
                for t in content_soup.find_all(tag):
                    if t.attrs.get('style'):
                        t.attrs['style'] = ''

        # trim nodes matched by the configured XPath expressions
        if item.get('trims'):
            content_etree = etree.fromstring(str(content_soup))

            for xpath in item['trims']:
                for node in content_etree.xpath(xpath):
                    node.getparent().remove(node)

            item['content'] = etree.tostring(content_etree, pretty_print=False, encoding="utf-8").decode('utf8')
        else:
            item['content'] = str(content_soup)

        # add custom css
        if item.get('css'):
            item['content'] = f"<style>{item.get('css')}</style>{item['content']}"

        return item
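

# Roughly, a spider could set the optional cleanup fields consumed by DomPipeline
# like this; the selector and css values below are made-up examples, not taken
# from this repository:
#
#   item['trim_style_tags'] = ['p', 'span']         # tags whose inline style is cleared
#   item['trims'] = ['//div[@class="recommend"]']   # XPath of nodes to remove entirely
#   item['css'] = 'img { max-width: 100%; }'        # prepended to the content in a <style> block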


class InsertDBPipeline(object):

    def process_item(self, item, spider):
        from web.utils import write_dat2_file

        site = Site.objects.get(name=item['name'])

        if site.status == 'active':
            try:
                uindex = current_ts()

                article = Article(site=site, title=item['title'], uindex=uindex, src_url=item['url'],
                                  author=item.get('author'))
                article.save()

                # persist the article content via write_dat2_file; only metadata
                # is stored on the Article model above
                write_dat2_file(uindex, site.id, item['content'])

                spider.logger.info(f"Insert to DB:`{item['title']}`{item['url']}`{item['req_url']}")
                mark_crawled_url(item['url'], item['req_url'])
            except django.db.utils.IntegrityError:
                # duplicate article; still mark the url as crawled
                mark_crawled_url(item['url'], item['req_url'])