/feed/pipelines.py
https://github.com/richshaw2015/oh-my-rss
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
import re
import urllib.parse

import django
import lxml.etree as etree
from bs4 import BeautifulSoup
from scrapy.exceptions import DropItem

# crawl helpers such as current_ts() and mark_crawled_url() come in here
from feed.utils import *

# to use django models
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "ohmyrss.settings")
django.setup()

from web.models import *
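
# A minimal sketch of how these pipelines could be registered in the Scrapy
# settings, per the note at the top of this file; the priority numbers are
# assumptions -- only the relative order (validate -> clean DOM -> persist)
# matters:
#
#     ITEM_PIPELINES = {
#         'feed.pipelines.ValidPipeline': 100,
#         'feed.pipelines.DomPipeline': 200,
#         'feed.pipelines.InsertDBPipeline': 300,
#     }
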
class ValidPipeline(object):

    def process_item(self, item, spider):
        if item['title'] and item['content'] and item['url'] and item['name'] and item['req_url']:
            # "禁止转载" means "reproduction forbidden"; respect it for GitHub sources
            if 'github' in item['url'] and ',禁止转载' in item['content']:
                # raising DropItem tells Scrapy to discard this item
                raise DropItem(f"Data not allowed`{item['title']}")
            else:
                return item
        else:
            raise DropItem(f"Data not valid`{item['title']}")

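# Item fields assumed by the pipelines in this file (inferred from the
# accesses below): title, content, url, name, req_url are required and
# checked above; author, trim_style_tags, trims, css are optional.
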
class DomPipeline(object):
    """
    handle dom structure
    """

    def process_item(self, item, spider):
        content_soup = BeautifulSoup(item['content'], "html.parser")

        # make hrefs absolute and open links in a new tab
        for a in content_soup.find_all('a'):
            rel_href = a.attrs.get('href')
            abs_href = urllib.parse.urljoin(item['url'], rel_href)
            a.attrs['href'] = abs_href
            a.attrs['target'] = '_blank'

            # trim empty a tag
            if not a.contents:
                a.decompose()
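        # e.g. href="/post/1" in content fetched from https://example.com/feed
        # comes out as href="https://example.com/post/1"; urljoin() leaves
        # already-absolute hrefs unchanged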
        # promote lazy-load sources to src, then make src absolute
        for img in content_soup.find_all('img'):
            if img.attrs.get('file'):
                img.attrs['src'] = img.attrs['file']
            elif img.attrs.get('data-src'):
                img.attrs['src'] = img.attrs['data-src']
            elif img.attrs.get('data-original'):
                img.attrs['src'] = img.attrs['data-original']

            # drop responsive/lazy-load attributes; pop() tolerates missing keys,
            # so one absent attribute does not skip the rest
            for attr in ('srcset', 'data-srcset', 'data-s', 'data-w', 'data-type', 'data-ratio'):
                img.attrs.pop(attr, None)

            rel_src = img.attrs.get('src')
            abs_src = urllib.parse.urljoin(item['url'], rel_src)
            img.attrs['src'] = abs_src
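        # e.g. <img data-src="https://cdn.example.com/a.png" srcset="..."> ends up
        # as <img src="https://cdn.example.com/a.png">, so the stored HTML renders
        # without the source site's lazy-load scripts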
        # code style: strip inline styles from <pre> blocks
        for pre in content_soup.find_all('pre'):
            pre.attrs.pop('style', None)

        # deny exec js: rename <script> to inert <noscript>
        for script in content_soup.find_all('script'):
            script.name = 'noscript'

        # for tencent crayon code theme, keep space symbols
        for s in content_soup.find_all('span', class_='crayon-h'):
            s.attrs['style'] = "white-space:pre;"

        # reset span/p font size
        for tag in (content_soup.find_all('span') + content_soup.find_all('p')):
            if tag.attrs.get('style'):
                tag.attrs['style'] = re.sub(r'font-size\s*:\s*[^;]+;', '', tag.attrs['style'])

            # trim empty tag
            if not tag.contents:
                tag.decompose()
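        # e.g. style="color:red;font-size:12px;" becomes style="color:red;"
        # (the regex only matches declarations terminated by a semicolon)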
        # clear inline styles on configured tag names
        if item.get('trim_style_tags'):
            for tag in item['trim_style_tags']:
                for t in content_soup.find_all(tag):
                    if t.attrs.get('style'):
                        t.attrs['style'] = ''

        # remove nodes matched by configured XPath expressions
        if item.get('trims'):
            content_etree = etree.fromstring(str(content_soup))
            for xpath in item['trims']:
                for node in content_etree.xpath(xpath):
                    node.getparent().remove(node)
            item['content'] = etree.tostring(content_etree, pretty_print=False, encoding="utf-8").decode('utf8')
        else:
            item['content'] = str(content_soup)

        # prepend custom css
        if item.get('css'):
            item['content'] = f"<style>{item['css']}</style>{item['content']}"

        return item

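# A hypothetical spider-side item showing how DomPipeline's optional knobs
# are supplied (values invented for illustration; only the keys come from
# the code above):
#
#     yield {
#         'name': 'some-site', 'title': '...', 'url': '...', 'req_url': '...',
#         'content': '<div>...</div>', 'author': '...',
#         'trim_style_tags': ['table', 'td'],    # clear style="" on these tags
#         'trims': ['//div[@class="footer"]'],   # XPath nodes to remove
#         'css': 'img{max-width:100%}',          # prepended inside <style>
#     }
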
class InsertDBPipeline(object):

    def process_item(self, item, spider):
        from web.utils import write_dat2_file

        site = Site.objects.get(name=item['name'])

        if site.status == 'active':
            try:
                uindex = current_ts()

                article = Article(site=site, title=item['title'], uindex=uindex, src_url=item['url'],
                                  author=item.get('author'))
                article.save()

                write_dat2_file(uindex, site.id, item['content'])

                spider.logger.info(f"Insert to DB:`{item['title']}`{item['url']}`{item['req_url']}")
                mark_crawled_url(item['url'], item['req_url'])
            except django.db.utils.IntegrityError:
                # duplicate article; still mark the URL as crawled so it is not fetched again
                mark_crawled_url(item['url'], item['req_url'])