/feed/pipelines.py
https://github.com/richshaw2015/oh-my-rss
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
import re
import urllib.parse

import django
import lxml.etree as etree
from bs4 import BeautifulSoup
from scrapy.exceptions import DropItem

# crawl helpers such as current_ts() and mark_crawled_url() come in here
from feed.utils import *

# to use django models
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "ohmyrss.settings")
django.setup()

from web.models import *
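
# A minimal sketch of how these pipelines could be registered in the Scrapy
# settings, per the note at the top of this file; the priority numbers are
# assumptions -- only the relative order (validate -> clean DOM -> persist)
# matters:
#
#     ITEM_PIPELINES = {
#         'feed.pipelines.ValidPipeline': 100,
#         'feed.pipelines.DomPipeline': 200,
#         'feed.pipelines.InsertDBPipeline': 300,
#     }
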
class ValidPipeline(object):

    def process_item(self, item, spider):
        if item['title'] and item['content'] and item['url'] and item['name'] and item['req_url']:
            # "禁止转载" means "reproduction forbidden"; respect it for GitHub sources
            if 'github' in item['url'] and ',禁止转载' in item['content']:
                # raising DropItem tells Scrapy to discard this item
                raise DropItem(f"Data not allowed`{item['title']}")
            else:
                return item
        else:
            raise DropItem(f"Data not valid`{item['title']}")

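# Item fields assumed by the pipelines in this file (inferred from the
# accesses below): title, content, url, name, req_url are required and
# checked above; author, trim_style_tags, trims, css are optional.
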
class DomPipeline(object):
    """
    handle dom structure
    """

    def process_item(self, item, spider):
        content_soup = BeautifulSoup(item['content'], "html.parser")

        # make hrefs absolute and open links in a new tab
        for a in content_soup.find_all('a'):
            rel_href = a.attrs.get('href')
            abs_href = urllib.parse.urljoin(item['url'], rel_href)
            a.attrs['href'] = abs_href
            a.attrs['target'] = '_blank'

            # trim empty a tag
            if not a.contents:
                a.decompose()
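        # e.g. href="/post/1" in content fetched from https://example.com/feed
        # comes out as href="https://example.com/post/1"; urljoin() leaves
        # already-absolute hrefs unchanged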
        # promote lazy-load sources to src, then make src absolute
        for img in content_soup.find_all('img'):
            if img.attrs.get('file'):
                img.attrs['src'] = img.attrs['file']
            elif img.attrs.get('data-src'):
                img.attrs['src'] = img.attrs['data-src']
            elif img.attrs.get('data-original'):
                img.attrs['src'] = img.attrs['data-original']

            # drop responsive/lazy-load attributes; pop() tolerates missing keys,
            # so one absent attribute does not skip the rest
            for attr in ('srcset', 'data-srcset', 'data-s', 'data-w', 'data-type', 'data-ratio'):
                img.attrs.pop(attr, None)

            rel_src = img.attrs.get('src')
            abs_src = urllib.parse.urljoin(item['url'], rel_src)
            img.attrs['src'] = abs_src
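        # e.g. <img data-src="https://cdn.example.com/a.png" srcset="..."> ends up
        # as <img src="https://cdn.example.com/a.png">, so the stored HTML renders
        # without the source site's lazy-load scripts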
        # code style: strip inline styles from <pre> blocks
        for pre in content_soup.find_all('pre'):
            pre.attrs.pop('style', None)

        # deny exec js: rename <script> to inert <noscript>
        for script in content_soup.find_all('script'):
            script.name = 'noscript'

        # for tencent crayon code theme, keep space symbols
        for s in content_soup.find_all('span', class_='crayon-h'):
            s.attrs['style'] = "white-space:pre;"

        # reset span/p font size
        for tag in (content_soup.find_all('span') + content_soup.find_all('p')):
            if tag.attrs.get('style'):
                tag.attrs['style'] = re.sub(r'font-size\s*:\s*[^;]+;', '', tag.attrs['style'])

            # trim empty tag
            if not tag.contents:
                tag.decompose()
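        # e.g. style="color:red;font-size:12px;" becomes style="color:red;"
        # (the regex only matches declarations terminated by a semicolon)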
        # clear inline styles on configured tag names
        if item.get('trim_style_tags'):
            for tag in item['trim_style_tags']:
                for t in content_soup.find_all(tag):
                    if t.attrs.get('style'):
                        t.attrs['style'] = ''

        # remove nodes matched by configured XPath expressions
        if item.get('trims'):
            content_etree = etree.fromstring(str(content_soup))
            for xpath in item['trims']:
                for node in content_etree.xpath(xpath):
                    node.getparent().remove(node)
            item['content'] = etree.tostring(content_etree, pretty_print=False, encoding="utf-8").decode('utf8')
        else:
            item['content'] = str(content_soup)

        # prepend custom css
        if item.get('css'):
            item['content'] = f"<style>{item['css']}</style>{item['content']}"

        return item

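# A hypothetical spider-side item showing how DomPipeline's optional knobs
# are supplied (values invented for illustration; only the keys come from
# the code above):
#
#     yield {
#         'name': 'some-site', 'title': '...', 'url': '...', 'req_url': '...',
#         'content': '<div>...</div>', 'author': '...',
#         'trim_style_tags': ['table', 'td'],    # clear style="" on these tags
#         'trims': ['//div[@class="footer"]'],   # XPath nodes to remove
#         'css': 'img{max-width:100%}',          # prepended inside <style>
#     }
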
class InsertDBPipeline(object):

    def process_item(self, item, spider):
        from web.utils import write_dat2_file

        site = Site.objects.get(name=item['name'])

        if site.status == 'active':
            try:
                uindex = current_ts()

                article = Article(site=site, title=item['title'], uindex=uindex, src_url=item['url'],
                                  author=item.get('author'))
                article.save()

                write_dat2_file(uindex, site.id, item['content'])

                spider.logger.info(f"Insert to DB:`{item['title']}`{item['url']}`{item['req_url']}")
                mark_crawled_url(item['url'], item['req_url'])
            except django.db.utils.IntegrityError:
                # duplicate article; still mark the URL as crawled so it is not fetched again
                mark_crawled_url(item['url'], item['req_url'])