adapter_adastrafanficcom.py - This is a Python class that parses data from the AdAstra fanfiction website.

/fanficdownloader/adapters/adapter_adastrafanficcom.py

https://code.google.com/p/fanficdownloader/ · Python · 238 lines · 134 code · 51 blank · 53 comment · 37 complexity · 164c7b78da7cea047a7eda720711b6b6 MD5 · raw file

# -*- coding: utf-8 -*-



# Copyright 2011 Fanficdownloader team

#

# Licensed under the Apache License, Version 2.0 (the "License");

# you may not use this file except in compliance with the License.

# You may obtain a copy of the License at

#

#     http://www.apache.org/licenses/LICENSE-2.0

#

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

#



# Software: eFiction

import time

import logging

logger = logging.getLogger(__name__)

import re

import urllib

import urllib2



from .. import BeautifulSoup as bs

from ..htmlcleanup import stripHTML

from .. import exceptions as exceptions



from base_adapter import BaseSiteAdapter,  makeDate



class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
    '''
    Site adapter for www.adastrafanfic.com (eFiction software).

    Reads the story index page to collect metadata and chapter links,
    and fetches individual chapter pages to return their text.
    '''

    def __init__(self, config, url):
        '''
        Set up the adapter for the story at url.

        config and url are handed to BaseSiteAdapter unchanged.  The
        story id is taken from the URL query string and the story URL
        is rewritten to its normalized form.
        '''
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','aaff')
        # Windows-1252 is a superset of iso-8859-1.  Most sites that
        # claim to be iso-8859-1 (and some that claim to be utf8) are
        # really windows-1252.
        self.decode = ["Windows-1252", "utf8"]
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=')[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

    @staticmethod
    def getSiteDomain():
        '''Return the host name this adapter handles.'''
        return 'www.adastrafanfic.com'

    @classmethod
    def getSiteExampleURLs(cls):
        '''Return an example of a story URL accepted by this adapter.'''
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        '''Return the regular expression a story URL must match.'''
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    def use_pagecache(self):
        '''
        adapters that will work with the page cache need to implement
        this and change it to True.
        '''
        return True

    @staticmethod
    def _trimToBody(data):
        '''
        Return data starting at its "<body" tag.

        There were problems with some stories, but only in calibre,
        suspected to be differences between SGML parsers in python;
        dropping everything before the body is a nasty hack, but it
        works.  When no "<body" is present the data is returned
        unchanged so the caller can report the missing elements itself
        instead of failing here with a ValueError.
        '''
        start = data.find("<body")
        if start == -1:
            return data
        return data[start:]

    def _addLinkTexts(self, labelspan, hrefpattern, listname):
        '''
        Add the text of every link under labelspan's parent whose href
        matches hrefpattern to the story list listname.

        Links without a single text string (.string is None) are
        skipped.  Returns the list of texts that were added.
        '''
        links = labelspan.parent.findAll('a',href=re.compile(hrefpattern))
        texts = [link.string for link in links if link.string is not None]
        for text in texts:
            self.story.addToList(listname,text)
        return texts

    def _parseLabelMetadata(self, soup, url):
        '''
        Read the labelled metadata from the index page soup.

        The page marks each item up as
        <span class="label">Rated:</span> NC-17<br /> etc.
        url is only passed on to setDescription for the summary.
        '''
        def defaultGetattr(d,k):
            # Tags raise KeyError for a missing attribute; text nodes
            # are strings and raise TypeError for a non-integer index.
            try:
                return d[k]
            except (KeyError, TypeError):
                return ""

        for labelspan in soup.findAll('span',{'class':'label'}):
            value = labelspan.nextSibling
            label = labelspan.string
            if label is None:
                # a label span containing nested markup has no single
                # string to match against; nothing to do for it.
                continue

            if 'Summary' in label:
                ## Everything until the next span class='label'
                parts = []
                while value and not defaultGetattr(value,'class') == 'label':
                    parts.append(str(value))
                    value = value.nextSibling
                svalue = ''.join(parts)
                # sometimes poorly formated desc (<p> w/o </p>) leads
                # to all labels being included.  Only cut when the
                # marker is really there: find() returns -1 otherwise
                # and slicing with it would drop the last character.
                cut = svalue.find('<span class="label">')
                if cut != -1:
                    svalue = svalue[:cut]
                self.setDescription(url,svalue)

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                self._addLinkTexts(labelspan, r'browse\.php\?type=categories', 'category')

            if 'Characters' in label:
                self._addLinkTexts(labelspan, r'browse\.php\?type=characters', 'characters')

            if 'Genre' in label:
                genrestext = self._addLinkTexts(labelspan, r'browse\.php\?type=class&type_id=1', 'genre')
                self.genre = ', '.join(genrestext)

            if 'Warnings' in label:
                warningstext = self._addLinkTexts(labelspan, r'browse\.php\?type=class&type_id=2', 'warnings')
                self.warning = ', '.join(warningstext)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(value.strip(), "%d %b %Y"))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(value.strip(), "%d %b %Y"))

    def _parseSeriesInfo(self, soup):
        '''
        Look for a series link in the index page soup and, when the
        story is listed on the series page, record the series name,
        the story's position in it and the series URL.

        This is best effort: any failure is logged at debug level and
        otherwise ignored.
        '''
        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries\.php\?seriesid=\d+"))
            if a is None:
                # story is not part of a series.
                return
            series_name = a.string
            series_url = 'http://'+self.host+'/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory\.php\?sid=\d+$'))
            wanted = 'viewstory.php?sid='+self.story.getMetadata('storyId')
            for i, storya in enumerate(storyas, 1):
                if storya['href'] == wanted:
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl',series_url)
                    break
        except Exception as e:
            # I find it hard to care if the series parsing fails
            logger.debug("Ignoring failure while parsing series: %s", e)

    def extractChapterUrlsAndMetadata(self):
        '''
        Fetch the story index page and fill in the story metadata and
        self.chapterUrls from it.

        Raises exceptions.StoryDoesNotExist when the page gives an HTTP
        404 (other HTTP errors are re-raised as they are),
        exceptions.AdultCheckRequired when the page shows the site's
        NC-17 notice, and exceptions.FailedToDownload when the title or
        author link cannot be found.
        '''
        if self.is_adult or self.getConfig("is_adult"):
            addurl = "&warning=5"
        else:
            addurl = ""

        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

        if "Content is only suitable for mature adults. May contain explicit language and adult themes. Equivalent of NC-17." in data:
            raise exceptions.AdultCheckRequired(self.url)

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(self._trimToBody(data))
        storyId = self.story.getMetadata('storyId')

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory\.php\?sid='+storyId+r'$'))
        if a is None:
            raise exceptions.FailedToDownload("Error parsing story page: %s!  Missing title link!" % url)
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser\.php"))
        if a is None:
            raise exceptions.FailedToDownload("Error parsing story page: %s!  Missing author link!" % url)
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        chapter_re = re.compile(r'viewstory\.php\?sid='+storyId+r'&chapter=\d+$')
        for chapter in soup.findAll('a', href=chapter_re):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        ## Summary, strangely, is also in the content attr of a
        ## <meta name='description'> tag as escaped HTML.  Unfortunately,
        ## we can't use it because they don't escape (') chars in the
        ## desc, breaking the tag.  The labelled summary is used instead.
        self._parseLabelMetadata(soup, url)
        self._parseSeriesInfo(soup)

    def getChapterText(self, url):
        '''
        Fetch the chapter page at url and return the content of its
        <div id="story"> element, passed through utf8FromSoup.

        Raises exceptions.FailedToDownload when that element is missing.
        '''
        logger.debug('Getting chapter text from: %s' % url)

        data = self._trimToBody(self._fetchUrl(url))

        soup = bs.BeautifulStoneSoup(data,
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        storydiv = soup.find('div', {'id' : 'story'})

        if storydiv is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url,storydiv)



def getClass():
    '''Return the adapter class defined by this module.'''
    adapter_class = AdAstraFanficComSiteAdapter
    return adapter_class
Summary ✨

This is a Python class that parses data from the AdAstra fanfiction website. It is a site adapter (a subclass of BaseSiteAdapter) that fetches a story's index page, extracts metadata such as title, author, genre, rating, and the list of chapter URLs, and stores it on a story object. A separate method fetches an individual chapter page and returns its text. It works by downloading the site's HTML pages and parsing them with BeautifulSoup rather than by calling a website API.
Tech Fingerprint

Alerts (9)

'def' Ensure functions have docstrings for documentation
53 57 70 124 216 236
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
127 211
'try:' Ensure try blocks have corresponding except or finally blocks
194