adapter_twiwritenet.py - This Python code defines a class `TwiwriteNetSiteAdapter`, the fanficdownloader site adapter for www.twiwrite.net.

/fanficdownloader/adapters/adapter_twiwritenet.py

https://code.google.com/p/fanficdownloader/ · Python · 281 lines · 171 code · 59 blank · 51 comment · 46 complexity · 33264a3cbe412a1f6724c94d769b6453 MD5 · raw file

# -*- coding: utf-8 -*-



# Copyright 2011 Fanficdownloader team

#

# Licensed under the Apache License, Version 2.0 (the "License");

# you may not use this file except in compliance with the License.

# You may obtain a copy of the License at

#

#     http://www.apache.org/licenses/LICENSE-2.0

#

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

#



# Software: eFiction

import time

import logging

logger = logging.getLogger(__name__)

import re

import urllib

import urllib2



from .. import BeautifulSoup as bs

from ..htmlcleanup import stripHTML

from .. import exceptions as exceptions



from base_adapter import BaseSiteAdapter,  makeDate



class TwiwriteNetSiteAdapter(BaseSiteAdapter):
    """Site adapter for stories hosted on www.twiwrite.net (eFiction software).

    The adapter normalizes story URLs to
    ``http://www.twiwrite.net/viewstory.php?sid=<id>``, logs in when the
    site asks for it, collects chapter links and story metadata from the
    story index page, and extracts the text of individual chapters.
    """

    def __init__(self, config, url):
        """Initialize the adapter and derive the story id from ``url``.

        config -- configuration object, passed through to BaseSiteAdapter.
        url    -- story URL; its query string is expected to be ``sid=<id>``.
        """
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','twrt')

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252", "utf8"]
        self.is_adult = False
        # if left empty, twiwrite.net doesn't return any message at all.
        self.username = "NoneGiven"
        self.password = ""

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=')[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid=' + self.story.getMetadata('storyId'))

    @staticmethod
    def getSiteDomain():
        """Return the canonical domain used when building URLs."""
        return 'www.twiwrite.net'

    @classmethod
    def getAcceptDomains(cls):
        """Return the list of domains this adapter accepts."""
        return ['www.twiwrite.net','twiwrite.net']

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return "http://www.twiwrite.net/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source that a valid story URL must match."""
        return re.escape("http://")+r"(www\.)?"+re.escape("twiwrite.net/viewstory.php?sid=")+r"\d+$"

    def needToLoginCheck(self, data):
        """Return True if page text ``data`` contains a login-related message."""
        login_markers = (
            'Registered Users Only',
            'There is no such account on our website',
            "That password doesn't match the one in our database",
        )
        return any(marker in data for marker in login_markers)

    def performLogin(self, url):
        """Post the login form and return True on success.

        Uses self.username/self.password when self.password is set,
        otherwise the "username"/"password" config values.

        url -- the page that required the login; only used in the
               FailedToLogin exception.

        Raises exceptions.FailedToLogin when the response does not
        contain "Member Account".
        """
        if self.password:
            penname, password = self.username, self.password
        else:
            penname, password = self.getConfig("username"), self.getConfig("password")

        params = {
            'penname': penname,
            'password': password,
            'cookiecheck': '1',
            'submit': 'Submit',
        }

        loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
        logger.info("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                              params['penname']))

        d = self._fetchUrl(loginUrl, params)

        if "Member Account" not in d:
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                              params['penname']))
            raise exceptions.FailedToLogin(url,params['penname'])
        return True

    @staticmethod
    def _fromBodyTag(data):
        """Return ``data`` starting at its first "<body", or unchanged if absent.

        There were problems with some stories, but only in calibre --
        suspected to be differences between SGML parsers in python.
        Dropping everything before <body> is a nasty hack, but it works.
        """
        start = data.find("<body")
        if start < 0:
            # No <body> at all: let the caller's own checks report the problem.
            return data
        return data[start:]

    @staticmethod
    def _attrOrEmpty(node, key):
        """Return node[key], or "" when the node has no such attribute.

        Text nodes and None do not support attribute lookup at all,
        hence the broad (but not bare) except.
        """
        try:
            return node[key]
        except Exception:
            return ""

    @staticmethod
    def _linkTexts(labelspan, hrefPattern):
        """Return the text of each sibling link whose href matches ``hrefPattern``.

        Links are searched under the parent of ``labelspan``.  Links
        whose ``.string`` is None (for example, nested markup) are skipped.
        """
        links = labelspan.parent.findAll('a', href=re.compile(hrefPattern))
        return [link.string for link in links if link.string is not None]

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and populate chapters and metadata.

        Fills self.chapterUrls with (title, url) tuples and sets title,
        author, numChapters and the labelled metadata on self.story.
        Series information is collected on a best-effort basis.

        Raises:
          exceptions.StoryDoesNotExist  -- the index page returned HTTP 404.
          exceptions.FailedToLogin      -- login was required and failed.
          exceptions.FailedToDownload   -- the story has not been validated.
          exceptions.AdultCheckRequired -- the adult warning page was shown.
        Other urllib2.HTTPError values are re-raised unchanged.
        """
        if self.is_adult or self.getConfig("is_adult"):
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&ageconsent=ok&warning=1" # XXX
        else:
            addurl = ""

        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback intact.
            raise

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        if "Contains Explicit Content for mature adults only! May contain graphic violence, mature sexual situations, and explicit language. Read with caution." in data:
            raise exceptions.AdultCheckRequired(self.url)

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(self._fromBodyTag(data))

        storyId = self.story.getMetadata('storyId')
        pagetitlediv = soup.find('div',id='pagetitle')

        ## Title
        a = pagetitlediv.find('a', href=re.compile(r'viewstory.php\?sid='+storyId+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = pagetitlediv.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        chapterPattern = re.compile(r'viewstory.php\?sid='+storyId+r"&chapter=\d+$")
        for chapter in soup.findAll('a', href=chapterPattern):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        ## The summary is also present, as escaped HTML, in the content
        ## attr of a <meta name='description'> tag.  Unfortunately, we
        ## can't use it because they don't escape (') chars in the desc,
        ## breaking the tag.  It is read from the labelled spans instead.
        self._parseLabelMetadata(soup, url)
        self._parseSeries(soup)

    def _parseLabelMetadata(self, soup, url):
        """Read the ``<span class="label">Name:</span> value`` metadata pairs.

        soup -- parsed story index page.
        url  -- index page URL, handed to setDescription with the summary.
        """
        # <span class="label">Rated:</span> NC-17<br /> etc
        for labelspan in soup.findAll('span',{'class':'label'}):
            value = labelspan.nextSibling
            label = labelspan.string
            if label is None:
                # Label with nested markup has no single string to match on.
                continue

            if 'Summary' in label:
                ## Everything until the next element with class='label'
                ## (or the end of the siblings, if there is none).
                pieces = []
                while value is not None and not self._attrOrEmpty(value,'class') == 'label':
                    pieces.append(str(value))
                    value = value.nextSibling
                self.setDescription(url,"".join(pieces))

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                for cat in self._linkTexts(labelspan, r'browse.php\?type=categories'):
                    self.story.addToList('category',cat)

            if 'Characters' in label:
                for char in self._linkTexts(labelspan, r'browse.php\?type=characters'):
                    self.story.addToList('characters',char)

            if 'Genre' in label:
                genrestext = self._linkTexts(labelspan, r'browse.php\?type=class&type_id=3')
                self.genre = ', '.join(genrestext)
                for genre in genrestext:
                    self.story.addToList('genre',genre)

            if 'Warnings' in label:
                warningstext = self._linkTexts(labelspan, r'browse.php\?type=class&type_id=8')
                self.warning = ', '.join(warningstext)
                for warning in warningstext:
                    self.story.addToList('warning',warning)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(value.strip(), "%B %d, %Y"))

            if 'Updated' in label:
                # there's a stray [ at the end.
                value = value[0:-1]
                self.story.setMetadata('dateUpdated', makeDate(value.strip(), "%B %d, %Y"))

    def _parseSeries(self, soup):
        """Record series name, index and URL if the story links to a series.

        Best effort only: any failure while fetching or parsing the
        series page is logged at debug level and otherwise ignored.
        """
        # Find Series name from series URL.
        a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
        if a is None:
            # Story is not part of a series.
            return

        try:
            series_name = a.string
            series_url = 'http://'+self.host+'/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            wanted = 'viewstory.php?sid='+self.story.getMetadata('storyId')
            # The 1-based position of this story among the series' story
            # links is used as its index in the series.
            for i, storya in enumerate(storyas, 1):
                if storya['href'] == wanted:
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl',series_url)
                    break
        except Exception:
            # I find it hard to care if the series parsing fails
            logger.debug("Series parsing failed for %s", self.url, exc_info=True)

    def getChapterText(self, url):
        """Fetch chapter ``url`` and return its story text.

        The text is the ``<div id="story">`` element, converted by
        self.utf8FromSoup.

        Raises exceptions.FailedToDownload when that element is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        data = self._fromBodyTag(self._fetchUrl(url))

        # otherwise soup eats the br/hr tags.
        soup = bs.BeautifulStoneSoup(data,
                                     selfClosingTags=('br','hr'))

        storydiv = soup.find('div', {'id' : 'story'})

        if storydiv is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url,storydiv)



def getClass():
    """Return the adapter class defined by this module."""
    return TwiwriteNetSiteAdapter
Summary ✨

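This Python code defines a class TwiwriteNetSiteAdapter, a fanficdownloader site adapter for www.twiwrite.net (an eFiction-based site). It validates and normalizes story URLs, logs in when the site requires it, and fetches the story index page to collect chapter links and metadata such as title, author, summary, rating, word count, categories, characters, genres, warnings, completion status, publication and update dates, and series information. It also downloads the text of individual chapters. The code uses BeautifulSoup to parse the HTML and extract this information.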
Tech Fingerprint

Alerts (15)

'def' Ensure functions have docstrings for documentation
55 59 63 69 77 103 171 259 279
Complexity hotspot; lines 70 to 72 (total complexity: 3)
70 71 72
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
174 255
'try:' Ensure try blocks have corresponding except or finally blocks
238