PageRenderTime 52ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/fanficdownloader/adapters/adapter_twiwritenet.py

https://code.google.com/p/fanficdownloader/
Python | 281 lines | 254 code | 6 blank | 21 comment | 4 complexity | 33264a3cbe412a1f6724c94d769b6453 MD5 | raw file
Possible License(s): MIT
  1. # -*- coding: utf-8 -*-
  2. # Copyright 2011 Fanficdownloader team
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # Software: eFiction
  17. import time
  18. import logging
  19. logger = logging.getLogger(__name__)
  20. import re
  21. import urllib
  22. import urllib2
  23. from .. import BeautifulSoup as bs
  24. from ..htmlcleanup import stripHTML
  25. from .. import exceptions as exceptions
  26. from base_adapter import BaseSiteAdapter, makeDate
  27. class TwiwriteNetSiteAdapter(BaseSiteAdapter):
  28. def __init__(self, config, url):
  29. BaseSiteAdapter.__init__(self, config, url)
  30. self.story.setMetadata('siteabbrev','twrt')
  31. self.decode = ["Windows-1252",
  32. "utf8"] # 1252 is a superset of iso-8859-1.
  33. # Most sites that claim to be
  34. # iso-8859-1 (and some that claim to be
  35. # utf8) are really windows-1252.
  36. self.is_adult = False
  37. self.username = "NoneGiven" # if left empty, twiwrite.net doesn't return any message at all.
  38. self.password = ""
  39. # get storyId from url--url validation guarantees query is only sid=1234
  40. self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
  41. # normalized story URL.
  42. self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
  43. @staticmethod
  44. def getSiteDomain():
  45. return 'www.twiwrite.net'
  46. @classmethod
  47. def getAcceptDomains(cls):
  48. return ['www.twiwrite.net','twiwrite.net']
  49. @classmethod
  50. def getSiteExampleURLs(cls):
  51. return "http://www.twiwrite.net/viewstory.php?sid=1234"
  52. def getSiteURLPattern(self):
  53. return re.escape("http://")+r"(www\.)?"+re.escape("twiwrite.net/viewstory.php?sid=")+r"\d+$"
  54. def needToLoginCheck(self, data):
  55. if 'Registered Users Only' in data \
  56. or 'There is no such account on our website' in data \
  57. or "That password doesn't match the one in our database" in data:
  58. return True
  59. else:
  60. return False
  61. def performLogin(self, url):
  62. params = {}
  63. if self.password:
  64. params['penname'] = self.username
  65. params['password'] = self.password
  66. else:
  67. params['penname'] = self.getConfig("username")
  68. params['password'] = self.getConfig("password")
  69. params['cookiecheck'] = '1'
  70. params['submit'] = 'Submit'
  71. loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
  72. logger.info("Will now login to URL (%s) as (%s)" % (loginUrl,
  73. params['penname']))
  74. d = self._fetchUrl(loginUrl, params)
  75. if "Member Account" not in d : #Member Account
  76. logger.info("Failed to login to URL %s as %s" % (loginUrl,
  77. params['penname']))
  78. raise exceptions.FailedToLogin(url,params['penname'])
  79. return False
  80. else:
  81. return True
  82. def extractChapterUrlsAndMetadata(self):
  83. if self.is_adult or self.getConfig("is_adult"):
  84. # Weirdly, different sites use different warning numbers.
  85. # If the title search below fails, there's a good chance
  86. # you need a different number. print data at that point
  87. # and see what the 'click here to continue' url says.
  88. addurl = "&ageconsent=ok&warning=1" # XXX
  89. else:
  90. addurl=""
  91. url = self.url+'&index=1'+addurl
  92. logger.debug("URL: "+url)
  93. try:
  94. data = self._fetchUrl(url)
  95. except urllib2.HTTPError, e:
  96. if e.code == 404:
  97. raise exceptions.StoryDoesNotExist(self.url)
  98. else:
  99. raise e
  100. if self.needToLoginCheck(data):
  101. # need to log in for this one.
  102. self.performLogin(url)
  103. data = self._fetchUrl(url)
  104. if "Access denied. This story has not been validated by the adminstrators of this site." in data:
  105. raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
  106. if "Contains Explicit Content for mature adults only! May contain graphic violence, mature sexual situations, and explicit language. Read with caution." in data:
  107. raise exceptions.AdultCheckRequired(self.url)
  108. # problems with some stories, but only in calibre. I suspect
  109. # issues with different SGML parsers in python. This is a
  110. # nasty hack, but it works.
  111. data = data[data.index("<body"):]
  112. # use BeautifulSoup HTML parser to make everything easier to find.
  113. soup = bs.BeautifulSoup(data)
  114. pagetitlediv = soup.find('div',id='pagetitle')
  115. ## Title
  116. a = pagetitlediv.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
  117. self.story.setMetadata('title',stripHTML(a))
  118. # Find authorid and URL from... author url.
  119. a = pagetitlediv.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
  120. self.story.setMetadata('authorId',a['href'].split('=')[1])
  121. self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
  122. self.story.setMetadata('author',a.string)
  123. # Find the chapters:
  124. for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
  125. # just in case there's tags, like <i> in chapter titles.
  126. self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
  127. self.story.setMetadata('numChapters',len(self.chapterUrls))
  128. ## <meta name='description' content='&lt;p&gt;Description&lt;/p&gt; ...' >
  129. ## Summary, strangely, is in the content attr of a <meta name='description'> tag
  130. ## which is escaped HTML. Unfortunately, we can't use it because they don't
  131. ## escape (') chars in the desc, breakin the tag.
  132. #meta_desc = soup.find('meta',{'name':'description'})
  133. #metasoup = bs.BeautifulStoneSoup(meta_desc['content'])
  134. #self.story.setMetadata('description',stripHTML(metasoup))
  135. def defaultGetattr(d,k):
  136. try:
  137. return d[k]
  138. except:
  139. return ""
  140. # <span class="label">Rated:</span> NC-17<br /> etc
  141. labels = soup.findAll('span',{'class':'label'})
  142. for labelspan in labels:
  143. value = labelspan.nextSibling
  144. label = labelspan.string
  145. if 'Summary' in label:
  146. ## Everything until the next span class='label'
  147. svalue = ""
  148. while not defaultGetattr(value,'class') == 'label':
  149. svalue += str(value)
  150. value = value.nextSibling
  151. self.setDescription(url,svalue)
  152. #self.story.setMetadata('description',stripHTML(svalue))
  153. if 'Rated' in label:
  154. self.story.setMetadata('rating', value)
  155. if 'Word count' in label:
  156. self.story.setMetadata('numWords', value)
  157. if 'Categories' in label:
  158. cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
  159. catstext = [cat.string for cat in cats]
  160. for cat in catstext:
  161. self.story.addToList('category',cat.string)
  162. if 'Characters' in label:
  163. chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
  164. charstext = [char.string for char in chars]
  165. for char in charstext:
  166. self.story.addToList('characters',char.string)
  167. if 'Genre' in label:
  168. genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=3'))
  169. genrestext = [genre.string for genre in genres]
  170. self.genre = ', '.join(genrestext)
  171. for genre in genrestext:
  172. self.story.addToList('genre',genre.string)
  173. if 'Warnings' in label:
  174. warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=8'))
  175. warningstext = [warning.string for warning in warnings]
  176. self.warning = ', '.join(warningstext)
  177. for warning in warningstext:
  178. self.story.addToList('warning',warning.string)
  179. if 'Completed' in label:
  180. if 'Yes' in value:
  181. self.story.setMetadata('status', 'Completed')
  182. else:
  183. self.story.setMetadata('status', 'In-Progress')
  184. if 'Published' in label:
  185. self.story.setMetadata('datePublished', makeDate(value.strip(), "%B %d, %Y"))
  186. if 'Updated' in label:
  187. # there's a stray [ at the end.
  188. value = value[0:-1]
  189. self.story.setMetadata('dateUpdated', makeDate(value.strip(), "%B %d, %Y"))
  190. try:
  191. # Find Series name from series URL.
  192. a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
  193. series_name = a.string
  194. series_url = 'http://'+self.host+'/'+a['href']
  195. # use BeautifulSoup HTML parser to make everything easier to find.
  196. seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
  197. storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
  198. i=1
  199. for a in storyas:
  200. if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
  201. self.setSeries(series_name, i)
  202. self.story.setMetadata('seriesUrl',series_url)
  203. break
  204. i+=1
  205. except:
  206. # I find it hard to care if the series parsing fails
  207. pass
  208. def getChapterText(self, url):
  209. logger.debug('Getting chapter text from: %s' % url)
  210. data = self._fetchUrl(url)
  211. # problems with some stories, but only in calibre. I suspect
  212. # issues with different SGML parsers in python. This is a
  213. # nasty hack, but it works.
  214. data = data[data.index("<body"):]
  215. soup = bs.BeautifulStoneSoup(data,
  216. selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
  217. span = soup.find('div', {'id' : 'story'})
  218. if None == span:
  219. raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
  220. return self.utf8FromSoup(url,span)
  221. def getClass():
  222. return TwiwriteNetSiteAdapter