PageRenderTime 46ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/fanficdownloader/adapters/adapter_twilightednet.py

https://code.google.com/p/fanficdownloader/
Python | 253 lines | 217 code | 12 blank | 24 comment | 2 complexity | 21335bc32ca0c26d56d6f22ca54a3ad3 MD5 | raw file
Possible License(s): MIT
  1. # -*- coding: utf-8 -*-
  2. # Copyright 2011 Fanficdownloader team
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. # Software: eFiction
  17. import time
  18. import logging
  19. logger = logging.getLogger(__name__)
  20. import re
  21. import urllib
  22. import urllib2
  23. from .. import BeautifulSoup as bs
  24. from ..htmlcleanup import stripHTML
  25. from .. import exceptions as exceptions
  26. from base_adapter import BaseSiteAdapter, makeDate
  class TwilightedNetSiteAdapter(BaseSiteAdapter):
      """Site adapter for stories hosted on twilighted.net (eFiction software).

      Normalizes story URLs to
      ``http://www.twilighted.net/viewstory.php?sid=<id>``, logs in when the
      site asks for it, and scrapes story metadata, the chapter list and the
      chapter text out of the returned HTML.
      """

      def __init__(self, config, url):
          """Set site-specific defaults and normalize the story URL.

          ``config`` and ``url`` are handed unchanged to
          ``BaseSiteAdapter.__init__``.
          """
          BaseSiteAdapter.__init__(self, config, url)
          self.story.setMetadata('siteabbrev', 'tw')

          # 1252 is a superset of iso-8859-1.  Most sites that claim to be
          # iso-8859-1 (and some that claim to be utf8) are really
          # windows-1252.
          self.decode = ["Windows-1252",
                         "utf8"]

          # if left empty, site doesn't return any message at all.
          self.username = "NoneGiven"
          self.password = ""

          # get storyId from url--url validation guarantees query is only sid=1234
          self.story.setMetadata('storyId', self.parsedUrl.query.split('=')[1])

          # normalized story URL.
          self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid=' + self.story.getMetadata('storyId'))

      @staticmethod
      def getSiteDomain():
          """Return the canonical domain used when building URLs."""
          return 'www.twilighted.net'

      @classmethod
      def getAcceptDomains(cls):
          """Return the list of domains this adapter accepts."""
          return ['www.twilighted.net', 'twilighted.net']

      @classmethod
      def getSiteExampleURLs(cls):
          """Return an example story URL for this site."""
          return "http://www.twilighted.net/viewstory.php?sid=1234"

      def getSiteURLPattern(self):
          """Return the regex source that a valid story URL must match."""
          return re.escape("http://") + r"(www\.)?" + re.escape("twilighted.net/viewstory.php?sid=") + r"\d+$"

      def needToLoginCheck(self, data):
          """Return True when ``data`` contains one of the site's login prompts.

          ``data`` is the page text; it is searched for the three messages
          the site shows for restricted stories or failed credentials.
          """
          login_markers = ('Registered Users Only',
                           'There is no such account on our website',
                           "That password doesn't match the one in our database")
          return any(marker in data for marker in login_markers)

      def performLogin(self, url):
          """Post credentials to the site's login form.

          Uses ``self.username``/``self.password`` when a password has been
          set on the adapter, otherwise the ``username``/``password`` config
          values.  ``url`` is only used for the error report.

          Returns True on success.  Raises ``exceptions.FailedToLogin`` when
          the response does not contain "Member Account".
          """
          params = {}

          if self.password:
              params['penname'] = self.username
              params['password'] = self.password
          else:
              params['penname'] = self.getConfig("username")
              params['password'] = self.getConfig("password")
          params['cookiecheck'] = '1'
          params['submit'] = 'Submit'

          loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
          logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                                params['penname']))

          d = self._fetchUrl(loginUrl, params)

          if "Member Account" not in d:  # Member Account
              logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                                params['penname']))
              raise exceptions.FailedToLogin(url, params['penname'])
          return True

      @staticmethod
      def _wrapHeadlessBody(data):
          """Return ``data`` from ``</head>`` onward, prefixed with ``<html><body>``.

          problems with some stories, but only in calibre.  I suspect
          issues with different SGML parsers in python.  This is a
          nasty hack, but it works.  twilighted isn't writing <body>.

          Raises ValueError (from ``index``) when ``data`` has no ``</head>``.
          """
          return "<html><body>" + data[data.index("</head>"):]

      @staticmethod
      def _attrOrEmpty(node, key):
          """Return ``node[key]``, or "" when the node has no such item.

          Tags without the attribute raise KeyError; text nodes and None do
          not support string keys and raise TypeError.
          """
          try:
              return node[key]
          except (KeyError, TypeError):
              return ""

      def extractChapterUrlsAndMetadata(self):
          """Fetch the story index page and fill in metadata and chapter URLs.

          Logs in first when the page demands it.  Sets title, author,
          chapter list, the labelled metadata fields and, when available,
          series information.

          Raises ``exceptions.StoryDoesNotExist`` on HTTP 404 and
          ``exceptions.FailedToDownload`` when the site reports the story as
          not validated; other HTTP errors propagate unchanged.
          """
          url = self.url + '&index=1'
          logger.debug("URL: " + url)

          try:
              data = self._fetchUrl(url)
          except urllib2.HTTPError as e:
              if e.code == 404:
                  raise exceptions.StoryDoesNotExist(self.url)
              # bare raise keeps the original traceback intact.
              raise

          if self.needToLoginCheck(data):
              # need to log in for this one.
              self.performLogin(url)
              data = self._fetchUrl(url)

          if "Access denied. This story has not been validated by the adminstrators of this site." in data:
              raise exceptions.FailedToDownload(self.getSiteDomain() + " says: Access denied. This story has not been validated by the adminstrators of this site.")

          data = self._wrapHeadlessBody(data)

          # use BeautifulSoup HTML parser to make everything easier to find.
          soup = bs.BeautifulSoup(data)

          story_id = self.story.getMetadata('storyId')

          ## Title
          a = soup.find('a', href=re.compile(r'viewstory.php\?sid=' + story_id + "$"))
          self.story.setMetadata('title', stripHTML(a))

          # Find authorid and URL from... author url.
          a = soup.find('a', href=re.compile(r"viewuser.php"))
          self.story.setMetadata('authorId', a['href'].split('=')[1])
          self.story.setMetadata('authorUrl', 'http://' + self.host + '/' + a['href'])
          self.story.setMetadata('author', a.string)

          # Find the chapters:
          for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid=' + story_id + r"&chapter=\d+$")):
              # just in case there's tags, like <i> in chapter titles.
              self.chapterUrls.append((stripHTML(chapter), 'http://' + self.host + '/' + chapter['href']))

          self.story.setMetadata('numChapters', len(self.chapterUrls))

          self._parseLabelMetadata(soup, url)
          self._parseSeries(soup)

      def _parseLabelMetadata(self, soup, url):
          """Read the ``<span class="label">`` fields of the index page.

          Each label span is followed by its value, e.g.
          ``<span class="label">Rated:</span> NC-17<br />``.  ``url`` is
          passed on to ``setDescription`` for the summary.
          """
          for labelspan in soup.findAll('span', {'class': 'label'}):
              value = labelspan.nextSibling
              label = labelspan.string
              if label is None:
                  # No plain-text label to match against; skip this span.
                  continue

              if 'Summary' in label:
                  ## Everything until the next span class='label'
                  ## (or the end of the siblings, whichever comes first).
                  pieces = []
                  while value is not None and self._attrOrEmpty(value, 'class') != 'label':
                      pieces.append(str(value))
                      value = value.nextSibling
                  self.setDescription(url, "".join(pieces))

              if 'Rated' in label:
                  self.story.setMetadata('rating', value)

              if 'Word count' in label:
                  self.story.setMetadata('numWords', value)

              if 'Categories' in label:
                  cats = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=categories'))
                  for cat in cats:
                      # .string is None when the link holds nested markup.
                      if cat.string is not None:
                          self.story.addToList('category', cat.string)

              if 'Characters' in label:
                  chars = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=characters'))
                  for char in chars:
                      if char.string is not None:
                          self.story.addToList('characters', char.string)

              ## twilighted.net doesn't use genre.

              if 'Completed' in label:
                  if 'Yes' in value:
                      self.story.setMetadata('status', 'Completed')
                  else:
                      self.story.setMetadata('status', 'In-Progress')

              if 'Published' in label:
                  self.story.setMetadata('datePublished', makeDate(value.strip(), "%B %d, %Y"))

              if 'Updated' in label:
                  self.story.setMetadata('dateUpdated', makeDate(value.strip(), "%B %d, %Y"))

      def _parseSeries(self, soup):
          """Record series name, position and URL when the story is in a series.

          Best effort: any failure while reading the series page is logged
          at debug level and otherwise ignored.
          """
          try:
              # Find Series name from series URL.
              link = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
              if link is None:
                  # Story is not part of a series.
                  return
              series_name = link.string
              series_url = 'http://' + self.host + '/' + link['href']

              # use BeautifulSoup HTML parser to make everything easier to find.
              seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
              storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
              wanted = 'viewstory.php?sid=' + self.story.getMetadata('storyId')
              for position, storya in enumerate(storyas, 1):
                  if storya['href'] == wanted:
                      self.setSeries(series_name, position)
                      self.story.setMetadata('seriesUrl', series_url)
                      break
          except Exception:
              # I find it hard to care if the series parsing fails
              logger.debug("Series parsing failed for %s", self.url, exc_info=True)

      def getChapterText(self, url):
          """Download one chapter and return its text.

          Returns ``self.utf8FromSoup(url, <div id="story">)``.  Raises
          ``exceptions.FailedToDownload`` when that div is missing.
          """
          logger.debug('Getting chapter text from: %s' % url)

          data = self._wrapHeadlessBody(self._fetchUrl(url))

          soup = bs.BeautifulStoneSoup(data,
                                       selfClosingTags=('br', 'hr'))  # otherwise soup eats the br/hr tags.

          span = soup.find('div', {'id': 'story'})

          if span is None:
              raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

          return self.utf8FromSoup(url, span)
  def getClass():
      """Return the adapter class defined by this module."""
      adapter_class = TwilightedNetSiteAdapter
      return adapter_class