PageRenderTime 85ms CodeModel.GetById 31ms RepoModel.GetById 0ms app.codeStats 0ms

/fanficdownloader/adapters/adapter_mediaminerorg.py

https://code.google.com/p/fanficdownloader/
Python | 237 lines | 132 code | 40 blank | 65 comment | 29 complexity | 587cd16ce5bc9286a84c0e9f20547c0c MD5 | raw file
Possible License(s): MIT
  1. # -*- coding: utf-8 -*-
  2. # Copyright 2011 Fanficdownloader team
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import time
  17. import logging
  18. logger = logging.getLogger(__name__)
  19. import re
  20. import urllib
  21. import urllib2
  22. from .. import BeautifulSoup as bs
  23. from ..htmlcleanup import stripHTML
  24. from .. import exceptions as exceptions
  25. from base_adapter import BaseSiteAdapter, makeDate
class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
    """Adapter for downloading stories from www.mediaminer.org.

    Parses story metadata (title, author, rating, chapters, dates, etc.)
    from the story page and extracts chapter HTML, coping with several
    generations of mediaminer page markup.

    NOTE: this is Python 2 code (urllib2, `except X, e`, dict.has_key)
    using BeautifulSoup 3 (`bs.BeautifulSoup` / `bs.BeautifulStoneSoup`).
    """

    def __init__(self, config, url):
        """Validate *url* against the site pattern and record basic metadata.

        Raises exceptions.InvalidStoryURL when the URL does not match
        getSiteURLPattern().
        """
        BaseSiteAdapter.__init__(self, config, url)

        self.story.setMetadata('siteabbrev','mm')

        # Candidate encodings for fetched pages, tried in order.
        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.

        # get storyId from url--url validation guarantees query correct
        m = re.match(self.getSiteURLPattern(),url)
        if m:
            self.story.setMetadata('storyId',m.group('id'))
            # normalized story URL: always the view_st (whole-story) form.
            self._setURL('http://' + self.getSiteDomain() + '/fanfic/view_st.php/'+self.story.getMetadata('storyId'))
        else:
            raise exceptions.InvalidStoryURL(url,
                                             self.getSiteDomain(),
                                             self.getSiteExampleURLs())

    @staticmethod
    def getSiteDomain():
        """Return the canonical domain this adapter handles."""
        return 'www.mediaminer.org'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return example URLs shown to users when URL validation fails."""
        return "http://"+cls.getSiteDomain()+"/fanfic/view_st.php/123456 http://"+cls.getSiteDomain()+"/fanfic/view_ch.php/1234123/123444#fic_c"

    def getSiteURLPattern(self):
        """Return the regex accepted story URLs must match.

        Matches both whole-story and per-chapter forms; the story id is
        captured as the named group 'id'.
        """
        ## http://www.mediaminer.org/fanfic/view_st.php/76882
        ## http://www.mediaminer.org/fanfic/view_ch.php/167618/594087#fic_c
        return re.escape("http://"+self.getSiteDomain())+\
            "/fanfic/view_(st|ch)\.php/"+r"(?P<id>\d+)(/\d+(#fic_c)?)?$"

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page and populate story metadata and chapterUrls.

        Raises exceptions.StoryDoesNotExist on a 404 or when the page
        lacks the expected rating marker; re-raises other HTTP errors.
        """
        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Rating looks like: [ A - All Readers ] -- strip '[' ']'.
        ## Found above title because we remove the smtxt font to get title.
        smtxt = soup.find("font",{"class":"smtxt"})
        if not smtxt:
            # No rating marker => not a real story page.
            raise exceptions.StoryDoesNotExist(self.url)
        rating = smtxt.string[1:-1]
        self.story.setMetadata('rating',rating)

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"/fanfic/src.php/u/\d+"))
        self.story.setMetadata('authorId',a['href'].split('/')[-1])
        self.story.setMetadata('authorUrl','http://'+self.host+a['href'])
        self.story.setMetadata('author',a.string)

        ## Title - Good grief.  Title varies by chaptered, 1chapter and
        ## 'type=one shot'--and even 'one-shot's can have titled chapter.
        ## But, if colspan=2, there's no chapter title.
        ## <td class="ffh">Atmosphere: Chapter 1</b> <font class="smtxt">[ P - Pre-Teen ]</font></td>
        ## <td colspan=2 class="ffh">Hearts of Ice <font class="smtxt">[ P - Pre-Teen ]</font></td>
        ## <td colspan=2 class="ffh">Suzaku no Princess <font class="smtxt">[ P - Pre-Teen ]</font></td>
        ## <td class="ffh">The Kraut, The Bartender, and The Drunkard: Chapter 1</b> <font class="smtxt">[ P - Pre-Teen ]</font></td>
        ## <td class="ffh">Betrayal and Justice: A Cold Heart</b> <font size="-1">( Chapter 1 )</font> <font class="smtxt">[ A - All Readers ]</font></td>
        ## <td class="ffh">Question and Answer: Question and Answer</b> <font size="-1">( One-Shot )</font> <font class="smtxt">[ A - All Readers ]</font></td>
        title = soup.find('td',{'class':'ffh'})
        for font in title.findAll('font'):
            font.extract() # removes 'font' tags from inside the td.
        if title.has_key('colspan'):
            titlet = stripHTML(title)
        else:
            ## No colspan, it's part chapter title--even if it's a one-shot.
            titlet = ':'.join(stripHTML(title).split(':')[:-1]) # strip trailing 'Chapter X' or chapter title
        self.story.setMetadata('title',titlet)

        ## The story title is difficult to reliably parse from the
        ## story pages.  Getting it from the author page is, but costs
        ## another fetch.
        # authsoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
        # titlea = authsoup.find('a',{'href':'/fanfic/view_st.php/'+self.story.getMetadata('storyId')})
        # self.story.setMetadata('title',titlea.text)

        # save date from first for later.
        firstdate=None

        # Find the chapters; no <select name=cid> means a single-chapter story.
        select = soup.find('select',{'name':'cid'})
        if not select:
            self.chapterUrls.append(( self.story.getMetadata('title'),self.url))
        else:
            for option in select.findAll("option"):
                chapter = stripHTML(option.string)
                ## chapter can be: Chapter 7 [Jan 23, 2011]
                ## or: Vigilant Moonlight ( Chapter 1 ) [Jan 30, 2004]
                ## or even: Prologue ( Prologue ) [Jul 31, 2010]
                m = re.match(r'^(.*?) (\( .*? \) )?\[(.*?)\]$',chapter)
                chapter = m.group(1)
                # save date from first for later.
                if not firstdate:
                    firstdate = m.group(3)
                self.chapterUrls.append((chapter,'http://'+self.host+'/fanfic/view_ch.php/'+self.story.getMetadata('storyId')+'/'+option['value']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # category
        # <a href="/fanfic/src.php/a/567">Ranma 1/2</a>
        for a in soup.findAll('a',href=re.compile(r"^/fanfic/src.php/a/")):
            self.story.addToList('category',a.string)

        # genre
        # <a href="/fanfic/src.php/g/567">...</a>
        for a in soup.findAll('a',href=re.compile(r"^/fanfic/src.php/g/")):
            self.story.addToList('genre',a.string)

        # if firstdate, then the block below will only have last updated.
        if firstdate:
            self.story.setMetadata('datePublished', makeDate(firstdate, "%b %d, %Y"))

        # Everything else is in <tr bgcolor="#EEEED4">; flatten whitespace
        # so the regexes below can match across what were line breaks.
        metastr = stripHTML(soup.find("tr",{"bgcolor":"#EEEED4"})).replace('\n',' ').replace('\r',' ').replace('\t',' ')

        # Latest Revision: August 03, 2010
        m = re.match(r".*?(?:Latest Revision|Uploaded On): ([a-zA-Z]+ \d\d, \d\d\d\d)",metastr)
        if m:
            self.story.setMetadata('dateUpdated', makeDate(m.group(1), "%B %d, %Y"))
            if not firstdate:
                # Single-chapter story: published == updated.
                self.story.setMetadata('datePublished',
                                       self.story.getMetadataRaw('dateUpdated'))
        else:
            # No revision date on the page: fall back to the first
            # chapter's date for both.
            self.story.setMetadata('dateUpdated',
                                   self.story.getMetadataRaw('datePublished'))

        # Words: 123456
        m = re.match(r".*?\| Words: (\d+) \|",metastr)
        if m:
            self.story.setMetadata('numWords', m.group(1))

        # Summary: ....
        m = re.match(r".*?Summary: (.*)$",metastr)
        if m:
            self.setDescription(url, m.group(1))
            #self.story.setMetadata('description', m.group(1))

        # completed
        m = re.match(r".*?Status: Completed.*?",metastr)
        if m:
            self.story.setMetadata('status','Completed')
        else:
            self.story.setMetadata('status','In-Progress')

        return

    def getChapterText(self, url):
        """Fetch one chapter page and return its text as cleaned-up HTML.

        Handles two page generations: newer pages with <div align=left>
        paragraphs, and older unparsable pages handled by string surgery.
        Raises exceptions.FailedToDownload when the chapter anchor is
        missing.
        """
        logger.debug('Getting chapter text from: %s' % url)
        data=self._fetchUrl(url)

        soup = bs.BeautifulStoneSoup(data,
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        # Chapter content starts at <a name="fic_c">.
        anchor = soup.find('a',{'name':'fic_c'})
        if None == anchor:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        ## find divs with align=left, those are paragraphs in newer stories.
        divlist = anchor.findAllNext('div',{'align':'left'})
        if divlist:
            for div in divlist:
                div.name='p' # convert to <p> mediaminer uses div with
                             # a margin for paragraphs.
                anchor.append(div) # cheat!  stuff all the content
                                   # divs into anchor just as a
                                   # holder.
                del div['style']
                del div['align']
            anchor.name='div'
            return self.utf8FromSoup(url,anchor)
        else:
            logger.debug('Using kludgey text find for older mediaminer story.')
            ## Some older mediaminer stories are unparsable with BeautifulSoup.
            ## Really nasty formatting.  Sooo... Cheat!  Parse it ourselves a bit first.
            ## Story stuff falls between:
            data = "<div id='HERE'>" + data[data.find('<a name="fic_c">'):] +"</div>"
            soup = bs.BeautifulStoneSoup(data,
                                         selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
            # Strip the site chrome (header cell, chapter nav, footer,
            # bordered tables) that got swept into the slice above.
            for tag in soup.findAll('td',{'class':'ffh'}) + \
                    soup.findAll('div',{'class':'acl'}) + \
                    soup.findAll('div',{'class':'footer smtxt'}) + \
                    soup.findAll('table',{'class':'tbbrdr'}):
                tag.extract() # remove tag from soup.
            return self.utf8FromSoup(url,soup)
  196. def getClass():
  197. return MediaMinerOrgSiteAdapter