PageRenderTime 112ms CodeModel.GetById 40ms RepoModel.GetById 0ms app.codeStats 0ms

/fstmerge/examples/eXe/rev3426-3500/left-trunk-3500/exe/engine/wikipediaidevice.py

https://github.com/RoDaniel/featurehouse
Python | 211 lines | 206 code | 0 blank | 5 comment | 6 complexity | b10bf27c7f00ec32e1b0c7b370250480 MD5 | raw file
  1. """
  2. A Wikipedia Idevice is one built from a Wikipedia article.
  3. """
  4. import re
  5. from exe.engine.beautifulsoup import BeautifulSoup
  6. from exe.engine.idevice import Idevice
  7. from exe.engine.field import TextAreaField
  8. from exe.engine.translate import lateTranslate
  9. from exe.engine.path import Path, TempDirPath
  10. from exe.engine.resource import Resource
  11. import urllib
class UrlOpener(urllib.FancyURLopener):
    """
    Set a distinctive User-Agent, so Wikipedia.org knows we're not spammers
    """
    # urllib openers report their `version` attribute as the User-Agent
    # header, so overriding it here is all this subclass needs to do.
    version = "eXe/exe@exelearning.org"
# Install the custom opener module-wide: urllib.urlopen() and
# urllib.urlretrieve() use urllib._urlopener when it has been set.
urllib._urlopener = UrlOpener()
import logging
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
  20. class WikipediaIdevice(Idevice):
  21. """
  22. A Wikipedia Idevice is one built from a Wikipedia article.
  23. """
  24. persistenceVersion = 8
  25. def __init__(self, defaultSite):
  26. Idevice.__init__(self, x_(u"Wiki Article"),
  27. x_(u"University of Auckland"),
  28. x_(u"""<p>The Wikipedia iDevice allows you to locate
  29. existing content from within Wikipedia and download this content into your eXe
  30. resource. The Wikipedia Article iDevice takes a snapshot copy of the article
  31. content. Changes in Wikipedia will not automatically update individual snapshot
  32. copies in eXe, a fresh copy of the article will need to be taken. Likewise,
  33. changes made in eXe will not be updated in Wikipedia. </p> <p>Wikipedia content
  34. is covered by the GNU free documentation license.</p>"""),
  35. u"", u"")
  36. self.emphasis = Idevice.NoEmphasis
  37. self.articleName = u""
  38. self.article = TextAreaField(x_(u"Article"))
  39. self.article.idevice = self
  40. self.images = {}
  41. self.site = defaultSite
  42. self.icon = u"inter"
  43. self.systemResources += ["fdl.html"]
  44. self._langInstruc = x_(u"""Select the appropriate language version
  45. of Wikipedia to search and enter search term.""")
  46. self._searchInstruc = x_("""Enter a phrase or term you wish to search
  47. within Wikipedia.""")
  48. self.ownUrl = ""
  49. langInstruc = lateTranslate('langInstruc')
  50. searchInstruc = lateTranslate('searchInstruc')
  51. def loadArticle(self, name):
  52. """
  53. Load the article from Wikipedia
  54. """
  55. self.articleName = name
  56. url = ""
  57. name = urllib.quote(name.replace(" ", "_").encode('utf-8'))
  58. try:
  59. url = (self.site or self.ownUrl)
  60. if not url.endswith('/') and name <> '': url += '/'
  61. if '://' not in url: url = 'http://' + url
  62. url += name
  63. net = urllib.urlopen(url)
  64. page = net.read()
  65. net.close()
  66. except IOError, error:
  67. log.warning(unicode(error))
  68. self.article.content = _(u"Unable to download from %s <br/>Please check the spelling and connection and try again.") % url
  69. self.article.content_w_resourcePaths = self.article.content
  70. self.article.content_wo_resourcePaths = self.article.content
  71. return
  72. page = unicode(page, "utf8")
  73. page = page.replace(u'&#160;', u'&nbsp;')
  74. soup = BeautifulSoup(page, False)
  75. content = soup.first('div', {'id': "content"})
  76. if content:
  77. infoboxes = content.findAll('div',
  78. {'class' : 'infobox sisterproject'})
  79. [infobox.extract() for infobox in infoboxes]
  80. catboxes = content.findAll('div', {'id' : 'catlinks'})
  81. [catbox.extract() for catbox in catboxes]
  82. else:
  83. content = soup.first('body')
  84. if not content:
  85. log.error("no content")
  86. self.article.content = _(u"Unable to download from %s <br/>Please check the spelling and connection and try again.") % url
  87. self.article.content_w_resourcePaths = self.article.content
  88. self.article.content_wo_resourcePaths = self.article.content
  89. return
  90. while self.userResources:
  91. self.userResources[0].delete()
  92. self.images = {}
  93. bits = url.split('/')
  94. netloc = '%s//%s' % (bits[0], bits[2])
  95. path = '/'.join(bits[3:-1])
  96. tmpDir = TempDirPath()
  97. for imageTag in content.fetch('img'):
  98. imageSrc = unicode(imageTag['src'])
  99. imageName = imageSrc.split('/')[-1]
  100. if imageName not in self.images:
  101. if not imageSrc.startswith("http://"):
  102. if imageSrc.startswith("/"):
  103. imageSrc = netloc + imageSrc
  104. else:
  105. imageSrc = '%s/%s/%s' % (netloc, path, imageSrc)
  106. urllib.urlretrieve(imageSrc, tmpDir/imageName)
  107. new_resource = Resource(self, tmpDir/imageName)
  108. if new_resource._storageName != imageName:
  109. imageName = new_resource._storageName
  110. self.images[imageName] = True
  111. imageTag['src'] = (u"/" + self.parentNode.package.name + u"/resources/" + imageName)
  112. self.article.content = self.reformatArticle(netloc, unicode(content))
  113. self.article.content_w_resourcePaths = self.article.content
  114. self.article.content_wo_resourcePaths = self.article.content
  115. def reformatArticle(self, netloc, content):
  116. """
  117. Changes links, etc
  118. """
  119. content = re.sub(r'href="/', r'href="%s/' % netloc, content)
  120. content = re.sub(r'<(span|div)\s+(id|class)="(editsection|jump-to-nav)".*?</\1>', '', content)
  121. content = content.replace("\n", " ")
  122. content = re.sub(r'<script.*?</script>', '', content)
  123. return content
  124. def getResourcesField(self, this_resource):
  125. """
  126. implement the specific resource finding mechanism for this iDevice:
  127. """
  128. if hasattr(self, 'article') and hasattr(self.article, 'images'):
  129. for this_image in self.article.images:
  130. if hasattr(this_image, '_imageResource') \
  131. and this_resource == this_image._imageResource:
  132. return self.article
  133. for this_image in self.userResources:
  134. if this_resource == this_image:
  135. return self.article
  136. return None
  137. def getRichTextFields(self):
  138. """
  139. Like getResourcesField(), a general helper to allow nodes to search
  140. through all of their fields without having to know the specifics of each
  141. iDevice type.
  142. """
  143. fields_list = []
  144. if hasattr(self, 'article'):
  145. fields_list.append(self.article)
  146. return fields_list
  147. def __getstate__(self):
  148. """
  149. Re-write the img URLs just in case the class name has changed
  150. """
  151. log.debug("in __getstate__ " + repr(self.parentNode))
  152. if self.parentNode:
  153. self.article.content = re.sub(r'/[^/]*?/resources/',
  154. u"/" + self.parentNode.package.name +
  155. u"/resources/",
  156. self.article.content)
  157. return Idevice.__getstate__(self)
  158. def delete(self):
  159. """
  160. Clear out any old images when this iDevice is deleted
  161. """
  162. self.images = {}
  163. Idevice.delete(self)
  164. def upgradeToVersion1(self):
  165. """
  166. Called to upgrade from 0.6 release
  167. """
  168. self.site = _('http://en.wikipedia.org/')
  169. def upgradeToVersion2(self):
  170. """
  171. Upgrades v0.6 to v0.7.
  172. """
  173. self.lastIdevice = False
  174. def upgradeToVersion3(self):
  175. """
  176. Upgrades exe to v0.10
  177. """
  178. self._upgradeIdeviceToVersion1()
  179. self._site = self.__dict__['site']
  180. def upgradeToVersion4(self):
  181. """
  182. Upgrades exe to v0.11... what was I thinking?
  183. """
  184. self.site = self.__dict__['_site']
  185. def upgradeToVersion5(self):
  186. """
  187. Upgrades exe to v0.11... forgot to change the icon
  188. """
  189. self.icon = u"inter"
  190. def upgradeToVersion6(self):
  191. """
  192. Upgrades to v0.12
  193. """
  194. self._upgradeIdeviceToVersion2()
  195. self.systemResources += ["fdl.html"]
  196. if self.images and self.parentNode:
  197. for image in self.images:
  198. imageResource = Resource(self, Path(image))
  199. def upgradeToVersion7(self):
  200. """
  201. Upgrades to v0.12
  202. """
  203. self._langInstruc = x_(u"""Select the appropriate language version
  204. of Wikipedia to search and enter search term.""")
  205. self._searchInstruc = x_("""Enter a phrase or term you wish to search
  206. within Wikipedia.""")
  207. def upgradeToVersion8(self):
  208. """
  209. Upgrades to v0.19
  210. """
  211. self.ownUrl = ""