
/feedparser.py

https://github.com/dpw/pnntprss
Python | 4013 lines (3681 code, 150 blank, 182 comment)

Large files are truncated; the full file is available in the repository.

  1. """Universal feed parser
  2. Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
  3. Visit https://code.google.com/p/feedparser/ for the latest version
  4. Visit http://packages.python.org/feedparser/ for the latest documentation
  5. Required: Python 2.4 or later
  6. Recommended: iconv_codec <http://cjkpython.i18n.org/>
  7. """
  8. __version__ = "5.1.3"
__license__ = """
Copyright (c) 2010-2012 Kurt McKee <contactme@kurtmckee.org>
Copyright (c) 2002-2008 Mark Pilgrim
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>",
                    "Sam Ruby <http://intertwingly.net/>",
                    "Ade Oshineye <http://blog.oshineye.com/>",
                    "Martin Pool <http://sourcefrog.net/>",
                    "Kurt McKee <http://kurtmckee.org/>",
                    "Bernd Schlapsi <https://github.com/brot>",]
  42. # HTTP "User-Agent" header to send to servers when downloading feeds.
  43. # If you are embedding feedparser in a larger application, you should
  44. # change this to your application name and URL.
  45. USER_AGENT = "UniversalFeedParser/%s +https://code.google.com/p/feedparser/" % __version__
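
# For example, an embedding application would typically override this at
# import time (the application name and URL below are hypothetical):
#
#     import feedparser
#     feedparser.USER_AGENT = "MyAggregator/1.0 +http://example.com/aggregator"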
  46. # HTTP "Accept" header to send to servers when downloading feeds. If you don't
  47. # want to send an Accept header, set this to None.
  48. ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
  49. # List of preferred XML parsers, by SAX driver name. These will be tried first,
  50. # but if they're not installed, Python will keep searching through its own list
  51. # of pre-installed parsers until it finds one that supports everything we need.
  52. PREFERRED_XML_PARSERS = ["drv_libxml2"]
  53. # If you want feedparser to automatically run HTML markup through HTML Tidy, set
  54. # this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
  55. # or utidylib <http://utidylib.berlios.de/>.
  56. TIDY_MARKUP = 0
  57. # List of Python interfaces for HTML Tidy, in order of preference. Only useful
  58. # if TIDY_MARKUP = 1
  59. PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
  60. # If you want feedparser to automatically resolve all relative URIs, set this
  61. # to 1.
  62. RESOLVE_RELATIVE_URIS = 1
  63. # If you want feedparser to automatically sanitize all potentially unsafe
  64. # HTML content, set this to 1.
  65. SANITIZE_HTML = 1
  66. # If you want feedparser to automatically parse microformat content embedded
  67. # in entry contents, set this to 1
  68. PARSE_MICROFORMATS = 1
# ---------- Python 3 modules (make it work if possible) ----------
try:
    import rfc822
except ImportError:
    from email import _parseaddr as rfc822

try:
    # Python 3.1 introduces bytes.maketrans and simultaneously
    # deprecates string.maketrans; use bytes.maketrans if possible
    _maketrans = bytes.maketrans
except (NameError, AttributeError):
    import string
    _maketrans = string.maketrans

# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
except ImportError:
    base64 = binascii = None
else:
    # Python 3.1 deprecates decodestring in favor of decodebytes
    _base64decode = getattr(base64, 'decodebytes', base64.decodestring)

# _s2bytes: convert a UTF-8 str to bytes if the interpreter is Python 3
# _l2bytes: convert a list of ints to bytes if the interpreter is Python 3
try:
    if bytes is str:
        # In Python 2.5 and below, bytes doesn't exist (NameError)
        # In Python 2.6 and above, bytes and str are the same type
        raise NameError
except NameError:
    # Python 2
    def _s2bytes(s):
        return s
    def _l2bytes(l):
        return ''.join(map(chr, l))
else:
    # Python 3
    def _s2bytes(s):
        return bytes(s, 'utf8')
    def _l2bytes(l):
        return bytes(l)
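
# A quick sketch of the intended behavior on either interpreter:
#     _s2bytes('abc')         # -> 'abc' on Python 2, b'abc' on Python 3
#     _l2bytes([60, 63, 120]) # -> '<?x' on Python 2, b'<?x' on Python 3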

# If you want feedparser to allow all URL schemes, set this to ()
# List culled from Python's urlparse documentation at:
#   http://docs.python.org/library/urlparse.html
# as well as from "URI scheme" at Wikipedia:
#   https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added!
ACCEPTABLE_URI_SCHEMES = (
    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
    'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
    'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
    'wais',
    # Additional common-but-unofficial schemes
    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
    'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
)
#ACCEPTABLE_URI_SCHEMES = ()

# ---------- required modules (should come with any Python distribution) ----------
import cgi
import codecs
import copy
import datetime
import re
import struct
import time
import types
import urllib
import urllib2
import urlparse
import warnings

from htmlentitydefs import name2codepoint, codepoint2name, entitydefs

try:
    from io import BytesIO as _StringIO
except ImportError:
    try:
        from cStringIO import StringIO as _StringIO
    except ImportError:
        from StringIO import StringIO as _StringIO

# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except ImportError:
    gzip = None
try:
    import zlib
except ImportError:
    zlib = None

# If a real XML parser is available, feedparser will attempt to use it. feedparser has
# been tested with the built-in SAX parser and libxml2. On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    from xml.sax.saxutils import escape as _xmlescape
except ImportError:
    _XML_AVAILABLE = 0
    def _xmlescape(data, entities={}):
        # entities maps extra characters to their escaped forms
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        for char, entity in entities.items():
            data = data.replace(char, entity)
        return data
else:
    try:
        xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    except xml.sax.SAXReaderNotAvailable:
        _XML_AVAILABLE = 0
    else:
        _XML_AVAILABLE = 1
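
# Either implementation escapes the three XML metacharacters, e.g.:
#     _xmlescape('a < b & c')  # -> 'a &lt; b &amp; c'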

# sgmllib is not available by default in Python 3; if the end user doesn't have
# it available then we'll lose illformed XML parsing, content sanitizing, and
# microformat support (at least while feedparser depends on BeautifulSoup).
try:
    import sgmllib
except ImportError:
    # This is probably Python 3, which doesn't include sgmllib anymore
    _SGML_AVAILABLE = 0

    # Mock sgmllib enough to allow subclassing later on
    class sgmllib(object):
        class SGMLParser(object):
            def goahead(self, i):
                pass
            def parse_starttag(self, i):
                pass
else:
    _SGML_AVAILABLE = 1

    # sgmllib defines a number of module-level regular expressions that are
    # insufficient for the XML parsing feedparser needs. Rather than modify
    # the variables directly in sgmllib, they're defined here using the same
    # names, and the compiled code objects of several sgmllib.SGMLParser
    # methods are copied into _BaseHTMLProcessor so that they execute in
    # feedparser's scope instead of sgmllib's scope.
    charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
    tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
    attrfind = re.compile(
        r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*'
        r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?'
    )

    # Unfortunately, these must be copied over to prevent NameError exceptions
    entityref = sgmllib.entityref
    incomplete = sgmllib.incomplete
    interesting = sgmllib.interesting
    shorttag = sgmllib.shorttag
    shorttagopen = sgmllib.shorttagopen
    starttagopen = sgmllib.starttagopen

    class _EndBracketRegEx:
        def __init__(self):
            # Overriding the built-in sgmllib.endbracket regex allows the
            # parser to find angle brackets embedded in element attributes.
            self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
        def search(self, target, index=0):
            match = self.endbracket.match(target, index)
            if match is not None:
                # Returning a new object in the calling thread's context
                # resolves a thread-safety issue.
                return EndBracketMatch(match)
            return None

    class EndBracketMatch:
        def __init__(self, match):
            self.match = match
        def start(self, n):
            return self.match.end(n)

    endbracket = _EndBracketRegEx()

# iconv_codec provides support for more character encodings.
# It's available from http://cjkpython.i18n.org/
try:
    import iconv_codec
except ImportError:
    pass

# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
except ImportError:
    chardet = None

# BeautifulSoup is used to extract microformat content from HTML
# feedparser is tested using BeautifulSoup 3.2.0
# http://www.crummy.com/software/BeautifulSoup/
try:
    import BeautifulSoup
except ImportError:
    BeautifulSoup = None
    PARSE_MICROFORMATS = False

# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass

SUPPORTED_VERSIONS = {'': u'unknown',
                      'rss090': u'RSS 0.90',
                      'rss091n': u'RSS 0.91 (Netscape)',
                      'rss091u': u'RSS 0.91 (Userland)',
                      'rss092': u'RSS 0.92',
                      'rss093': u'RSS 0.93',
                      'rss094': u'RSS 0.94',
                      'rss20': u'RSS 2.0',
                      'rss10': u'RSS 1.0',
                      'rss': u'RSS (unknown version)',
                      'atom01': u'Atom 0.1',
                      'atom02': u'Atom 0.2',
                      'atom03': u'Atom 0.3',
                      'atom10': u'Atom 1.0',
                      'atom': u'Atom (unknown version)',
                      'cdf': u'CDF',
                      }

class FeedParserDict(dict):
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['summary', 'subtitle'],
              'description_detail': ['summary_detail', 'subtitle_detail'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}

    def __getitem__(self, key):
        if key == 'category':
            try:
                return dict.__getitem__(self, 'tags')[0]['term']
            except IndexError:
                raise KeyError, "object doesn't have key 'category'"
        elif key == 'enclosures':
            norel = lambda link: FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel'])
            return [norel(link) for link in dict.__getitem__(self, 'links') if link['rel'] == u'enclosure']
        elif key == 'license':
            for link in dict.__getitem__(self, 'links'):
                if link['rel'] == u'license' and 'href' in link:
                    return link['href']
        elif key == 'updated':
            # Temporarily help developers out by keeping the old
            # broken behavior that was reported in issue 310.
            # This fix was proposed in issue 328.
            if not dict.__contains__(self, 'updated') and \
                    dict.__contains__(self, 'published'):
                warnings.warn("To avoid breaking existing software while "
                    "fixing issue 310, a temporary mapping has been created "
                    "from `updated` to `published` if `updated` doesn't "
                    "exist. This fallback will be removed in a future version "
                    "of feedparser.", DeprecationWarning)
                return dict.__getitem__(self, 'published')
            return dict.__getitem__(self, 'updated')
        elif key == 'updated_parsed':
            if not dict.__contains__(self, 'updated_parsed') and \
                    dict.__contains__(self, 'published_parsed'):
                warnings.warn("To avoid breaking existing software while "
                    "fixing issue 310, a temporary mapping has been created "
                    "from `updated_parsed` to `published_parsed` if "
                    "`updated_parsed` doesn't exist. This fallback will be "
                    "removed in a future version of feedparser.",
                    DeprecationWarning)
                return dict.__getitem__(self, 'published_parsed')
            return dict.__getitem__(self, 'updated_parsed')
        else:
            realkey = self.keymap.get(key, key)
            if isinstance(realkey, list):
                for k in realkey:
                    if dict.__contains__(self, k):
                        return dict.__getitem__(self, k)
            elif dict.__contains__(self, realkey):
                return dict.__getitem__(self, realkey)
        return dict.__getitem__(self, key)

    def __contains__(self, key):
        if key in ('updated', 'updated_parsed'):
            # Temporarily help developers out by keeping the old
            # broken behavior that was reported in issue 310.
            # This fix was proposed in issue 328.
            return dict.__contains__(self, key)
        try:
            self.__getitem__(key)
        except KeyError:
            return False
        else:
            return True

    has_key = __contains__

    def get(self, key, default=None):
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def __setitem__(self, key, value):
        key = self.keymap.get(key, key)
        if isinstance(key, list):
            key = key[0]
        return dict.__setitem__(self, key, value)

    def setdefault(self, key, value):
        if key not in self:
            self[key] = value
            return value
        return self[key]

    def __getattr__(self, key):
        # __getattribute__() is called first; this will be called
        # only if an attribute was not already found
        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError, "object has no attribute '%s'" % key

    def __hash__(self):
        return id(self)
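
# A minimal sketch of the keymap aliasing above: legacy keys resolve to their
# modern equivalents, and attribute access falls through to __getitem__:
#     d = FeedParserDict()
#     d['summary'] = u'hello'
#     d['description']  # -> u'hello' ('description' maps to 'summary')
#     d.description     # -> u'hello' (__getattr__ falls back to keys)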

_cp1252 = {
    128: unichr(8364), # euro sign
    130: unichr(8218), # single low-9 quotation mark
    131: unichr( 402), # latin small letter f with hook
    132: unichr(8222), # double low-9 quotation mark
    133: unichr(8230), # horizontal ellipsis
    134: unichr(8224), # dagger
    135: unichr(8225), # double dagger
    136: unichr( 710), # modifier letter circumflex accent
    137: unichr(8240), # per mille sign
    138: unichr( 352), # latin capital letter s with caron
    139: unichr(8249), # single left-pointing angle quotation mark
    140: unichr( 338), # latin capital ligature oe
    142: unichr( 381), # latin capital letter z with caron
    145: unichr(8216), # left single quotation mark
    146: unichr(8217), # right single quotation mark
    147: unichr(8220), # left double quotation mark
    148: unichr(8221), # right double quotation mark
    149: unichr(8226), # bullet
    150: unichr(8211), # en dash
    151: unichr(8212), # em dash
    152: unichr( 732), # small tilde
    153: unichr(8482), # trade mark sign
    154: unichr( 353), # latin small letter s with caron
    155: unichr(8250), # single right-pointing angle quotation mark
    156: unichr( 339), # latin small ligature oe
    158: unichr( 382), # latin small letter z with caron
    159: unichr( 376), # latin capital letter y with diaeresis
}
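
# _cp1252 is used with unicode.translate() to map stray Windows-1252 code
# points to their proper Unicode equivalents, e.g.:
#     u'\x93smart quotes\x94'.translate(_cp1252)  # -> u'\u201csmart quotes\u201d'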

_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    #try:
    if not isinstance(uri, unicode):
        uri = uri.decode('utf-8', 'ignore')
    uri = urlparse.urljoin(base, uri)
    if not isinstance(uri, unicode):
        return uri.decode('utf-8', 'ignore')
    return uri
    #except:
    #    uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
    #    return urlparse.urljoin(base, uri)
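
# For example:
#     _urljoin(u'http://example.com/feeds/rss.xml', u'../images/logo.png')
#     # -> u'http://example.com/images/logo.png'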

class _FeedParserMixin:
    namespaces = {
        '': '',
        'http://backend.userland.com/rss': '',
        'http://blogs.law.harvard.edu/tech/rss': '',
        'http://purl.org/rss/1.0/': '',
        'http://my.netscape.com/rdf/simple/0.9/': '',
        'http://example.com/newformat#': '',
        'http://example.com/necho': '',
        'http://purl.org/echo/': '',
        'uri/of/echo/namespace#': '',
        'http://purl.org/pie/': '',
        'http://purl.org/atom/ns#': '',
        'http://www.w3.org/2005/Atom': '',
        'http://purl.org/rss/1.0/modules/rss091#': '',
        'http://webns.net/mvcb/': 'admin',
        'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
        'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
        'http://media.tangent.org/rss/1.0/': 'audio',
        'http://backend.userland.com/blogChannelModule': 'blogChannel',
        'http://web.resource.org/cc/': 'cc',
        'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
        'http://purl.org/rss/1.0/modules/company': 'co',
        'http://purl.org/rss/1.0/modules/content/': 'content',
        'http://my.theinfo.org/changed/1.0/rss/': 'cp',
        'http://purl.org/dc/elements/1.1/': 'dc',
        'http://purl.org/dc/terms/': 'dcterms',
        'http://purl.org/rss/1.0/modules/email/': 'email',
        'http://purl.org/rss/1.0/modules/event/': 'ev',
        'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
        'http://freshmeat.net/rss/fm/': 'fm',
        'http://xmlns.com/foaf/0.1/': 'foaf',
        'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
        'http://postneo.com/icbm/': 'icbm',
        'http://purl.org/rss/1.0/modules/image/': 'image',
        'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
        'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
        'http://purl.org/rss/1.0/modules/link/': 'l',
        'http://search.yahoo.com/mrss': 'media',
        # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
        'http://search.yahoo.com/mrss/': 'media',
        'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
        'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
        'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
        'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
        'http://purl.org/rss/1.0/modules/reference/': 'ref',
        'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
        'http://purl.org/rss/1.0/modules/search/': 'search',
        'http://purl.org/rss/1.0/modules/slash/': 'slash',
        'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
        'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
        'http://hacks.benhammersley.com/rss/streaming/': 'str',
        'http://purl.org/rss/1.0/modules/subscription/': 'sub',
        'http://purl.org/rss/1.0/modules/syndication/': 'sy',
        'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
        'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
        'http://purl.org/rss/1.0/modules/threading/': 'thr',
        'http://purl.org/rss/1.0/modules/textinput/': 'ti',
        'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
        'http://wellformedweb.org/commentAPI/': 'wfw',
        'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
        'http://www.w3.org/1999/xhtml': 'xhtml',
        'http://www.w3.org/1999/xlink': 'xlink',
        'http://www.w3.org/XML/1998/namespace': 'xml',
    }
    _matchnamespaces = {}

    can_be_relative_uri = set(['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo'])
    can_contain_relative_uris = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
    can_contain_dangerous_markup = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])

    html_types = [u'text/html', u'application/xhtml+xml']

    def __init__(self, baseuri=None, baselang=None, encoding=u'utf-8'):
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = u'' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or u''
        self.lang = baselang or None
        self.svgOK = 0
        self.title_depth = -1
        self.depth = 0
        if baselang:
            self.feeddata['language'] = baselang.replace('_','-')

        # A map of the following form:
        #     {
        #         object_that_value_is_set_on: {
        #             property_name: depth_of_node_property_was_extracted_from,
        #             other_property: depth_of_node_property_was_extracted_from,
        #         },
        #     }
        self.property_depth_map = {}
    def _normalize_attributes(self, kv):
        k = kv[0].lower()
        v = k in ('rel', 'type') and kv[1].lower() or kv[1]
        # the sgml parser doesn't handle entities in attributes, nor
        # does it pass the attribute values through as unicode, while
        # strict xml parsers do -- account for this difference
        if isinstance(self, _LooseFeedParser):
            v = v.replace('&amp;', '&')
            if not isinstance(v, unicode):
                v = v.decode('utf-8')
        return (k, v)
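
    # For example (per the normalization above):
    #     ('HREF', u'HTTP://A/') -> ('href', u'HTTP://A/')
    #     ('Rel', u'ALTERNATE')  -> ('rel', u'alternate'), since only 'rel'
    #     and 'type' values are lowercased along with the attribute name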
    def unknown_starttag(self, tag, attrs):
        # increment depth counter
        self.depth += 1

        # normalize attrs
        attrs = map(self._normalize_attributes, attrs)

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        if not isinstance(baseuri, unicode):
            baseuri = baseuri.decode(self.encoding, 'ignore')
        # ensure that self.baseuri is always an absolute URI that
        # uses a whitelisted URI scheme (e.g. not `javascript:`)
        if self.baseuri:
            self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
        else:
            self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang.replace('_','-')
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            if tag in ('xhtml:div', 'div'):
                return # typepad does this 10/2007
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = u'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
            if tag.find(':') <> -1:
                prefix, tag = tag.split(':', 1)
                namespace = self.namespacesInUse.get(prefix, '')
                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                    attrs.append(('xmlns',namespace))
                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
                    attrs.append(('xmlns',namespace))
            if tag == 'svg':
                self.svgOK += 1
            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)

        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
            unknown_tag = prefix + suffix
            if len(attrsD) == 0:
                # No attributes so merge it into the enclosing dictionary
                return self.push(unknown_tag, 1)
            else:
                # Has attributes so create it in its own dictionary
                context = self._getContext()
                context[unknown_tag] = attrsD
    def unknown_endtag(self, tag):
        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'
        if suffix == 'svg' and self.svgOK:
            self.svgOK -= 1

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            if self.svgOK:
                raise AttributeError()
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            # element declared itself as escaped markup, but it isn't really
            if tag in ('xhtml:div', 'div'):
                return # typepad does this 10/2007
            self.contentparams['type'] = u'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]

        self.depth -= 1
    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack:
            return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack:
            return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities:
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
        else:
            try:
                name2codepoint[ref]
            except KeyError:
                text = '&%s;' % ref
            else:
                text = unichr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack:
            return
        if escape and self.contentparams.get('type') == u'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # CDATA block began but didn't finish
                k = len(self.rawdata)
                return k
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            if k >= 0:
                return k+1
            else:
                # We have an incomplete CDATA block.
                return k
    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text' or contentType == 'plain':
            contentType = u'text/plain'
        elif contentType == 'html':
            contentType = u'text/html'
        elif contentType == 'xhtml':
            contentType = u'application/xhtml+xml'
        return contentType
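
    # For example:
    #     mapContentType('HTML')   # -> u'text/html'
    #     mapContentType('xhtml')  # -> u'application/xhtml+xml'
    # Unrecognized types are passed through lowercased.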
    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if not self.version:
            if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'):
                self.version = u'rss090'
            elif loweruri == 'http://purl.org/rss/1.0/':
                self.version = u'rss10'
            elif loweruri == 'http://www.w3.org/2005/atom':
                self.version = u'atom10'
        if loweruri.find(u'backend.userland.com/rss') <> -1:
            # match any backend.userland.com namespace
            uri = u'http://backend.userland.com/rss'
            loweruri = uri
        if loweruri in self._matchnamespaces:
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri or u'', uri)

    def decodeEntities(self, element, data):
        return data

    def strattrs(self, attrs):
        return ''.join([' %s="%s"' % (t[0], _xmlescape(t[1], {'"': '&quot;'})) for t in attrs])
    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])

    def pop(self, element, stripWhitespace=1):
        if not self.elementstack:
            return
        if self.elementstack[-1][0] != element:
            return

        element, expectingText, pieces = self.elementstack.pop()

        if self.version == u'atom10' and self.contentparams.get('type', u'text') == u'application/xhtml+xml':
            # remove enclosing child element, but only if it is a <div> and
            # only if all the remaining content is nested underneath it.
            # This means that the divs would be retained in the following:
            #     <div>foo</div><div>bar</div>
            while pieces and len(pieces)>1 and not pieces[-1].strip():
                del pieces[-1]
            while pieces and len(pieces)>1 and not pieces[0].strip():
                del pieces[0]
            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
                depth = 0
                for piece in pieces[:-1]:
                    if piece.startswith('</'):
                        depth -= 1
                        if depth == 0:
                            break
                    elif piece.startswith('<') and not piece.endswith('/>'):
                        depth += 1
                else:
                    pieces = pieces[1:-1]

        # Ensure each piece is a str for Python 3
        for (i, v) in enumerate(pieces):
            if not isinstance(v, unicode):
                pieces[i] = v.decode('utf-8')

        output = u''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText:
            return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = _base64decode(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass
            except TypeError:
                # In Python 3, base64 takes and outputs bytes, not str
                # This may not be the most correct way to accomplish this
                output = _base64decode(output.encode('utf-8')).decode('utf-8')

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # some feed formats require consumers to guess
        # whether the content is html or plain text
        if not self.version.startswith(u'atom') and self.contentparams.get('type') == u'text/plain':
            if self.lookslikehtml(output):
                self.contentparams['type'] = u'text/html'

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        is_htmlish = self.mapContentType(self.contentparams.get('type', u'text/html')) in self.html_types
        # resolve relative URIs within embedded markup
        if is_htmlish and RESOLVE_RELATIVE_URIS:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html'))

        # parse microformats
        # (must do this before sanitizing because some microformats
        # rely on elements that we sanitize)
        if PARSE_MICROFORMATS and is_htmlish and element in ['content', 'description', 'summary']:
            mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
            if mfresults:
                for tag in mfresults.get('tags', []):
                    self._addTag(tag['term'], tag['scheme'], tag['label'])
                for enclosure in mfresults.get('enclosures', []):
                    self._start_enclosure(enclosure)
                for xfn in mfresults.get('xfn', []):
                    self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
                vcard = mfresults.get('vcard')
                if vcard:
                    self._getContext()['vcard'] = vcard

        # sanitize embedded markup
        if is_htmlish and SANITIZE_HTML:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', u'text/html'))

        if self.encoding and not isinstance(output, unicode):
            output = output.decode(self.encoding, 'ignore')

        # address common error where people take data that is already
        # utf-8, presume that it is iso-8859-1, and re-encode it.
        if self.encoding in (u'utf-8', u'utf-8_INVALID_PYTHON_3') and isinstance(output, unicode):
            try:
                output = output.encode('iso-8859-1').decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
                pass

        # map win-1252 extensions to the proper code points
        if isinstance(output, unicode):
            output = output.translate(_cp1252)

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        if element == 'title' and -1 < self.title_depth <= self.depth:
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                if not self.inimage:
                    # query variables in urls in link elements are improperly
                    # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
                    # unhandled character references. fix this special case.
                    output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                    self.entries[-1][element] = output
                    if output:
                        self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element)
                if old_value_depth is None or self.depth <= old_value_depth:
                    self.property_depth_map[self.entries[-1]][element] = self.depth
                    self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                # fix query variables; see above for the explanation
                output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                context[element] = output
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output
    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        if self.lang:
            self.lang = self.lang.replace('_','-')
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)

    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value

    # a number of elements in a number of RSS variants are nominally plain
    # text, but this is routinely ignored. This is an attempt to detect
    # the most common cases. As false positives often result in silent
    # data loss, this function errs on the conservative side.
    @staticmethod
    def lookslikehtml(s):
        # must have a close tag or an entity reference to qualify
        if not (re.search(r'</(\w+)>', s) or re.search("&#?\w+;", s)):
            return

        # all tags must be in a restricted subset of valid HTML tags
        if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
                  re.findall(r'</?(\w+)', s)):
            return

        # all entities must have been defined as valid HTML entities
        if filter(lambda e: e not in entitydefs.keys(), re.findall(r'&(\w+);', s)):
            return

        return 1
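
    # For example:
    #     lookslikehtml(u'<p>one &amp; two</p>')  # -> 1
    #     lookslikehtml(u'one & two')             # -> None (no close tag or
    #                                             #    entity reference)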
    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos <> -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))

    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith(u'text/'):
            return 0
        if self.contentparams['type'].endswith(u'+xml'):
            return 0
        if self.contentparams['type'].endswith(u'/xml'):
            return 0
        return 1

    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try:
                del attrsD['url']
            except KeyError:
                pass
            try:
                del attrsD['uri']
            except KeyError:
                pass
            attrsD['href'] = href
        return attrsD
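
    # For example:
    #     _itsAnHrefDamnIt({'url': u'http://a/'})  # -> {'href': u'http://a/'}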
    def _save(self, key, value, overwrite=False):
        context = self._getContext()
        if overwrite:
            context[key] = value
        else:
            context.setdefault(key, value)

    def _start_rss(self, attrsD):
        versionmap = {'0.91': u'rss091u',
                      '0.92': u'rss092',
                      '0.93': u'rss093',
                      '0.94': u'rss094'}
        # If we're here then this is an RSS feed.
        # If we don't have a version or have a version that starts with something
        # other than RSS then there's been a mistake. Correct it.
        if not self.version or not self.version.startswith(u'rss'):
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = u'rss20'
            else:
                self.version = u'rss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)

    def _cdf_common(self, attrsD):
        if 'lastmod' in attrsD:
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if 'href' in attrsD:
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': u'atom01',
                      '0.2': u'atom02',
                      '0.3': u'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = u'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel

    def _start_image(self, attrsD):
        context = self._getContext()
        if not self.inentry:
            context.setdefault('image', FeedParserDict())
        self.inimage = 1
        self.title_depth = -1
        self.push('image', 0)

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
        self.intextinput = 1
        self.title_depth = -1
        self.push('textinput', 0)
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput

    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
        # Append a new FeedParserDict when expecting an author
        context = self._getContext()
        context.setdefault('authors', [])
        context['authors'].append(FeedParserDict())
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')

    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        self._end_name()
        self.incontributor = 0

    def _start_name(self, attrsD):
        self.push('name', 0)
    _start_itunes_n

(file truncated here)