PageRenderTime 120ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/web/lib/bs4/dammit.py

https://gitlab.com/adam.lukaitis/muzei
Python | 829 lines | 753 code | 13 blank | 63 comment | 12 complexity | c095fcf428d375db9831129ec252779d MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. """Beautiful Soup bonus library: Unicode, Dammit
  3. This library converts a bytestream to Unicode through any means
  4. necessary. It is heavily based on code from Mark Pilgrim's Universal
  5. Feed Parser. It works best on XML and HTML, but it does not rewrite the
  6. XML or HTML to reflect a new encoding; that's the tree builder's job.
  7. """
  8. import codecs
  9. from htmlentitydefs import codepoint2name
  10. import re
  11. import logging
  12. import string
# Import a library to autodetect character encodings.
#
# Exactly one definition of chardet_dammit(s) survives this section: the
# first detector library that imports successfully provides it, and if
# neither imports, a stub that always returns None is used instead.
# NOTE(review): chardet_type is assigned here but not referenced anywhere
# else in the visible source.
chardet_type = None
try:
    # First try the fast C implementation.
    #  PyPI package: cchardet
    import cchardet
    def chardet_dammit(s):
        """Return the 'encoding' entry of cchardet's detection result for s."""
        return cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Fall back to the pure Python implementation
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet
        def chardet_dammit(s):
            """Return the 'encoding' entry of chardet's detection result for s."""
            return chardet.detect(s)['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            """Stub used when no detector library is installed; always None."""
            return None

# Available from http://cjkpython.i18n.org/.
# The import is optional: a missing iconv_codec module is silently ignored.
try:
    import iconv_codec
except ImportError:
    pass
# Matches an XML declaration anchored at the start of the searched data and
# captures the quoted value of its "encoding" attribute. Case-insensitive.
# The pattern text is passed through .encode() before compilation.
xml_encoding_re = re.compile(
    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
# Matches a <meta ...> tag containing "charset=" and captures the charset
# value up to the first space, slash, semicolon, quote or '>'.
# Case-insensitive; also compiled from the .encode()d pattern text.
html_meta_re = re.compile(
    '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
  44. class EntitySubstitution(object):
  45. """Substitute XML or HTML entities for the corresponding characters."""
  46. def _populate_class_variables():
  47. lookup = {}
  48. reverse_lookup = {}
  49. characters_for_re = []
  50. for codepoint, name in list(codepoint2name.items()):
  51. character = unichr(codepoint)
  52. if codepoint != 34:
  53. # There's no point in turning the quotation mark into
  54. # &quot;, unless it happens within an attribute value, which
  55. # is handled elsewhere.
  56. characters_for_re.append(character)
  57. lookup[character] = name
  58. # But we do want to turn &quot; into the quotation mark.
  59. reverse_lookup[name] = character
  60. re_definition = "[%s]" % "".join(characters_for_re)
  61. return lookup, reverse_lookup, re.compile(re_definition)
  62. (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
  63. CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
  64. CHARACTER_TO_XML_ENTITY = {
  65. "'": "apos",
  66. '"': "quot",
  67. "&": "amp",
  68. "<": "lt",
  69. ">": "gt",
  70. }
  71. BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
  72. "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
  73. ")")
  74. AMPERSAND_OR_BRACKET = re.compile("([<>&])")
  75. @classmethod
  76. def _substitute_html_entity(cls, matchobj):
  77. entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
  78. return "&%s;" % entity
  79. @classmethod
  80. def _substitute_xml_entity(cls, matchobj):
  81. """Used with a regular expression to substitute the
  82. appropriate XML entity for an XML special character."""
  83. entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
  84. return "&%s;" % entity
  85. @classmethod
  86. def quoted_attribute_value(self, value):
  87. """Make a value into a quoted XML attribute, possibly escaping it.
  88. Most strings will be quoted using double quotes.
  89. Bob's Bar -> "Bob's Bar"
  90. If a string contains double quotes, it will be quoted using
  91. single quotes.
  92. Welcome to "my bar" -> 'Welcome to "my bar"'
  93. If a string contains both single and double quotes, the
  94. double quotes will be escaped, and the string will be quoted
  95. using double quotes.
  96. Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
  97. """
  98. quote_with = '"'
  99. if '"' in value:
  100. if "'" in value:
  101. # The string contains both single and double
  102. # quotes. Turn the double quotes into
  103. # entities. We quote the double quotes rather than
  104. # the single quotes because the entity name is
  105. # "&quot;" whether this is HTML or XML. If we
  106. # quoted the single quotes, we'd have to decide
  107. # between &apos; and &squot;.
  108. replace_with = "&quot;"
  109. value = value.replace('"', replace_with)
  110. else:
  111. # There are double quotes but no single quotes.
  112. # We can use single quotes to quote the attribute.
  113. quote_with = "'"
  114. return quote_with + value + quote_with
  115. @classmethod
  116. def substitute_xml(cls, value, make_quoted_attribute=False):
  117. """Substitute XML entities for special XML characters.
  118. :param value: A string to be substituted. The less-than sign
  119. will become &lt;, the greater-than sign will become &gt;,
  120. and any ampersands will become &amp;. If you want ampersands
  121. that appear to be part of an entity definition to be left
  122. alone, use substitute_xml_containing_entities() instead.
  123. :param make_quoted_attribute: If True, then the string will be
  124. quoted, as befits an attribute value.
  125. """
  126. # Escape angle brackets and ampersands.
  127. value = cls.AMPERSAND_OR_BRACKET.sub(
  128. cls._substitute_xml_entity, value)
  129. if make_quoted_attribute:
  130. value = cls.quoted_attribute_value(value)
  131. return value
  132. @classmethod
  133. def substitute_xml_containing_entities(
  134. cls, value, make_quoted_attribute=False):
  135. """Substitute XML entities for special XML characters.
  136. :param value: A string to be substituted. The less-than sign will
  137. become &lt;, the greater-than sign will become &gt;, and any
  138. ampersands that are not part of an entity defition will
  139. become &amp;.
  140. :param make_quoted_attribute: If True, then the string will be
  141. quoted, as befits an attribute value.
  142. """
  143. # Escape angle brackets, and ampersands that aren't part of
  144. # entities.
  145. value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
  146. cls._substitute_xml_entity, value)
  147. if make_quoted_attribute:
  148. value = cls.quoted_attribute_value(value)
  149. return value
  150. @classmethod
  151. def substitute_html(cls, s):
  152. """Replace certain Unicode characters with named HTML entities.
  153. This differs from data.encode(encoding, 'xmlcharrefreplace')
  154. in that the goal is to make the result more readable (to those
  155. with ASCII displays) rather than to recover from
  156. errors. There's absolutely nothing wrong with a UTF-8 string
  157. containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
  158. character with "&eacute;" will make it more readable to some
  159. people.
  160. """
  161. return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
  162. cls._substitute_html_entity, s)
  163. class EncodingDetector:
  164. """Suggests a number of possible encodings for a bytestring.
  165. Order of precedence:
  166. 1. Encodings you specifically tell EncodingDetector to try first
  167. (the override_encodings argument to the constructor).
  168. 2. An encoding declared within the bytestring itself, either in an
  169. XML declaration (if the bytestring is to be interpreted as an XML
  170. document), or in a <meta> tag (if the bytestring is to be
  171. interpreted as an HTML document.)
  172. 3. An encoding detected through textual analysis by chardet,
  173. cchardet, or a similar external library.
  174. 4. UTF-8.
  175. 5. Windows-1252.
  176. """
  177. def __init__(self, markup, override_encodings=None, is_html=False):
  178. self.override_encodings = override_encodings or []
  179. self.chardet_encoding = None
  180. self.is_html = is_html
  181. self.declared_encoding = None
  182. # First order of business: strip a byte-order mark.
  183. self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
  184. def _usable(self, encoding, tried):
  185. if encoding is not None:
  186. encoding = encoding.lower()
  187. if encoding not in tried:
  188. tried.add(encoding)
  189. return True
  190. return False
  191. @property
  192. def encodings(self):
  193. """Yield a number of encodings that might work for this markup."""
  194. tried = set()
  195. for e in self.override_encodings:
  196. if self._usable(e, tried):
  197. yield e
  198. # Did the document originally start with a byte-order mark
  199. # that indicated its encoding?
  200. if self._usable(self.sniffed_encoding, tried):
  201. yield self.sniffed_encoding
  202. # Look within the document for an XML or HTML encoding
  203. # declaration.
  204. if self.declared_encoding is None:
  205. self.declared_encoding = self.find_declared_encoding(
  206. self.markup, self.is_html)
  207. if self._usable(self.declared_encoding, tried):
  208. yield self.declared_encoding
  209. # Use third-party character set detection to guess at the
  210. # encoding.
  211. if self.chardet_encoding is None:
  212. self.chardet_encoding = chardet_dammit(self.markup)
  213. if self._usable(self.chardet_encoding, tried):
  214. yield self.chardet_encoding
  215. # As a last-ditch effort, try utf-8 and windows-1252.
  216. for e in ('utf-8', 'windows-1252'):
  217. if self._usable(e, tried):
  218. yield e
  219. @classmethod
  220. def strip_byte_order_mark(cls, data):
  221. """If a byte-order mark is present, strip it and return the encoding it implies."""
  222. encoding = None
  223. if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
  224. and (data[2:4] != '\x00\x00'):
  225. encoding = 'utf-16be'
  226. data = data[2:]
  227. elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
  228. and (data[2:4] != '\x00\x00'):
  229. encoding = 'utf-16le'
  230. data = data[2:]
  231. elif data[:3] == b'\xef\xbb\xbf':
  232. encoding = 'utf-8'
  233. data = data[3:]
  234. elif data[:4] == b'\x00\x00\xfe\xff':
  235. encoding = 'utf-32be'
  236. data = data[4:]
  237. elif data[:4] == b'\xff\xfe\x00\x00':
  238. encoding = 'utf-32le'
  239. data = data[4:]
  240. return data, encoding
  241. @classmethod
  242. def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
  243. """Given a document, tries to find its declared encoding.
  244. An XML encoding is declared at the beginning of the document.
  245. An HTML encoding is declared in a <meta> tag, hopefully near the
  246. beginning of the document.
  247. """
  248. if search_entire_document:
  249. xml_endpos = html_endpos = len(markup)
  250. else:
  251. xml_endpos = 1024
  252. html_endpos = max(2048, int(len(markup) * 0.05))
  253. declared_encoding = None
  254. declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
  255. if not declared_encoding_match and is_html:
  256. declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
  257. if declared_encoding_match is not None:
  258. declared_encoding = declared_encoding_match.groups()[0].decode(
  259. 'ascii')
  260. if declared_encoding:
  261. return declared_encoding.lower()
  262. return None
  263. class UnicodeDammit:
  264. """A class for detecting the encoding of a *ML document and
  265. converting it to a Unicode string. If the source encoding is
  266. windows-1252, can replace MS smart quotes with their HTML or XML
  267. equivalents."""
  268. # This dictionary maps commonly seen values for "charset" in HTML
  269. # meta tags to the corresponding Python codec names. It only covers
  270. # values that aren't in Python's aliases and can't be determined
  271. # by the heuristics in find_codec.
  272. CHARSET_ALIASES = {"macintosh": "mac-roman",
  273. "x-sjis": "shift-jis"}
  274. ENCODINGS_WITH_SMART_QUOTES = [
  275. "windows-1252",
  276. "iso-8859-1",
  277. "iso-8859-2",
  278. ]
  279. def __init__(self, markup, override_encodings=[],
  280. smart_quotes_to=None, is_html=False):
  281. self.smart_quotes_to = smart_quotes_to
  282. self.tried_encodings = []
  283. self.contains_replacement_characters = False
  284. self.is_html = is_html
  285. self.detector = EncodingDetector(markup, override_encodings, is_html)
  286. # Short-circuit if the data is in Unicode to begin with.
  287. if isinstance(markup, unicode) or markup == '':
  288. self.markup = markup
  289. self.unicode_markup = unicode(markup)
  290. self.original_encoding = None
  291. return
  292. # The encoding detector may have stripped a byte-order mark.
  293. # Use the stripped markup from this point on.
  294. self.markup = self.detector.markup
  295. u = None
  296. for encoding in self.detector.encodings:
  297. markup = self.detector.markup
  298. u = self._convert_from(encoding)
  299. if u is not None:
  300. break
  301. if not u:
  302. # None of the encodings worked. As an absolute last resort,
  303. # try them again with character replacement.
  304. for encoding in self.detector.encodings:
  305. if encoding != "ascii":
  306. u = self._convert_from(encoding, "replace")
  307. if u is not None:
  308. logging.warning(
  309. "Some characters could not be decoded, and were "
  310. "replaced with REPLACEMENT CHARACTER.")
  311. self.contains_replacement_characters = True
  312. break
  313. # If none of that worked, we could at this point force it to
  314. # ASCII, but that would destroy so much data that I think
  315. # giving up is better.
  316. self.unicode_markup = u
  317. if not u:
  318. self.original_encoding = None
  319. def _sub_ms_char(self, match):
  320. """Changes a MS smart quote character to an XML or HTML
  321. entity, or an ASCII character."""
  322. orig = match.group(1)
  323. if self.smart_quotes_to == 'ascii':
  324. sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
  325. else:
  326. sub = self.MS_CHARS.get(orig)
  327. if type(sub) == tuple:
  328. if self.smart_quotes_to == 'xml':
  329. sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
  330. else:
  331. sub = '&'.encode() + sub[0].encode() + ';'.encode()
  332. else:
  333. sub = sub.encode()
  334. return sub
  335. def _convert_from(self, proposed, errors="strict"):
  336. proposed = self.find_codec(proposed)
  337. if not proposed or (proposed, errors) in self.tried_encodings:
  338. return None
  339. self.tried_encodings.append((proposed, errors))
  340. markup = self.markup
  341. # Convert smart quotes to HTML if coming from an encoding
  342. # that might have them.
  343. if (self.smart_quotes_to is not None
  344. and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
  345. smart_quotes_re = b"([\x80-\x9f])"
  346. smart_quotes_compiled = re.compile(smart_quotes_re)
  347. markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
  348. try:
  349. #print "Trying to convert document to %s (errors=%s)" % (
  350. # proposed, errors)
  351. u = self._to_unicode(markup, proposed, errors)
  352. self.markup = u
  353. self.original_encoding = proposed
  354. except Exception as e:
  355. #print "That didn't work!"
  356. #print e
  357. return None
  358. #print "Correct encoding: %s" % proposed
  359. return self.markup
  360. def _to_unicode(self, data, encoding, errors="strict"):
  361. '''Given a string and its encoding, decodes the string into Unicode.
  362. %encoding is a string recognized by encodings.aliases'''
  363. return unicode(data, encoding, errors)
  364. @property
  365. def declared_html_encoding(self):
  366. if not self.is_html:
  367. return None
  368. return self.detector.declared_encoding
  369. def find_codec(self, charset):
  370. value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
  371. or (charset and self._codec(charset.replace("-", "")))
  372. or (charset and self._codec(charset.replace("-", "_")))
  373. or (charset and charset.lower())
  374. or charset
  375. )
  376. if value:
  377. return value.lower()
  378. return None
  379. def _codec(self, charset):
  380. if not charset:
  381. return charset
  382. codec = None
  383. try:
  384. codecs.lookup(charset)
  385. codec = charset
  386. except (LookupError, ValueError):
  387. pass
  388. return codec
  389. # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
  390. MS_CHARS = {b'\x80': ('euro', '20AC'),
  391. b'\x81': ' ',
  392. b'\x82': ('sbquo', '201A'),
  393. b'\x83': ('fnof', '192'),
  394. b'\x84': ('bdquo', '201E'),
  395. b'\x85': ('hellip', '2026'),
  396. b'\x86': ('dagger', '2020'),
  397. b'\x87': ('Dagger', '2021'),
  398. b'\x88': ('circ', '2C6'),
  399. b'\x89': ('permil', '2030'),
  400. b'\x8A': ('Scaron', '160'),
  401. b'\x8B': ('lsaquo', '2039'),
  402. b'\x8C': ('OElig', '152'),
  403. b'\x8D': '?',
  404. b'\x8E': ('#x17D', '17D'),
  405. b'\x8F': '?',
  406. b'\x90': '?',
  407. b'\x91': ('lsquo', '2018'),
  408. b'\x92': ('rsquo', '2019'),
  409. b'\x93': ('ldquo', '201C'),
  410. b'\x94': ('rdquo', '201D'),
  411. b'\x95': ('bull', '2022'),
  412. b'\x96': ('ndash', '2013'),
  413. b'\x97': ('mdash', '2014'),
  414. b'\x98': ('tilde', '2DC'),
  415. b'\x99': ('trade', '2122'),
  416. b'\x9a': ('scaron', '161'),
  417. b'\x9b': ('rsaquo', '203A'),
  418. b'\x9c': ('oelig', '153'),
  419. b'\x9d': '?',
  420. b'\x9e': ('#x17E', '17E'),
  421. b'\x9f': ('Yuml', ''),}
  422. # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
  423. # horrors like stripping diacritical marks to turn á into a, but also
  424. # contains non-horrors like turning into ".
  425. MS_CHARS_TO_ASCII = {
  426. b'\x80' : 'EUR',
  427. b'\x81' : ' ',
  428. b'\x82' : ',',
  429. b'\x83' : 'f',
  430. b'\x84' : ',,',
  431. b'\x85' : '...',
  432. b'\x86' : '+',
  433. b'\x87' : '++',
  434. b'\x88' : '^',
  435. b'\x89' : '%',
  436. b'\x8a' : 'S',
  437. b'\x8b' : '<',
  438. b'\x8c' : 'OE',
  439. b'\x8d' : '?',
  440. b'\x8e' : 'Z',
  441. b'\x8f' : '?',
  442. b'\x90' : '?',
  443. b'\x91' : "'",
  444. b'\x92' : "'",
  445. b'\x93' : '"',
  446. b'\x94' : '"',
  447. b'\x95' : '*',
  448. b'\x96' : '-',
  449. b'\x97' : '--',
  450. b'\x98' : '~',
  451. b'\x99' : '(TM)',
  452. b'\x9a' : 's',
  453. b'\x9b' : '>',
  454. b'\x9c' : 'oe',
  455. b'\x9d' : '?',
  456. b'\x9e' : 'z',
  457. b'\x9f' : 'Y',
  458. b'\xa0' : ' ',
  459. b'\xa1' : '!',
  460. b'\xa2' : 'c',
  461. b'\xa3' : 'GBP',
  462. b'\xa4' : '$', #This approximation is especially parochial--this is the
  463. #generic currency symbol.
  464. b'\xa5' : 'YEN',
  465. b'\xa6' : '|',
  466. b'\xa7' : 'S',
  467. b'\xa8' : '..',
  468. b'\xa9' : '',
  469. b'\xaa' : '(th)',
  470. b'\xab' : '<<',
  471. b'\xac' : '!',
  472. b'\xad' : ' ',
  473. b'\xae' : '(R)',
  474. b'\xaf' : '-',
  475. b'\xb0' : 'o',
  476. b'\xb1' : '+-',
  477. b'\xb2' : '2',
  478. b'\xb3' : '3',
  479. b'\xb4' : ("'", 'acute'),
  480. b'\xb5' : 'u',
  481. b'\xb6' : 'P',
  482. b'\xb7' : '*',
  483. b'\xb8' : ',',
  484. b'\xb9' : '1',
  485. b'\xba' : '(th)',
  486. b'\xbb' : '>>',
  487. b'\xbc' : '1/4',
  488. b'\xbd' : '1/2',
  489. b'\xbe' : '3/4',
  490. b'\xbf' : '?',
  491. b'\xc0' : 'A',
  492. b'\xc1' : 'A',
  493. b'\xc2' : 'A',
  494. b'\xc3' : 'A',
  495. b'\xc4' : 'A',
  496. b'\xc5' : 'A',
  497. b'\xc6' : 'AE',
  498. b'\xc7' : 'C',
  499. b'\xc8' : 'E',
  500. b'\xc9' : 'E',
  501. b'\xca' : 'E',
  502. b'\xcb' : 'E',
  503. b'\xcc' : 'I',
  504. b'\xcd' : 'I',
  505. b'\xce' : 'I',
  506. b'\xcf' : 'I',
  507. b'\xd0' : 'D',
  508. b'\xd1' : 'N',
  509. b'\xd2' : 'O',
  510. b'\xd3' : 'O',
  511. b'\xd4' : 'O',
  512. b'\xd5' : 'O',
  513. b'\xd6' : 'O',
  514. b'\xd7' : '*',
  515. b'\xd8' : 'O',
  516. b'\xd9' : 'U',
  517. b'\xda' : 'U',
  518. b'\xdb' : 'U',
  519. b'\xdc' : 'U',
  520. b'\xdd' : 'Y',
  521. b'\xde' : 'b',
  522. b'\xdf' : 'B',
  523. b'\xe0' : 'a',
  524. b'\xe1' : 'a',
  525. b'\xe2' : 'a',
  526. b'\xe3' : 'a',
  527. b'\xe4' : 'a',
  528. b'\xe5' : 'a',
  529. b'\xe6' : 'ae',
  530. b'\xe7' : 'c',
  531. b'\xe8' : 'e',
  532. b'\xe9' : 'e',
  533. b'\xea' : 'e',
  534. b'\xeb' : 'e',
  535. b'\xec' : 'i',
  536. b'\xed' : 'i',
  537. b'\xee' : 'i',
  538. b'\xef' : 'i',
  539. b'\xf0' : 'o',
  540. b'\xf1' : 'n',
  541. b'\xf2' : 'o',
  542. b'\xf3' : 'o',
  543. b'\xf4' : 'o',
  544. b'\xf5' : 'o',
  545. b'\xf6' : 'o',
  546. b'\xf7' : '/',
  547. b'\xf8' : 'o',
  548. b'\xf9' : 'u',
  549. b'\xfa' : 'u',
  550. b'\xfb' : 'u',
  551. b'\xfc' : 'u',
  552. b'\xfd' : 'y',
  553. b'\xfe' : 'b',
  554. b'\xff' : 'y',
  555. }
  556. # A map used when removing rogue Windows-1252/ISO-8859-1
  557. # characters in otherwise UTF-8 documents.
  558. #
  559. # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
  560. # Windows-1252.
  561. WINDOWS_1252_TO_UTF8 = {
  562. 0x80 : b'\xe2\x82\xac', #
  563. 0x82 : b'\xe2\x80\x9a', #
  564. 0x83 : b'\xc6\x92', # ƒ
  565. 0x84 : b'\xe2\x80\x9e', #
  566. 0x85 : b'\xe2\x80\xa6', #
  567. 0x86 : b'\xe2\x80\xa0', #
  568. 0x87 : b'\xe2\x80\xa1', #
  569. 0x88 : b'\xcb\x86', # ˆ
  570. 0x89 : b'\xe2\x80\xb0', #
  571. 0x8a : b'\xc5\xa0', # Š
  572. 0x8b : b'\xe2\x80\xb9', #
  573. 0x8c : b'\xc5\x92', # Œ
  574. 0x8e : b'\xc5\xbd', # Ž
  575. 0x91 : b'\xe2\x80\x98', #
  576. 0x92 : b'\xe2\x80\x99', #
  577. 0x93 : b'\xe2\x80\x9c', #
  578. 0x94 : b'\xe2\x80\x9d', #
  579. 0x95 : b'\xe2\x80\xa2', #
  580. 0x96 : b'\xe2\x80\x93', #
  581. 0x97 : b'\xe2\x80\x94', #
  582. 0x98 : b'\xcb\x9c', # ˜
  583. 0x99 : b'\xe2\x84\xa2', #
  584. 0x9a : b'\xc5\xa1', # š
  585. 0x9b : b'\xe2\x80\xba', #
  586. 0x9c : b'\xc5\x93', # œ
  587. 0x9e : b'\xc5\xbe', # ž
  588. 0x9f : b'\xc5\xb8', # Ÿ
  589. 0xa0 : b'\xc2\xa0', #  
  590. 0xa1 : b'\xc2\xa1', # ¡
  591. 0xa2 : b'\xc2\xa2', # ¢
  592. 0xa3 : b'\xc2\xa3', # £
  593. 0xa4 : b'\xc2\xa4', # ¤
  594. 0xa5 : b'\xc2\xa5', # ¥
  595. 0xa6 : b'\xc2\xa6', # ¦
  596. 0xa7 : b'\xc2\xa7', # §
  597. 0xa8 : b'\xc2\xa8', # ¨
  598. 0xa9 : b'\xc2\xa9', # ©
  599. 0xaa : b'\xc2\xaa', # ª
  600. 0xab : b'\xc2\xab', # «
  601. 0xac : b'\xc2\xac', # ¬
  602. 0xad : b'\xc2\xad', # ­
  603. 0xae : b'\xc2\xae', # ®
  604. 0xaf : b'\xc2\xaf', # ¯
  605. 0xb0 : b'\xc2\xb0', # °
  606. 0xb1 : b'\xc2\xb1', # ±
  607. 0xb2 : b'\xc2\xb2', # ²
  608. 0xb3 : b'\xc2\xb3', # ³
  609. 0xb4 : b'\xc2\xb4', # ´
  610. 0xb5 : b'\xc2\xb5', # µ
  611. 0xb6 : b'\xc2\xb6', #
  612. 0xb7 : b'\xc2\xb7', # ·
  613. 0xb8 : b'\xc2\xb8', # ¸
  614. 0xb9 : b'\xc2\xb9', # ¹
  615. 0xba : b'\xc2\xba', # º
  616. 0xbb : b'\xc2\xbb', # »
  617. 0xbc : b'\xc2\xbc', # ¼
  618. 0xbd : b'\xc2\xbd', # ½
  619. 0xbe : b'\xc2\xbe', # ¾
  620. 0xbf : b'\xc2\xbf', # ¿
  621. 0xc0 : b'\xc3\x80', # À
  622. 0xc1 : b'\xc3\x81', # Á
  623. 0xc2 : b'\xc3\x82', # Â
  624. 0xc3 : b'\xc3\x83', # Ã
  625. 0xc4 : b'\xc3\x84', # Ä
  626. 0xc5 : b'\xc3\x85', # Å
  627. 0xc6 : b'\xc3\x86', # Æ
  628. 0xc7 : b'\xc3\x87', # Ç
  629. 0xc8 : b'\xc3\x88', # È
  630. 0xc9 : b'\xc3\x89', # É
  631. 0xca : b'\xc3\x8a', # Ê
  632. 0xcb : b'\xc3\x8b', # Ë
  633. 0xcc : b'\xc3\x8c', # Ì
  634. 0xcd : b'\xc3\x8d', # Í
  635. 0xce : b'\xc3\x8e', # Î
  636. 0xcf : b'\xc3\x8f', # Ï
  637. 0xd0 : b'\xc3\x90', # Ð
  638. 0xd1 : b'\xc3\x91', # Ñ
  639. 0xd2 : b'\xc3\x92', # Ò
  640. 0xd3 : b'\xc3\x93', # Ó
  641. 0xd4 : b'\xc3\x94', # Ô
  642. 0xd5 : b'\xc3\x95', # Õ
  643. 0xd6 : b'\xc3\x96', # Ö
  644. 0xd7 : b'\xc3\x97', # ×
  645. 0xd8 : b'\xc3\x98', # Ø
  646. 0xd9 : b'\xc3\x99', # Ù
  647. 0xda : b'\xc3\x9a', # Ú
  648. 0xdb : b'\xc3\x9b', # Û
  649. 0xdc : b'\xc3\x9c', # Ü
  650. 0xdd : b'\xc3\x9d', # Ý
  651. 0xde : b'\xc3\x9e', # Þ
  652. 0xdf : b'\xc3\x9f', # ß
  653. 0xe0 : b'\xc3\xa0', # à
  654. 0xe1 : b'\xa1', # á
  655. 0xe2 : b'\xc3\xa2', # â
  656. 0xe3 : b'\xc3\xa3', # ã
  657. 0xe4 : b'\xc3\xa4', # ä
  658. 0xe5 : b'\xc3\xa5', # å
  659. 0xe6 : b'\xc3\xa6', # æ
  660. 0xe7 : b'\xc3\xa7', # ç
  661. 0xe8 : b'\xc3\xa8', # è
  662. 0xe9 : b'\xc3\xa9', # é
  663. 0xea : b'\xc3\xaa', # ê
  664. 0xeb : b'\xc3\xab', # ë
  665. 0xec : b'\xc3\xac', # ì
  666. 0xed : b'\xc3\xad', # í
  667. 0xee : b'\xc3\xae', # î
  668. 0xef : b'\xc3\xaf', # ï
  669. 0xf0 : b'\xc3\xb0', # ð
  670. 0xf1 : b'\xc3\xb1', # ñ
  671. 0xf2 : b'\xc3\xb2', # ò
  672. 0xf3 : b'\xc3\xb3', # ó
  673. 0xf4 : b'\xc3\xb4', # ô
  674. 0xf5 : b'\xc3\xb5', # õ
  675. 0xf6 : b'\xc3\xb6', # ö
  676. 0xf7 : b'\xc3\xb7', # ÷
  677. 0xf8 : b'\xc3\xb8', # ø
  678. 0xf9 : b'\xc3\xb9', # ù
  679. 0xfa : b'\xc3\xba', # ú
  680. 0xfb : b'\xc3\xbb', # û
  681. 0xfc : b'\xc3\xbc', # ü
  682. 0xfd : b'\xc3\xbd', # ý
  683. 0xfe : b'\xc3\xbe', # þ
  684. }
  685. MULTIBYTE_MARKERS_AND_SIZES = [
  686. (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
  687. (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
  688. (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
  689. ]
  690. FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
  691. LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
  692. @classmethod
  693. def detwingle(cls, in_bytes, main_encoding="utf8",
  694. embedded_encoding="windows-1252"):
  695. """Fix characters from one encoding embedded in some other encoding.
  696. Currently the only situation supported is Windows-1252 (or its
  697. subset ISO-8859-1), embedded in UTF-8.
  698. The input must be a bytestring. If you've already converted
  699. the document to Unicode, you're too late.
  700. The output is a bytestring in which `embedded_encoding`
  701. characters have been converted to their `main_encoding`
  702. equivalents.
  703. """
  704. if embedded_encoding.replace('_', '-').lower() not in (
  705. 'windows-1252', 'windows_1252'):
  706. raise NotImplementedError(
  707. "Windows-1252 and ISO-8859-1 are the only currently supported "
  708. "embedded encodings.")
  709. if main_encoding.lower() not in ('utf8', 'utf-8'):
  710. raise NotImplementedError(
  711. "UTF-8 is the only currently supported main encoding.")
  712. byte_chunks = []
  713. chunk_start = 0
  714. pos = 0
  715. while pos < len(in_bytes):
  716. byte = in_bytes[pos]
  717. if not isinstance(byte, int):
  718. # Python 2.x
  719. byte = ord(byte)
  720. if (byte >= cls.FIRST_MULTIBYTE_MARKER
  721. and byte <= cls.LAST_MULTIBYTE_MARKER):
  722. # This is the start of a UTF-8 multibyte character. Skip
  723. # to the end.
  724. for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
  725. if byte >= start and byte <= end:
  726. pos += size
  727. break
  728. elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
  729. # We found a Windows-1252 character!
  730. # Save the string up to this point as a chunk.
  731. byte_chunks.append(in_bytes[chunk_start:pos])
  732. # Now translate the Windows-1252 character into UTF-8
  733. # and add it as another, one-byte chunk.
  734. byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
  735. pos += 1
  736. chunk_start = pos
  737. else:
  738. # Go on to the next character.
  739. pos += 1
  740. if chunk_start == 0:
  741. # The string is unchanged.
  742. return in_bytes
  743. else:
  744. # Store the final chunk.
  745. byte_chunks.append(in_bytes[chunk_start:])
  746. return b''.join(byte_chunks)