PageRenderTime 50ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/repository/plugin.video.hushamiptv/htmlcleaner.py

https://gitlab.com/billyprice1/husham.com
Python | 158 lines | 110 code | 27 blank | 21 comment | 45 complexity | 9091d5c25bc4856b70186bbfd0277c72 MD5 | raw file
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """
  4. HTMLCLEANER
  5. A bastardised version of html2text, only retaining the entity cleaner.
  6. What does it do?
  7. Replaces HTML entities and character references such as &#x27; with the characters they represent.
  8. USAGE:
  9. import htmlcleaner
  10. cleanedhtml = htmlcleaner.clean(my-html-string, strip=False)
  11. print cleanedhtml
  12. If strip=True, accented characters such as é are replaced with plain ASCII equivalents (e, and so on).
  13. """
  14. __version__ = "1.0"
  15. __author__ = "Anarchintosh (@xbmcforums)"
  16. __copyright__ = "Copyleft 2011 onwards GNU GPL 3."
  17. __contributors__ = ["Aaron Swartz", "Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
  18. try:
  19. True
  20. except NameError:
  21. setattr(__builtins__, 'True', 1)
  22. setattr(__builtins__, 'False', 0)
  23. def has_key(x, y):
  24. if hasattr(x, 'has_key'): return x.has_key(y)
  25. else: return y in x
  26. try:
  27. import htmlentitydefs
  28. except ImportError: #Python3
  29. import html.entities as htmlentitydefs
  30. import re, codecs, unicodedata
  31. try: from textwrap import wrap
  32. except: pass
  33. # Use Unicode characters instead of their ascii psuedo-replacements
  34. UNICODE_SNOB = 1
  35. ### Entity Nonsense ###
  36. def name2cp(k):
  37. if k == 'apos': return ord("'")
  38. if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
  39. return htmlentitydefs.name2codepoint[k]
  40. else:
  41. k = htmlentitydefs.entitydefs[k]
  42. if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
  43. return ord(codecs.latin_1_decode(k)[0])
  44. unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
  45. 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
  46. 'ndash':'-', 'oelig':'oe', 'aelig':'ae',
  47. 'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
  48. 'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
  49. 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
  50. 'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
  51. 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
  52. unifiable_n = {}
  53. for k in unifiable.keys():
  54. unifiable_n[name2cp(k)] = unifiable[k]
  55. def charref(name):
  56. if name[0] in ['x','X']:
  57. c = int(name[1:], 16)
  58. else:
  59. c = int(name)
  60. if not UNICODE_SNOB and c in unifiable_n.keys():
  61. return unifiable_n[c]
  62. else:
  63. try:
  64. return unichr(c)
  65. except NameError: #Python3
  66. return chr(c)
  67. def entityref(c):
  68. if not UNICODE_SNOB and c in unifiable.keys():
  69. return unifiable[c]
  70. else:
  71. try: name2cp(c)
  72. except KeyError: return "&" + c + ';'
  73. else:
  74. try:
  75. return unichr(name2cp(c))
  76. except NameError: #Python3
  77. return chr(name2cp(c))
  78. def replaceEntities(s):
  79. s = s.group(1)
  80. if s[0] == "#":
  81. return charref(s[1:])
  82. elif s.startswith('u') or s.startswith('U'):
  83. return charref('x' + s[1:])
  84. else: return entityref(s)
  85. r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
  86. r_unescape_unicode = re.compile(r"\\([uU]{1}[0-9a-fA-F]{4})")
  87. def unescape(s):
  88. html_has_unicode = False
  89. if '\\u' in s or '\\U' in s: html_has_unicode = True
  90. s = r_unescape.sub(replaceEntities, s)
  91. if html_has_unicode:
  92. s = r_unescape_unicode.sub(replaceEntities, s)
  93. return s
  94. ### End Entity Nonsense ###
  95. def cleanUnicode(string):
  96. try:
  97. try:
  98. #string = str(string)
  99. if isinstance(string, unicode):
  100. unicode_replaced_str = string.decode('utf-8')
  101. elif isinstance(string, str):
  102. unicode_replaced_str = string.decode('utf-8')
  103. import unidecode
  104. unicode_replaced_str = unidecode.unidecode(unicode_replaced_str)
  105. string = unicode_replaced_str
  106. except:
  107. pass
  108. fixed_string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore' )
  109. return fixed_string
  110. except:
  111. return string
  112. #interface:
  113. def clean(html,strip=False,remove_non_ascii=False):
  114. cleaned = unescape(html)
  115. if remove_non_ascii:
  116. cleaned = re.sub(r'[^\x00-\x7F]+',' ', cleaned)
  117. if strip == True:
  118. return cleanUnicode(cleaned)
  119. else:
  120. return cleaned
  121. def clean2(html,strip=False,remove_non_ascii=False):
  122. cleaned = unescape(html)
  123. if strip == True:
  124. cleaned = cleanUnicode(cleaned)
  125. if remove_non_ascii:
  126. return re.sub(r'[^\x00-\x7F]+',' ', cleaned)
  127. else:
  128. return cleaned