/src/calibre/ebooks/metadata/html.py

https://github.com/sss/calibre-at-bzr · Python · 251 lines · 208 code · 23 blank · 20 comment · 97 complexity · bdb6e3ce60b9db0d01b8db27de7770b9 MD5 · raw file

  1. #!/usr/bin/env python
  2. # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
  3. __license__ = 'GPL v3'
  4. __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
  5. '''
  6. Try to read metadata from an HTML file.
  7. '''
  8. import re
  9. from calibre.ebooks.metadata import MetaInformation
  10. from calibre.ebooks.chardet import xml_to_unicode
  11. from calibre import entity_to_unicode
  12. from calibre.utils.date import parse_date
  13. def get_metadata(stream):
  14. src = stream.read()
  15. return get_metadata_(src)
  16. def get_meta_regexp_(name):
  17. return re.compile('<meta name=[\'"]' + name + r'[\'"]\s+content=[\'"](.+?)[\'"]\s*/?>', re.IGNORECASE)
  18. def get_metadata_(src, encoding=None):
  19. if not isinstance(src, unicode):
  20. if not encoding:
  21. src = xml_to_unicode(src)[0]
  22. else:
  23. src = src.decode(encoding, 'replace')
  24. # Meta data definitions as in
  25. # http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
  26. # Title
  27. title = None
  28. pat = re.compile(r'<!--.*?TITLE=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
  29. src = src[:150000] # Searching shouldn't take too long
  30. match = pat.search(src)
  31. if match:
  32. title = match.group(2)
  33. else:
  34. for x in ('DC.title','DCTERMS.title','Title'):
  35. pat = get_meta_regexp_(x)
  36. match = pat.search(src)
  37. if match:
  38. title = match.group(1)
  39. break
  40. if not title:
  41. pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE)
  42. match = pat.search(src)
  43. if match:
  44. title = match.group(1)
  45. # Author
  46. author = None
  47. pat = re.compile(r'<!--.*?AUTHOR=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
  48. match = pat.search(src)
  49. if match:
  50. author = match.group(2).replace(',', ';')
  51. else:
  52. for x in ('Author','DC.creator.aut','DCTERMS.creator.aut', 'DC.creator'):
  53. pat = get_meta_regexp_(x)
  54. match = pat.search(src)
  55. if match:
  56. author = match.group(1)
  57. break
  58. # Create MetaInformation with Title and Author
  59. ent_pat = re.compile(r'&(\S+)?;')
  60. if title:
  61. title = ent_pat.sub(entity_to_unicode, title)
  62. if author:
  63. author = ent_pat.sub(entity_to_unicode, author)
  64. mi = MetaInformation(title, [author] if author else None)
  65. # Publisher
  66. publisher = None
  67. pat = re.compile(r'<!--.*?PUBLISHER=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
  68. match = pat.search(src)
  69. if match:
  70. publisher = match.group(2)
  71. else:
  72. for x in ('Publisher','DC.publisher','DCTERMS.publisher'):
  73. pat = get_meta_regexp_(x)
  74. match = pat.search(src)
  75. if match:
  76. publisher = match.group(1)
  77. break
  78. if publisher:
  79. mi.publisher = ent_pat.sub(entity_to_unicode, publisher)
  80. # ISBN
  81. isbn = None
  82. pat = re.compile(r'<!--.*?ISBN=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
  83. match = pat.search(src)
  84. if match:
  85. isbn = match.group(1)
  86. else:
  87. for x in ('ISBN','DC.identifier.ISBN','DCTERMS.identifier.ISBN'):
  88. pat = get_meta_regexp_(x)
  89. match = pat.search(src)
  90. if match:
  91. isbn = match.group(1)
  92. break
  93. if isbn:
  94. mi.isbn = re.sub(r'[^0-9xX]', '', isbn)
  95. # LANGUAGE
  96. language = None
  97. pat = re.compile(r'<!--.*?LANGUAGE=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
  98. match = pat.search(src)
  99. if match:
  100. language = match.group(1)
  101. else:
  102. for x in ('DC.language','DCTERMS.language'):
  103. pat = get_meta_regexp_(x)
  104. match = pat.search(src)
  105. if match:
  106. language = match.group(1)
  107. break
  108. if language:
  109. mi.language = language
  110. # PUBDATE
  111. pubdate = None
  112. pat = re.compile(r'<!--.*?PUBDATE=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
  113. match = pat.search(src)
  114. if match:
  115. pubdate = match.group(1)
  116. else:
  117. for x in ('Pubdate','Date of publication','DC.date.published','DC.date.publication','DC.date.issued','DCTERMS.issued'):
  118. pat = get_meta_regexp_(x)
  119. match = pat.search(src)
  120. if match:
  121. pubdate = match.group(1)
  122. break
  123. if pubdate:
  124. try:
  125. mi.pubdate = parse_date(pubdate)
  126. except:
  127. pass
  128. # TIMESTAMP
  129. timestamp = None
  130. pat = re.compile(r'<!--.*?TIMESTAMP=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
  131. match = pat.search(src)
  132. if match:
  133. timestamp = match.group(1)
  134. else:
  135. for x in ('Timestamp','Date of creation','DC.date.created','DC.date.creation','DCTERMS.created'):
  136. pat = get_meta_regexp_(x)
  137. match = pat.search(src)
  138. if match:
  139. timestamp = match.group(1)
  140. break
  141. if timestamp:
  142. try:
  143. mi.timestamp = parse_date(timestamp)
  144. except:
  145. pass
  146. # SERIES
  147. series = None
  148. pat = re.compile(r'<!--.*?SERIES=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
  149. match = pat.search(src)
  150. if match:
  151. series = match.group(1)
  152. else:
  153. pat = get_meta_regexp_("Series")
  154. match = pat.search(src)
  155. if match:
  156. series = match.group(1)
  157. if series:
  158. pat = re.compile(r'\[([.0-9]+)\]')
  159. match = pat.search(series)
  160. series_index = None
  161. if match is not None:
  162. try:
  163. series_index = float(match.group(1))
  164. except:
  165. pass
  166. series = series.replace(match.group(), '').strip()
  167. mi.series = ent_pat.sub(entity_to_unicode, series)
  168. if series_index is None:
  169. pat = get_meta_regexp_("Seriesnumber")
  170. match = pat.search(src)
  171. if match:
  172. try:
  173. series_index = float(match.group(1))
  174. except:
  175. pass
  176. if series_index is not None:
  177. mi.series_index = series_index
  178. # RATING
  179. rating = None
  180. pat = re.compile(r'<!--.*?RATING=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
  181. match = pat.search(src)
  182. if match:
  183. rating = match.group(1)
  184. else:
  185. pat = get_meta_regexp_("Rating")
  186. match = pat.search(src)
  187. if match:
  188. rating = match.group(1)
  189. if rating:
  190. try:
  191. mi.rating = float(rating)
  192. if mi.rating < 0:
  193. mi.rating = 0
  194. if mi.rating > 5:
  195. mi.rating /= 2.
  196. if mi.rating > 5:
  197. mi.rating = 0
  198. except:
  199. pass
  200. # COMMENTS
  201. comments = None
  202. pat = re.compile(r'<!--.*?COMMENTS=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
  203. match = pat.search(src)
  204. if match:
  205. comments = match.group(1)
  206. else:
  207. pat = get_meta_regexp_("Comments")
  208. match = pat.search(src)
  209. if match:
  210. comments = match.group(1)
  211. if comments:
  212. mi.comments = ent_pat.sub(entity_to_unicode, comments)
  213. # TAGS
  214. tags = None
  215. pat = re.compile(r'<!--.*?TAGS=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
  216. match = pat.search(src)
  217. if match:
  218. tags = match.group(1)
  219. else:
  220. pat = get_meta_regexp_("Tags")
  221. match = pat.search(src)
  222. if match:
  223. tags = match.group(1)
  224. if tags:
  225. mi.tags = [x.strip() for x in ent_pat.sub(entity_to_unicode,
  226. tags).split(",")]
  227. # Ready to return MetaInformation
  228. return mi