PageRenderTime 71ms CodeModel.GetById 33ms RepoModel.GetById 0ms app.codeStats 1ms

/plugins/pelican-mboxreader/pelican_mboxreader/mboxreader.py

https://gitlab.com/janninematt/janninematt
Python | 318 lines | 247 code | 30 blank | 41 comment | 32 complexity | b118f1e33d55ac2ce5dfa21345aea285 MD5 | raw file
  1. """mboxreader - Pelican plugin to interface with Unix mailboxes.
  2. This pelican plugin implements a custom generator that can read from
  3. an arbitrary number of Unix mboxes (support for maildirs or other types
  4. of mail folders coming soon) and turns them into articles with a unique
  5. SLUG.
  6. """
  7. from pelican import signals
  8. from pelican.generators import ArticlesGenerator, Generator
  9. from pelican.contents import Article, Page, Static, is_valid_content
  10. from pelican.utils import copy, process_translations, mkdir_p
  11. from pelican.utils import DateFormatter, slugify
  12. from pelican.readers import BaseReader, Readers
  13. from pelican import signals
  14. from itertools import chain, groupby
  15. from operator import attrgetter, itemgetter
  16. from collections import defaultdict
  17. from functools import partial
  18. import datetime
  19. import mailbox
  20. import logging
  21. import os
  22. import pytz
  23. # Other dependency! dateutil.
  24. try:
  25. from dateutil import parser
  26. except ImportError: # NOQA?
  27. parser = False
  28. # Markdown-- a half-decent plaintext -> HTML converter, for now.
  29. try:
  30. from markdown import Markdown
  31. except ImportError:
  32. Markdown = False # NOQA
  33. # The logger.
  34. logger = logging.getLogger()
  35. # Settings methods, adapted from tag-cloud plugin.
  36. # https://github.com/getpelican/pelican-plugins/blob/master/tag_cloud/tag_cloud.py
  37. def set_default_settings(settings):
  38. settings.setdefault('MBOX_PATH', '[input.mbox]')
  39. settings.setdefault('MBOX_CATEGORY', '[Mailbox]')
  40. settings.setdefault('MBOX_AUTHOR_STRING', '')
  41. settings.setdefault('MBOX_MARKDOWNIFY', False)
  42. def init_default_config(pelican):
  43. from pelican.settings import DEFAULT_CONFIG
  44. set_default_settings(DEFAULT_CONFIG)
  45. if pelican:
  46. set_default_settings(pelican.settings)
  47. def plaintext_to_html(plaintext, markdownify=False):
  48. # If markdownify is True, attempt to use markdown as a basic plaintext to
  49. # HTML converter. If we fail or if it's false, insert <p> tags as
  50. # appropriate and do no more.
  51. try:
  52. if not markdownify:
  53. raise RuntimeError
  54. content = Markdown().convert(plaintext)
  55. except:
  56. content = ''
  57. plaintext = plaintext.replace('\r\n', '\n')
  58. strings = plaintext.split('\n\n')
  59. for paragraph in strings:
  60. paragraph = paragraph.replace('\n', '<br/>')
  61. content += '<p>' + paragraph + '</p>\n\n'
  62. return content
  63. class MboxGenerator(ArticlesGenerator):
  64. def __init__(self, *args, **kwargs):
  65. """initialize properties"""
  66. self.articles = [] # only articles in default language
  67. self.translations = []
  68. self.dates = {}
  69. self.categories = defaultdict(list)
  70. self.authors = defaultdict(list)
  71. super(MboxGenerator, self).__init__(*args, **kwargs)
  72. # Private helper function to generate
  73. def _generate_mbox_articles(self, mboxPath, mboxCategory):
  74. baseReader = BaseReader(self.settings)
  75. category = baseReader.process_metadata('category', mboxCategory)
  76. # Complain if the mbox path does not exist and is not readable.
  77. try:
  78. if not os.path.exists(mboxPath):
  79. raise RuntimeError
  80. mbox = mailbox.mbox(mboxPath)
  81. except:
  82. logger.error('Could not process mbox file %s', mboxPath)
  83. return
  84. # Retrieve some fields from the settings.
  85. authorString = self.settings.get('MBOX_AUTHOR_STRING')
  86. markdownify = self.settings.get('MBOX_MARKDOWNIFY')
  87. # Loop over all messages, turn them into article objects.
  88. all_articles = []
  89. slugs = []
  90. for message in mbox.itervalues():
  91. # Get author name.
  92. author = message['from']
  93. if author is None:
  94. author = 'Unknown'
  95. else:
  96. if '<' and '>' in author:
  97. author = author[:author.find(' <')]
  98. author = author.replace('"', '').replace("'", '')
  99. # As a hack to avoid dealing with the fact that names can collide.
  100. if authorString is not None and authorString != '':
  101. author += ' ' + authorString
  102. authorObject = baseReader.process_metadata('author', author)
  103. # Get date object, using python-dateutil as an easy hack.
  104. # If there is no date in the message, abort, we shouldn't bother.
  105. if message['date'] is None:
  106. continue
  107. if parser:
  108. date = parser.parse(message['date'])
  109. else:
  110. logger.error('No python-dateutil, we cannot continue as ' +
  111. 'date formats cannot be parsed. ')
  112. continue
  113. monthYear = date.strftime('%B-%Y').lower()
  114. # Get title and slug; build year + month into slug.
  115. subject = message['subject']
  116. slugSubject = slugify(subject)
  117. slug = os.path.join(slugify(mboxCategory), monthYear, slugSubject)
  118. # Hack to handle multiple messages with the same subject.
  119. if slug in slugs:
  120. slug += "_%d"
  121. count = 2
  122. testSlug = slug % count
  123. while testSlug in slugs:
  124. count += 1
  125. testSlug = slug % count
  126. slug = testSlug
  127. slugs.append(slug)
  128. # Code adapted from Stackoverflow for parsing email messages.
  129. # https://stackoverflow.com/questions/4824376/parse-multi-part-email-with-sub-parts-using-python
  130. # Code is clumsy, should be refactored.
  131. if message.is_multipart():
  132. plaintext = None
  133. html = None
  134. for part in message.get_payload():
  135. charset = message.get_content_charset()
  136. if charset is None or charset == 'x-unknown':
  137. charset = 'us-ascii'
  138. payload = part.get_payload(decode=True)
  139. if part.get_content_type() == 'text/plain':
  140. plaintext = unicode(payload, charset, "ignore")
  141. plaintext = plaintext.encode('ascii', 'replace')
  142. if part.get_content_type() == 'text/html':
  143. html = unicode(payload, charset, "ignore")
  144. html = html.encode('ascii', 'replace')
  145. if plaintext is None and html is None:
  146. continue
  147. elif plaintext is None:
  148. content = html
  149. else:
  150. content = plaintext_to_html(plaintext, markdownify)
  151. else:
  152. charset = message.get_content_charset()
  153. if charset is None or charset == 'x-unknown':
  154. charset = 'us-ascii'
  155. payload = message.get_payload(decode=True)
  156. plaintext = unicode(payload, charset, "ignore")
  157. plaintext = plaintext.encode('ascii', 'replace')
  158. content = plaintext_to_html(plaintext, markdownify)
  159. metadata = {'title': subject,
  160. 'date': date,
  161. 'category': category,
  162. 'authors': [authorObject],
  163. 'slug': slug}
  164. article = Article(content=content,
  165. metadata=metadata,
  166. settings=self.settings,
  167. source_path=mboxPath,
  168. context=self.context)
  169. # This seems like it cannot happen... but it does without fail.
  170. article.author = article.authors[0]
  171. all_articles.append(article)
  172. return all_articles
  173. # For now, don't generate feeds.
  174. def generate_feeds(self, writer):
  175. return
  176. def generate_pages(self, writer):
  177. """Generate the pages on the disk"""
  178. write = partial(writer.write_file,
  179. relative_urls=self.settings['RELATIVE_URLS'],
  180. override_output=True)
  181. # to minimize the number of relative path stuff modification
  182. # in writer, articles pass first
  183. self.generate_articles(write)
  184. self.generate_period_archives(write)
  185. self.generate_direct_templates(write)
  186. # and subfolders after that
  187. self.generate_categories(write)
  188. self.generate_authors(write)
  189. def generate_articles(self, write):
  190. """Generate the articles."""
  191. # Hm... this is a bit clunky; it overrides override_output.
  192. # It appears that this is not a problem.
  193. for article in chain(self.translations, self.articles):
  194. write(article.save_as, self.get_template(article.template),
  195. self.context, article=article, category=article.category,
  196. override_output=True, blog=True)
  197. def generate_context(self):
  198. # Update the context (only articles in default language)
  199. self.articles = self.context['articles']
  200. # Complain if MBOX_PATH and MBOX_CATEGORY are not of the same length.
  201. mboxPaths = self.settings.get('MBOX_PATH')
  202. mboxCategories = self.settings.get('MBOX_CATEGORY')
  203. errMsg = 'MBOX_PATH, MBOX_CATEGORY not of equal length or non-empty.'
  204. if len(mboxPaths) != len(mboxCategories) or len(mboxPaths) <= 0:
  205. logger.error(errMsg)
  206. return
  207. all_articles = []
  208. for i in xrange(len(mboxPaths)):
  209. mboxPath = mboxPaths[i]
  210. mboxCategory = mboxCategories[i]
  211. new_articles = self._generate_mbox_articles(mboxPath, mboxCategory)
  212. all_articles.extend(new_articles)
  213. # Log that we did stuff.
  214. print(('Read in %d messages from %s and converted to articles in ' +
  215. 'category %s.') % (len(new_articles), mboxPath, mboxCategory))
  216. print('Read in %d messages from all mailboxes.' % (len(all_articles)))
  217. # Continue with the rest of ArticleGenerator, code adapted from:
  218. # https://github.com/getpelican/pelican/blob/master/pelican/generators.py#L548
  219. # ARTICLE_ORDER_BY doesn't exist in 3.3, which was in Fedora 21.
  220. # (I wanted to be able to build this on F21 at the time).
  221. articles, translations = process_translations(all_articles)
  222. # , order_by=self.settings['ARTICLE_ORDER_BY'])
  223. self.articles.extend(articles)
  224. self.translations.extend(translations)
  225. # Disabled for 3.3 compatibility, great.
  226. # signals.article_generator_pretaxonomy.send(self)
  227. for article in self.articles:
  228. # only main articles are listed in categories and tags
  229. # not translations
  230. # We have to use django for this, unfortunately.
  231. if article.date.tzinfo is None:
  232. article.date = pytz.UTC.localize(article.date)
  233. self.categories[article.category].append(article)
  234. # Support for Author and Authors.
  235. if hasattr(article, 'author') and article.author.name != '':
  236. self.authors[article.author].append(article)
  237. else:
  238. for author in getattr(article, 'authors', []):
  239. self.authors[author].append(article)
  240. # This may not technically be right, but...
  241. # Sort the articles by date too.
  242. self.articles = list(self.articles)
  243. self.dates = self.articles
  244. self.dates.sort(key=attrgetter('date'),
  245. reverse=self.context['NEWEST_FIRST_ARCHIVES'])
  246. # and generate the output :)
  247. # order the categories per name
  248. self.categories = list(self.categories.items())
  249. self.categories.sort(reverse=self.settings['REVERSE_CATEGORY_ORDER'])
  250. self.authors = list(self.authors.items())
  251. self.authors.sort()
  252. self._update_context(('articles', 'dates', 'categories', 'authors'))
  253. # Disabled for 3.3 compatibility for now, great.
  254. # self.save_cache()
  255. # self.readers.save_cache()
  256. # And finish.
  257. # signals.article_generator_finalized.send(self)
  258. def get_generators(pelican_object):
  259. return MboxGenerator
  260. def register():
  261. signals.initialized.connect(init_default_config)
  262. signals.get_generators.connect(get_generators)