PageRenderTime 68ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/_controllers/blog/post.py

https://github.com/langner/mmqc
Python | 342 lines | 317 code | 14 blank | 11 comment | 1 complexity | 03ae67d33db01b8634a5bd89d667a03d MD5 | raw file
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """
  4. post.py parses post sources from the ./_post directory.
  5. """
  6. __author__ = "Ryan McGuire (ryan@enigmacurry.com)"
  7. __date__ = "Mon Feb 2 21:21:04 2009"
  8. import os
  9. import sys
  10. import datetime
  11. import re
  12. import operator
  13. import urlparse
  14. import hashlib
  15. import codecs
  16. import pytz
  17. import yaml
  18. import logging
  19. import BeautifulSoup
  20. import blogofile_bf as bf
  21. logger = logging.getLogger("blogofile.post")
  22. config = bf.config.controllers.blog.post
  23. config.mod = sys.modules[globals()["__name__"]]
  24. # These are all the Blogofile reserved field names for posts. It is not
  25. # recommended that users re-use any of these field names for purposes other
  26. # than the one stated.
  27. reserved_field_names = {
  28. "title" :"A one-line free-form title for the post",
  29. "date" :"The date that the post was originally created",
  30. "updated" :"The date that the post was last updated",
  31. "categories" :"A list of categories that the post pertains to, "\
  32. "each seperated by commas",
  33. "tags" :"A list of tags that the post pertains to, "\
  34. "each seperated by commas",
  35. "permalink" :"The full permanent URL for this post. "\
  36. "Automatically created if not provided",
  37. "path" :"The path from the permalink of the post",
  38. "guid" :"A unique hash for the post, if not provided it "\
  39. "is assumed that the permalink is the guid",
  40. "slug" :"The title part of the URL for the post, if not "\
  41. "provided it is automatically generated from the title."\
  42. "It is not used if permalink does not contain :title",
  43. "author" :"The name of the author of the post",
  44. "filters" :"The filter chain to apply to the entire post. "\
  45. "If not specified, a default chain based on the file extension is "\
  46. "applied. If set to 'None' it disables all filters, even default ones.",
  47. "filter" :"synonym for filters",
  48. "draft" :"If 'true' or 'True', the post is considered to be only a "\
  49. "draft and not to be published.",
  50. "source" :"Reserved internally",
  51. "yaml" :"Reserved internally",
  52. "content" :"Reserved internally",
  53. "filename" :"Reserved internally"
  54. }
  55. class PostParseException(Exception):
  56. def __init__(self, value):
  57. self.value = value
  58. def __str__(self):
  59. return repr(self.value)
  60. class Post(object):
  61. """
  62. Class to describe a blog post and associated metadata
  63. """
  64. def __init__(self, source, filename="Untitled"):
  65. self.source = source
  66. self.yaml = None
  67. self.title = None
  68. self.__timezone = bf.config.controllers.blog.timezone
  69. self.date = None
  70. self.updated = None
  71. self.categories = set()
  72. self.tags = set()
  73. self.permalink = None
  74. self.content = u""
  75. self.excerpt = u""
  76. self.filename = filename
  77. self.author = ""
  78. self.guid = None
  79. self.slug = None
  80. self.draft = False
  81. self.filters = None
  82. self.__parse()
  83. self.__post_process()
  84. def __repr__(self): #pragma: no cover
  85. return u"<Post title='{0}' date='{1}'>".format(
  86. self.title, self.date.strftime("%Y/%m/%d %H:%M:%S"))
  87. def __parse(self):
  88. """Parse the yaml and fill fields"""
  89. yaml_sep = re.compile("^---$", re.MULTILINE)
  90. content_parts = yaml_sep.split(self.source, maxsplit=2)
  91. if len(content_parts) < 2:
  92. raise PostParseException(u"{0}: Post has no YAML section".format(
  93. self.filename))
  94. else:
  95. #Extract the yaml at the top
  96. self.__parse_yaml(content_parts[1])
  97. post_src = content_parts[2]
  98. self.__apply_filters(post_src)
  99. #Do post excerpting
  100. self.__parse_post_excerpting()
  101. def __apply_filters(self, post_src):
  102. """Apply filters to the post"""
  103. #Apply block level filters (filters on only part of the post)
  104. # TODO: block level filters on posts
  105. #Apply post level filters (filters on the entire post)
  106. #If filter is unspecified, use the default filter based on
  107. #the file extension:
  108. if self.filters is None:
  109. try:
  110. file_extension = os.path.splitext(self.filename)[-1][1:]
  111. self.filters = bf.config.controllers.blog.post_default_filters[
  112. file_extension]
  113. except KeyError:
  114. self.filters = []
  115. self.content = bf.filter.run_chain(self.filters, post_src)
  116. def __parse_post_excerpting(self):
  117. if bf.config.controllers.blog.post_excerpts.enabled:
  118. length = bf.config.controllers.blog.post_excerpts.word_length
  119. try:
  120. self.excerpt = bf.config.post_excerpt(self.content, length)
  121. except AttributeError:
  122. self.excerpt = self.__excerpt(length)
  123. def __excerpt(self, num_words=50):
  124. #Default post excerpting function
  125. #Can be overridden in _config.py by
  126. #defining post_excerpt(content,num_words)
  127. if len(self.excerpt) == 0:
  128. """Retrieve excerpt from article"""
  129. s = BeautifulSoup.BeautifulSoup(self.content)
  130. # get rid of javascript, noscript and css
  131. [[tree.extract() for tree in s(elem)] for elem in (
  132. 'script', 'noscript', 'style')]
  133. # get rid of doctype
  134. subtree = s.findAll(text=re.compile("DOCTYPE|xml"))
  135. [tree.extract() for tree in subtree]
  136. # remove headers
  137. [[tree.extract() for tree in s(elem)] for elem in (
  138. 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')]
  139. text = ''.join(s.findAll(text=True))\
  140. .replace("\n", "").split(" ")
  141. return " ".join(text[:num_words]) + '...'
  142. def __post_process(self):
  143. # fill in empty default value
  144. if not self.title:
  145. self.title = u"Untitled - {0}".format(
  146. datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
  147. if not self.slug:
  148. self.slug = re.sub("[ ?]", "-", self.title).lower()
  149. if not self.date:
  150. self.date = datetime.datetime.now(pytz.timezone(self.__timezone))
  151. if not self.updated:
  152. self.updated = self.date
  153. if not self.categories or len(self.categories) == 0:
  154. self.categories = set([Category('Uncategorized')])
  155. if not self.permalink and \
  156. bf.config.controllers.blog.auto_permalink.enabled:
  157. self.permalink = bf.config.site.url.rstrip("/") + \
  158. bf.config.controllers.blog.auto_permalink.path
  159. self.permalink = \
  160. re.sub(":blog_path", bf.config.blog.path, self.permalink)
  161. self.permalink = \
  162. re.sub(":year", self.date.strftime("%Y"), self.permalink)
  163. self.permalink = \
  164. re.sub(":month", self.date.strftime("%m"), self.permalink)
  165. self.permalink = \
  166. re.sub(":day", self.date.strftime("%d"), self.permalink)
  167. self.permalink = \
  168. re.sub(":title", self.slug, self.permalink)
  169. # TODO: slugification should be abstracted out somewhere reusable
  170. self.permalink = re.sub(
  171. ":filename", re.sub(
  172. "[ ?]", "-", self.filename).lower(), self.permalink)
  173. # Generate sha hash based on title
  174. self.permalink = re.sub(":uuid", hashlib.sha1(
  175. self.title.encode('utf-8')).hexdigest(), self.permalink)
  176. logger.debug(u"Permalink: {0}".format(self.permalink))
  177. def __parse_yaml(self, yaml_src):
  178. y = yaml.load(yaml_src)
  179. # Load all the fields that require special processing first:
  180. fields_need_processing = ('permalink', 'guid', 'date', 'updated',
  181. 'categories', 'tags', 'draft')
  182. try:
  183. self.permalink = y['permalink']
  184. if self.permalink.startswith("/"):
  185. self.permalink = urlparse.urljoin(bf.config.site.url,
  186. self.permalink)
  187. #Ensure that the permalink is for the same site as bf.config.site.url
  188. if not self.permalink.startswith(bf.config.site.url):
  189. raise PostParseException(u"{0}: permalink for a different site"
  190. " than configured".format(self.filename))
  191. logger.debug(u"path from permalink: {0}".format(self.path))
  192. except KeyError:
  193. pass
  194. try:
  195. self.guid = y['guid']
  196. except KeyError:
  197. self.guid = self.permalink
  198. try:
  199. self.date = pytz.timezone(self.__timezone).localize(
  200. datetime.datetime.strptime(y['date'], config.date_format))
  201. except KeyError:
  202. pass
  203. try:
  204. self.updated = pytz.timezone(self.__timezone).localize(
  205. datetime.datetime.strptime(y['updated'], config.date_format))
  206. except KeyError:
  207. pass
  208. try:
  209. self.categories = set([Category(x.strip()) for x in \
  210. y['categories'].split(",")])
  211. except:
  212. pass
  213. try:
  214. self.tags = set([x.strip() for x in y['tags'].split(",")])
  215. except:
  216. pass
  217. try:
  218. self.filters = y['filter'] #filter is a synonym for filters
  219. except KeyError:
  220. pass
  221. try:
  222. if y['draft']:
  223. self.draft = True
  224. logger.info(u"Post {0} is set to draft, "
  225. "ignoring this post".format(self.filename))
  226. else:
  227. self.draft = False
  228. except KeyError:
  229. self.draft = False
  230. # Load the rest of the fields that don't need processing:
  231. for field, value in y.items():
  232. if field not in fields_need_processing:
  233. setattr(self,field,value)
  234. def permapath(self):
  235. """Get just the path portion of a permalink"""
  236. return urlparse.urlparse(self.permalink)[2]
  237. def __cmp__(self, other_post):
  238. "Posts should be comparable by date"
  239. return cmp(self.date, other_post.date)
  240. def __eq__(self, other_post):
  241. return self is other_post
  242. def __getattr__(self, name):
  243. if name == "path":
  244. #Always generate the path from the permalink
  245. return self.permapath()
  246. else:
  247. raise AttributeError, name
  248. class Category(object):
  249. def __init__(self, name):
  250. self.name = unicode(name)
  251. # TODO: slugification should be abstracted out somewhere reusable
  252. # TODO: consider making url_name and path read-only properties?
  253. self.url_name = self.name.lower().replace(" ", "-")
  254. self.path = bf.util.site_path_helper(
  255. bf.config.controllers.blog.path,
  256. bf.config.controllers.blog.category_dir,
  257. self.url_name)
  258. def __eq__(self, other):
  259. if self.name == other.name:
  260. return True
  261. return False
  262. def __hash__(self):
  263. return hash(self.name)
  264. def __repr__(self):
  265. return self.name
  266. def __cmp__(self, other):
  267. return cmp(self.name, other.name)
  268. def parse_posts(directory):
  269. """Retrieve all the posts from the directory specified.
  270. Returns a list of the posts sorted in reverse by date."""
  271. posts = []
  272. post_filename_re = re.compile(
  273. ".*((\.textile$)|(\.markdown$)|(\.org$)|(\.html$)|(\.txt$)|(\.rst$))")
  274. if not os.path.isdir("_posts"):
  275. logger.warn("This site has no _posts directory.")
  276. return []
  277. post_paths = [f.decode("utf-8") for f in bf.util.recursive_file_list(
  278. directory, post_filename_re) if post_filename_re.match(f)]
  279. for post_path in post_paths:
  280. post_fn = os.path.split(post_path)[1]
  281. logger.debug(u"Parsing post: {0}".format(post_path))
  282. #IMO codecs.open is broken on Win32.
  283. #It refuses to open files without replacing newlines with CR+LF
  284. #reverting to regular open and decode:
  285. try:
  286. src = open(post_path, "r").read().decode(
  287. bf.config.controllers.blog.post_encoding)
  288. except:
  289. logger.exception(u"Error reading post: {0}".format(post_path))
  290. raise
  291. try:
  292. p = Post(src, filename=post_fn)
  293. except PostParseException as e:
  294. logger.warning(u"{0} : Skipping this post.".format(e.value))
  295. continue
  296. #Exclude some posts
  297. if not (p.permalink is None or p.draft is True):
  298. posts.append(p)
  299. posts.sort(key=operator.attrgetter('date'), reverse=True)
  300. return posts