PageRenderTime 47ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/blogofile/site_init/blog_features/_controllers/blog/post.py

https://github.com/mw44118/blogofile
Python | 309 lines | 288 code | 10 blank | 11 comment | 1 complexity | 25514f53e15c0f1c02eb97d779380f19 MD5 | raw file
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """
  4. post.py parses post sources from the ./_posts directory.
  5. """
  6. __author__ = "Ryan McGuire (ryan@enigmacurry.com)"
  7. __date__ = "Mon Feb 2 21:21:04 2009"
  8. import os
  9. import sys
  10. import datetime
  11. import re
  12. import operator
  13. import urlparse
  14. import hashlib
  15. import codecs
  16. import pytz
  17. import yaml
  18. import logging
  19. import BeautifulSoup
  20. import blogofile_bf as bf
  21. logger = logging.getLogger("blogofile.post")
  22. config = bf.config.controllers.blog.post
  23. config.mod = sys.modules[globals()["__name__"]]
  24. # These are all the Blogofile reserved field names for posts. It is not
  25. # recommended that users re-use any of these field names for purposes other than the
  26. # one stated.
  27. reserved_field_names = {
  28. "title" :"A one-line free-form title for the post",
  29. "date" :"The date that the post was originally created",
  30. "updated" :"The date that the post was last updated",
  31. "categories" :"A list of categories that the post pertains to, "\
  32. "each seperated by commas",
  33. "tags" :"A list of tags that the post pertains to, "\
  34. "each seperated by commas",
  35. "permalink" :"The full permanent URL for this post. "\
  36. "Automatically created if not provided",
  37. "path" :"The path from the permalink of the post",
  38. "guid" :"A unique hash for the post, if not provided it "\
  39. "is assumed that the permalink is the guid",
  40. "author" :"The name of the author of the post",
  41. "filters" :"The filter chain to apply to the entire post. "\
  42. "If not specified, a default chain based on the file extension is "\
  43. "applied. If set to 'None' it disables all filters, even default ones.",
  44. "filter" :"synonym for filters",
  45. "draft" :"If 'true' or 'True', the post is considered to be only a "\
  46. "draft and not to be published.",
  47. "source" :"Reserved internally",
  48. "yaml" :"Reserved internally",
  49. "content" :"Reserved internally",
  50. "filename" :"Reserved internally"
  51. }
  52. class PostParseException(Exception):
  53. def __init__(self, value):
  54. self.value = value
  55. def __str__(self):
  56. return repr(self.value)
  57. class Post:
  58. """
  59. Class to describe a blog post and associated metadata
  60. """
  61. def __init__(self, source, filename="Untitled"):
  62. self.source = source
  63. self.yaml = None
  64. self.title = None
  65. self.__timezone = bf.config.controllers.blog.timezone
  66. self.date = None
  67. self.updated = None
  68. self.categories = set()
  69. self.tags = set()
  70. self.permalink = None
  71. self.content = u""
  72. self.excerpt = u""
  73. self.filename = filename
  74. self.author = ""
  75. self.guid = None
  76. self.draft = False
  77. self.filters = None
  78. self.__parse()
  79. self.__post_process()
  80. def __repr__(self): #pragma: no cover
  81. return "<Post title='%s' date='%s'>" % \
  82. (self.title, self.date.strftime("%Y/%m/%d %H:%M:%S"))
  83. def __parse(self):
  84. """Parse the yaml and fill fields"""
  85. yaml_sep = re.compile("^---$", re.MULTILINE)
  86. content_parts = yaml_sep.split(self.source, maxsplit=2)
  87. if len(content_parts) < 2:
  88. raise PostParseException(self.filename+": Post has no YAML section")
  89. else:
  90. #Extract the yaml at the top
  91. self.__parse_yaml(content_parts[1])
  92. post_src = content_parts[2]
  93. self.__apply_filters(post_src)
  94. #Do post excerpting
  95. self.__parse_post_excerpting()
  96. def __apply_filters(self, post_src):
  97. """Apply filters to the post"""
  98. #Apply block level filters (filters on only part of the post)
  99. # TODO: block level filters on posts
  100. #Apply post level filters (filters on the entire post)
  101. #If filter is unspecified, use the default filter based on
  102. #the file extension:
  103. if self.filters == None:
  104. try:
  105. file_extension = os.path.splitext(self.filename)[-1][1:]
  106. self.filters = bf.config.controllers.blog.post_default_filters[
  107. file_extension]
  108. except KeyError:
  109. self.filters = []
  110. self.content = bf.filter.run_chain(self.filters, post_src)
  111. def __parse_post_excerpting(self):
  112. if bf.config.controllers.blog.post_excerpts.enabled:
  113. try:
  114. self.excerpt = bf.config.post_excerpt(
  115. self.content,bf.config.controllers.blog.post_excerpts.word_length)
  116. except AttributeError:
  117. self.excerpt = self.__excerpt(bf.config.controllers.blog.post_excerpts.word_length)
  118. def __excerpt(self, num_words=50):
  119. #Default post excerpting function
  120. #Can be overridden in _config.py by
  121. #defining post_excerpt(content,num_words)
  122. if len(self.excerpt) == 0:
  123. """Retrieve excerpt from article"""
  124. s = BeautifulSoup.BeautifulSoup(self.content)
  125. # get rid of javascript, noscript and css
  126. [[tree.extract() for tree in s(elem)] for elem in (
  127. 'script','noscript','style')]
  128. # get rid of doctype
  129. subtree = s.findAll(text=re.compile("DOCTYPE|xml"))
  130. [tree.extract() for tree in subtree]
  131. # remove headers
  132. [[tree.extract() for tree in s(elem)] for elem in (
  133. 'h1','h2','h3','h4','h5','h6')]
  134. text = ''.join(s.findAll(text=True))\
  135. .replace("\n","").split(" ")
  136. return " ".join(text[:num_words]) + '...'
  137. def __post_process(self):
  138. # fill in empty default value
  139. if not self.title:
  140. self.title = u"Untitled - " + \
  141. datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  142. if not self.date:
  143. self.date = datetime.datetime.now(pytz.timezone(self.__timezone))
  144. if not self.updated:
  145. self.updated = self.date
  146. if not self.categories or len(self.categories) == 0:
  147. self.categories = set([Category('Uncategorized')])
  148. if not self.permalink and bf.config.controllers.blog.auto_permalink.enabled:
  149. self.permalink = bf.config.site.url.rstrip("/")+\
  150. bf.config.controllers.blog.auto_permalink.path
  151. self.permalink = re.sub(":year", self.date.strftime("%Y"),
  152. self.permalink)
  153. self.permalink = re.sub(":month", self.date.strftime("%m"),
  154. self.permalink)
  155. self.permalink = re.sub(":day", self.date.strftime("%d"),
  156. self.permalink)
  157. self.permalink = re.sub(
  158. ":title", re.sub("[ ?]","-",self.title).lower(),self.permalink)
  159. self.permalink = re.sub(
  160. ":filename", re.sub(
  161. "[ ?]","-", self.filename).lower(), self.permalink)
  162. # Generate sha hash based on title
  163. self.permalink = re.sub(":uuid", hashlib.sha1(
  164. self.title.encode('utf-8')).hexdigest(), self.permalink)
  165. logger.debug("Permalink: %s" % self.permalink)
  166. def __parse_yaml(self, yaml_src):
  167. y = yaml.load(yaml_src)
  168. # Load all the fields that require special processing first:
  169. fields_need_processing = ('permalink','guid','date','updated',
  170. 'categories','tags','draft')
  171. try:
  172. self.permalink = y['permalink']
  173. if self.permalink.startswith("/"):
  174. self.permalink = urlparse.urljoin(bf.config.site.url,self.permalink)
  175. #Ensure that the permalink is for the same site as bf.config.site.url
  176. if not self.permalink.startswith(bf.config.site.url):
  177. raise PostParseException(self.filename+": permalink for a different site"
  178. " than configured")
  179. logger.debug("path from permalink: "+self.path)
  180. except KeyError:
  181. pass
  182. try:
  183. self.guid = y['guid']
  184. except KeyError:
  185. self.guid = self.permalink
  186. try:
  187. self.date = pytz.timezone(self.__timezone).localize(
  188. datetime.datetime.strptime(y['date'],config.date_format))
  189. except KeyError:
  190. pass
  191. try:
  192. self.updated = pytz.timezone(self.__timezone).localize(
  193. datetime.datetime.strptime(y['updated'],config.date_format))
  194. except KeyError:
  195. pass
  196. try:
  197. self.categories = set([Category(x.strip()) for x in \
  198. y['categories'].split(",")])
  199. except:
  200. pass
  201. try:
  202. self.tags = set([x.strip() for x in y['tags'].split(",")])
  203. except:
  204. pass
  205. try:
  206. self.filters = y['filter'] #filter is a synonym for filters
  207. except KeyError:
  208. pass
  209. try:
  210. if y['draft']:
  211. self.draft = True
  212. logger.info("Post "+self.filename+
  213. " is set to draft, ignoring this post")
  214. else:
  215. self.draft = False
  216. except KeyError:
  217. self.draft = False
  218. # Load the rest of the fields that don't need processing:
  219. for field, value in y.items():
  220. if field not in fields_need_processing:
  221. setattr(self,field,value)
  222. def permapath(self):
  223. """Get just the path portion of a permalink"""
  224. return urlparse.urlparse(self.permalink)[2]
  225. def __cmp__(self, other_post):
  226. "Posts should be comparable by date"
  227. return cmp(self.date, other_post.date)
  228. def __eq__(self, other_post):
  229. return self is other_post
  230. def __getattr__(self, name):
  231. if name == "path":
  232. #Always generate the path from the permalink
  233. return self.permapath()
  234. else:
  235. raise AttributeError, name
  236. class Category:
  237. def __init__(self, name):
  238. self.name = unicode(name)
  239. self.url_name = self.name.lower().replace(" ","-")
  240. self.path = bf.util.site_path_helper(bf.config.controllers.blog.path,bf.config.controllers.blog.category_dir,self.url_name)
  241. def __eq__(self, other):
  242. if self.name == other.name:
  243. return True
  244. else:
  245. return False
  246. def __hash__(self):
  247. return hash(self.name)
  248. def __repr__(self):
  249. return self.name
  250. def __cmp__(self, other):
  251. return cmp(self.name, other.name)
  252. def __cmp__(self, other):
  253. return self is other
  254. def parse_posts(directory):
  255. """Retrieve all the posts from the directory specified.
  256. Returns a list of the posts sorted in reverse by date."""
  257. posts = []
  258. post_filename_re = re.compile(
  259. ".*((\.textile$)|(\.markdown$)|(\.org$)|(\.html$)|(\.txt$)|(\.rst$))")
  260. if not os.path.isdir("_posts"):
  261. logger.warn("This site has no _posts directory.")
  262. return []
  263. post_paths = [f for f in bf.util.recursive_file_list(
  264. directory, post_filename_re) if post_filename_re.match(f)]
  265. for post_path in post_paths:
  266. post_fn = os.path.split(post_path)[1]
  267. logger.debug("Parsing post: %s" % post_path)
  268. #IMO codecs.open is broken on Win32.
  269. #It refuses to open files without replacing newlines with CR+LF
  270. #reverting to regular open and decode:
  271. src = open(post_path,"r").read().decode(bf.config.controllers.blog.post_encoding)
  272. try:
  273. p = Post(src, filename=post_fn)
  274. except PostParseException as e:
  275. logger.warning(e.value+" : Skipping this post.")
  276. continue
  277. #Exclude some posts
  278. if not (p.permalink == None or p.draft == True):
  279. posts.append(p)
  280. posts.sort(key=operator.attrgetter('date'), reverse=True)
  281. return posts