PageRenderTime 36ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/r2/r2/lib/importer.py

https://github.com/wangmxf/lesswrong
Python | 291 lines | 282 code | 4 blank | 5 comment | 0 complexity | e8f459b67d57b2a5ffddbc7af4e3dcc8 MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-2.1
import datetime
import os
import re
import sys
import urlparse

import pytz
import yaml

from random import Random

from r2.models import Link, Comment, Account, Subreddit
from r2.models.account import AccountExists, register
from r2.lib.db.thing import NotFound
  12. ###########################
  13. # Constants
  14. ###########################
  15. MAX_RETRIES = 100
  16. # Constants for the characters to compose a password from.
  17. # Easilty confused characters like I and l, 0 and O are omitted
  18. PASSWORD_NUMBERS='123456789'
  19. PASSWORD_LOWER_CHARS='abcdefghjkmnpqrstuwxz'
  20. PASSWORD_UPPER_CHARS='ABCDEFGHJKMNPQRSTUWXZ'
  21. PASSWORD_OTHER_CHARS='@#$%^&*'
  22. ALL_PASSWORD_CHARS = ''.join([PASSWORD_NUMBERS,PASSWORD_LOWER_CHARS,PASSWORD_UPPER_CHARS,PASSWORD_OTHER_CHARS])
  23. DATE_FORMAT = '%m/%d/%Y %I:%M:%S %p'
  24. INPUT_TIMEZONE = pytz.timezone('America/New_York')
  25. rng = Random()
  26. def generate_password():
  27. password = []
  28. for i in range(8):
  29. password.append(rng.choice(ALL_PASSWORD_CHARS))
  30. return ''.join(password)
  31. class Importer(object):
  32. def __init__(self, url_handler=None):
  33. """Constructs an importer that takes a data structure based on a yaml file.
  34. Args:
  35. url_handler: A optional URL transformation function that will be
  36. called with urls detected in post and comment bodies.
  37. """
  38. self.url_handler = url_handler if url_handler else self._default_url_handler
  39. self.username_mapping = {}
  40. @staticmethod
  41. def _default_url_handler(match):
  42. return match.group()
  43. def process_comment(self, comment_data, comment, post):
  44. # Prepare data for import
  45. ip = '127.0.0.1'
  46. if comment_data:
  47. naive_date = datetime.datetime.strptime(comment_data['dateCreated'], DATE_FORMAT)
  48. local_date = INPUT_TIMEZONE.localize(naive_date, is_dst=False) # Pick the non daylight savings time
  49. utc_date = local_date.astimezone(pytz.utc)
  50. # Determine account to use for this comment
  51. account = self._get_or_create_account(comment_data['author'], comment_data['authorEmail'])
  52. if comment_data and not comment:
  53. # Create new comment
  54. comment, inbox_rel = Comment._new(account, post, None, comment_data['body'], ip, date=utc_date)
  55. comment.is_html = True
  56. comment.ob_imported = True
  57. comment._commit()
  58. elif comment_data and comment:
  59. # Overwrite existing comment
  60. comment.author_id = account._id
  61. comment.body = comment_data['body']
  62. comment.ip = ip
  63. comment._date = utc_date
  64. comment.is_html = True
  65. comment.ob_imported = True
  66. comment._commit()
  67. elif not comment_data and comment:
  68. # Not enough comment data being imported to overwrite all comments
  69. print 'WARNING: More comments in lesswrong than we are importing, ignoring additional comment in lesswrong'
  70. kill_tags_re = re.compile(r'</?[iub]>')
  71. transform_categories_re = re.compile(r'[- ]')
  72. def process_post(self, post_data, sr):
  73. # Prepare data for import
  74. title = self.kill_tags_re.sub('', post_data['title'])
  75. article = u'%s%s' % (post_data['description'],
  76. Link._more_marker + post_data['mt_text_more'] if post_data['mt_text_more'] else u'')
  77. ip = '127.0.0.1'
  78. tags = [self.transform_categories_re.sub('_', tag.lower()) for tag in post_data.get('category', [])]
  79. naive_date = datetime.datetime.strptime(post_data['dateCreated'], DATE_FORMAT)
  80. local_date = INPUT_TIMEZONE.localize(naive_date, is_dst=False) # Pick the non daylight savings time
  81. utc_date = local_date.astimezone(pytz.utc)
  82. # Determine account to use for this post
  83. account = self._get_or_create_account(post_data['author'], post_data['authorEmail'])
  84. # Look for an existing post created due to a previous import
  85. post = self._query_post(Link.c.ob_permalink == post_data['permalink'])
  86. if not post:
  87. # Create new post
  88. post = Link._submit(title, article, account, sr, ip, tags, date=utc_date)
  89. post.blessed = True
  90. post.comment_sort_order = 'old'
  91. post.ob_permalink = post_data['permalink']
  92. post._commit()
  93. else:
  94. # Update existing post
  95. post.title = title
  96. post.article = article
  97. post.author_id = account._id
  98. post.sr_id = sr._id
  99. post.ip = ip
  100. post.set_tags(tags)
  101. post._date = utc_date
  102. post.blessed = True
  103. post.comment_sort_order = 'old'
  104. post._commit()
  105. # Process each comment for this post
  106. comments = self._query_comments(Comment.c.link_id == post._id, Comment.c.ob_imported == True)
  107. [self.process_comment(comment_data, comment, post)
  108. for comment_data, comment in map(None, post_data.get('comments', []), comments)]
  109. def substitute_ob_url(self, url):
  110. try:
  111. url = self.post_mapping[url].url
  112. except KeyError:
  113. pass
  114. return url
  115. # Borrowed from http://stackoverflow.com/questions/161738/what-is-the-best-regular-expression-to-check-if-a-string-is-a-valid-url/163684#163684
  116. url_re = re.compile(r"""(?:https?|ftp|file)://[-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%=~_|]""", re.IGNORECASE)
  117. def rewrite_ob_urls(self, text):
  118. if text:
  119. if isinstance(text, str):
  120. text = text.decode('utf-8')
  121. # Double decode needed to handle some wierd characters
  122. text = text.encode('utf-8')
  123. text = self.url_re.sub(lambda match: self.substitute_ob_url(match.group()), text)
  124. return text
  125. def post_process_post(self, post):
  126. """Perform post processsing to rewrite URLs and generate mapping
  127. between old and new permalinks"""
  128. post.article = self.rewrite_ob_urls(post.article)
  129. post._commit()
  130. comments = Comment._query(Comment.c.link_id == post._id, data = True)
  131. for comment in comments:
  132. comment.body = self.rewrite_ob_urls(comment.body)
  133. comment._commit()
  134. def _post_process(self, rewrite_map_file):
  135. def unicode_safe(text):
  136. if isinstance(text, unicode):
  137. return text.encode('utf-8')
  138. else:
  139. return text
  140. posts = list(Link._query(Link.c.ob_permalink != None, data = True))
  141. # Generate a mapping between ob permalinks and imported posts
  142. self.post_mapping = {}
  143. for post in posts:
  144. self.post_mapping[post.ob_permalink] = post
  145. # Write out the rewrite map
  146. for old_url, post in self.post_mapping.iteritems():
  147. ob_url = urlparse.urlparse(old_url)
  148. new_url = post.canonical_url
  149. try:
  150. rewrite_map_file.write("%s %s\n" % (unicode_safe(ob_url.path), unicode_safe(new_url)))
  151. except UnicodeEncodeError, uee:
  152. print "Unable to write to rewrite map file:"
  153. print unicode_safe(ob_url.path)
  154. print unicode_safe(new_url)
  155. # Update URLs in the posts and comments
  156. print 'Post processing imported content'
  157. for post in posts:
  158. self.post_process_post(post)
  159. def import_into_subreddit(self, sr, data, rewrite_map_file):
  160. for post_data in data:
  161. try:
  162. print post_data['title']
  163. self.process_post(post_data, sr)
  164. except Exception, e:
  165. print 'Unable to create post:\n%s\n%s\n%s' % (type(e), e, post_data)
  166. raise
  167. self._post_process(rewrite_map_file)
  168. def _query_account(self, *args):
  169. account = None
  170. kwargs = {'data': True}
  171. q = Account._query(*args, **kwargs)
  172. accounts = list(q)
  173. if accounts:
  174. account = accounts[0]
  175. return account
  176. def _query_post(self, *args):
  177. post = None
  178. kwargs = {'data': True}
  179. q = Link._query(*args, **kwargs)
  180. posts = list(q)
  181. if posts:
  182. post = posts[0]
  183. return post
  184. def _query_comments(self, *args):
  185. kwargs = {'data': True}
  186. q = Comment._query(*args, **kwargs)
  187. comments = list(q)
  188. return comments
  189. def _username_from_name(self, name):
  190. """Convert a name into a username"""
  191. return name.replace(' ', '_')
  192. def _find_account_for(self, name, email):
  193. """Try to find an existing account using derivations of the name"""
  194. try:
  195. # Look for an account we have cached
  196. account = self.username_mapping[(name, email)]
  197. except KeyError:
  198. # Look for an existing account that was created due to a previous import
  199. account = self._query_account(Account.c.ob_account_name == name,
  200. Account.c.email == email)
  201. if not account:
  202. # Look for an existing account based on derivations of the name
  203. candidates = (
  204. name,
  205. name.replace(' ', ''),
  206. self._username_from_name(name)
  207. )
  208. account = None
  209. for candidate in candidates:
  210. account = self._query_account(Account.c.name == candidate,
  211. Account.c.email == email)
  212. if account:
  213. account.ob_account_name = name
  214. account._commit()
  215. break
  216. # Cache the result for next time
  217. self.username_mapping[(name, email)] = account
  218. if not account:
  219. raise NotFound
  220. return account
  221. def _get_or_create_account(self, full_name, email):
  222. try:
  223. account = self._find_account_for(full_name, email)
  224. except NotFound:
  225. retry = 2 # First retry will by name2
  226. name = self._username_from_name(full_name)
  227. username = name
  228. while True:
  229. # Create a new account
  230. try:
  231. account = register(username, generate_password(), email)
  232. account.ob_account_name = full_name
  233. account._commit()
  234. except AccountExists:
  235. # This username is taken, generate another, but first limit the retries
  236. if retry > MAX_RETRIES:
  237. raise StandardError("Unable to create account for '%s' after %d attempts" % (full_name, retry - 1))
  238. else:
  239. # update cache with the successful account
  240. self.username_mapping[(full_name, email)] = account
  241. break
  242. username = "%s%d" % (name, retry)
  243. retry += 1
  244. return account