PageRenderTime 51ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/scripts/import_missing_comments.py

https://github.com/wangmxf/lesswrong
Python | 224 lines | 218 code | 4 blank | 2 comment | 5 complexity | 436797f2b74e5ef25180e9939b75f891 MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-2.1
  1. import re
  2. import yaml
  3. import pytz
  4. import urlparse
  5. import datetime
  6. from random import Random
  7. from BeautifulSoup import BeautifulSoup
  8. from r2.models import Link,Comment,Account,Subreddit,FakeAccount
  9. from r2.models.account import AccountExists, register
  10. from r2.lib.db.thing import NotFound
  11. DATE_FORMAT = '%m/%d/%Y %I:%M:%S %p'
  12. INPUT_TIMEZONE = pytz.timezone('America/New_York')
  13. MAX_RETRIES = 100
  14. dryrun = True
  15. username_mapping = {}
  16. # Constants for the characters to compose a password from.
  17. # Easilty confused characters like I and l, 0 and O are omitted
  18. PASSWORD_NUMBERS='123456789'
  19. PASSWORD_LOWER_CHARS='abcdefghjkmnpqrstuwxz'
  20. PASSWORD_UPPER_CHARS='ABCDEFGHJKMNPQRSTUWXZ'
  21. PASSWORD_OTHER_CHARS='@#$%^&*'
  22. ALL_PASSWORD_CHARS = ''.join([PASSWORD_NUMBERS,PASSWORD_LOWER_CHARS,PASSWORD_UPPER_CHARS,PASSWORD_OTHER_CHARS])
  23. rng = Random()
  24. def generate_password():
  25. password = []
  26. for i in range(8):
  27. password.append(rng.choice(ALL_PASSWORD_CHARS))
  28. return ''.join(password)
  29. def comment_excerpt(comment):
  30. excerpt = comment['body'].replace("\n", '')[0:50]
  31. try:
  32. excerpt = "comment by '%s': %s" % (comment['author'].decode('utf-8').encode('utf-8'), excerpt.decode('utf-8').encode('utf-8'))
  33. except UnicodeError:
  34. excerpt = '*'
  35. return excerpt
  36. re_non_alphanum = re.compile(r'[^a-zA-Z0-9]*')
  37. def comment_exists(post, comment):
  38. # Check if this comment already exists using brutal compare on content
  39. # BeautifulSoup is used to parse as HTML in order to remove markup
  40. content = ''.join(BeautifulSoup(comment['body']).findAll(text=True))
  41. key = re_non_alphanum.sub('', content)
  42. existing_comments = Comment._query(Comment.c.link_id == post._id, Comment.c.ob_imported == True, data=True)
  43. for existing_comment in existing_comments:
  44. author = Account._byID(existing_comment.author_id, data=True)
  45. content = ''.join(BeautifulSoup(existing_comment.body).findAll(text=True))
  46. existing_key = re_non_alphanum.sub('', content)
  47. if key == existing_key:
  48. print " Skipping existing %s" % comment_excerpt(comment)
  49. return True
  50. # else:
  51. # print "%s *|NOT|* %s" % (key, existing_key)
  52. return False
  53. def get_or_create_account(name):
  54. try:
  55. # Look for an account we have cached
  56. account = username_mapping[name]
  57. except KeyError:
  58. # See if there's a previously imported account
  59. account = list(Account._query(Account.c.ob_account_name == name, data=True))
  60. if len(account) == 1:
  61. account = account[0]
  62. elif len(account) > 1:
  63. print " Got more than one account for OB username '%s', select one below:" % name
  64. for i in range(len(account)):
  65. email = account[i].email if hasattr(account[i], 'email') else ''
  66. print " %d. %s, %s" % (i, account[i].name, email)
  67. i += 1
  68. print " %d. Create new" % i
  69. i += 1
  70. print " %d. None, abort" % i
  71. max_choice = i
  72. choice = -1
  73. while choice < 0 or choice > max_choice:
  74. choice = raw_input("Enter selection: ")
  75. try:
  76. choice = int(choice)
  77. except ValueError:
  78. choice = -1
  79. if choice in range(len(account)):
  80. account = account[choice]
  81. elif choice == max_choice:
  82. raise Exception("Aborting")
  83. else:
  84. # Fall through to code below
  85. account = None
  86. else:
  87. # Try derivatives of the name that may exist
  88. candidates = (
  89. name,
  90. name.replace(' ', ''),
  91. name.replace(' ', '_')
  92. )
  93. for candidate in candidates:
  94. try:
  95. account = Account._by_name(candidate)
  96. except NotFound:
  97. continue
  98. if account:
  99. if not dryrun:
  100. account.ob_account_name = name
  101. account._commit()
  102. break
  103. # No account found, create a new one
  104. if not account:
  105. account = create_account(name)
  106. username_mapping[name] = account
  107. return account
  108. def create_account(full_name):
  109. name = full_name.replace(' ', '_')
  110. retry = 2 # First retry will by name2
  111. username = name
  112. while True:
  113. # Create a new account
  114. try:
  115. if dryrun:
  116. try:
  117. account = Account._by_name(username)
  118. if account:
  119. raise AccountExists
  120. except NotFound:
  121. account = FakeAccount()
  122. account.name = username
  123. else:
  124. account = register(username, generate_password(), None)
  125. account.ob_account_name = full_name
  126. account._commit()
  127. except AccountExists:
  128. # This username is taken, generate another, but first limit the retries
  129. if retry > MAX_RETRIES:
  130. raise StandardError("Unable to create account for '%s' after %d attempts" % (full_name, retry - 1))
  131. else:
  132. return account
  133. username = "%s%d" % (name, retry)
  134. retry += 1
  135. def process_comments_on_post(post, comments):
  136. for comment in comments:
  137. if comment_exists(post, comment):
  138. continue
  139. # Prepare data for import
  140. ip = '127.0.0.1'
  141. naive_date = datetime.datetime.strptime(comment['dateCreated'], DATE_FORMAT)
  142. local_date = INPUT_TIMEZONE.localize(naive_date, is_dst=False) # Pick the non daylight savings time
  143. utc_date = local_date.astimezone(pytz.utc)
  144. # Determine account to use for this comment
  145. account = get_or_create_account(comment['author'])
  146. if not dryrun:
  147. # Create new comment
  148. new_comment, inbox_rel = Comment._new(account, post, None, comment['body'], ip, date=utc_date)
  149. new_comment.is_html = True
  150. new_comment.ob_imported = True
  151. new_comment._commit()
  152. try:
  153. print " Imported as '%s' %s" % (account.name.decode('utf-8').encode('utf-8'), comment_excerpt(comment).decode('utf-8').encode('utf-8'))
  154. except UnicodeError:
  155. print " Imported comment"
  156. re_strip_path = re.compile(r'^/overcomingbias')
  157. def adjust_permalink(permalink):
  158. """Transform:
  159. http://robinhanson.typepad.com/overcomingbias/2008/12/evolved-desires.html
  160. into:
  161. http://www.overcomingbias.com/2008/12/evolved-desires.html"""
  162. # Adjust the permalink to match those that were imported
  163. (scheme, host, path, query, fragment) = urlparse.urlsplit(permalink)
  164. host = 'www.overcomingbias.com'
  165. path = re_strip_path.sub('', path, 1)
  166. return urlparse.urlunsplit((scheme, host, path, query, fragment))
  167. def import_missing_comments(filename, apply_changes=False):
  168. """Imports the comments from the supplied YAML"""
  169. missing_comments = yaml.load(open(filename), Loader=yaml.CLoader)
  170. global dryrun
  171. dryrun = not apply_changes
  172. total_posts = len(missing_comments)
  173. post_count = 0
  174. for post in missing_comments:
  175. if post['author'] != 'Eliezer Yudkowsky':
  176. # print "Skipping non-EY post (%s): %s" % (post['author'], post['permalink'])
  177. continue
  178. ob_permalink = adjust_permalink(post['permalink'])
  179. # Attempt to retrieve the post that was imported into Less Wrong
  180. imported_post = list(Link._query(Link.c.ob_permalink == ob_permalink, data=True))
  181. if len(imported_post) < 1:
  182. print "Unable to retrieve imported post: %s" % ob_permalink
  183. continue
  184. elif len(imported_post) > 1:
  185. print "Got more than one result for: %s" % ob_permalink
  186. raise Exception
  187. else:
  188. imported_post = imported_post[0]
  189. post_count += 1
  190. try:
  191. print "Importing (%d of %d) comments on: %s" % (post_count, total_posts, imported_post.canonical_url)
  192. except UnicodeError:
  193. print "Importing comments on post (%d of %d)"
  194. process_comments_on_post(imported_post, post['comments'])