PageRenderTime 50ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/scripts/fix_bare_links.py

https://github.com/wangmxf/lesswrong
Python | 81 lines | 57 code | 18 blank | 6 comment | 14 complexity | 3209d7d19b08bc9f8c7e4c69f93ac8e9 MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-2.1
  1. import re
  2. import sys
  3. import codecs
  4. bare_link_re = re.compile(r'([^-A-Z0-9+&@#/%?=~_|!:,.;">]|^)(/lw/[^/]+/[^/]+/)([^-A-Z0-9+&@#/%?=~_|!:;"<]|$)')
  5. linked_bare_link_re = re.compile(r'(<a href="[^"]+">)(/lw/[^/]+/[^/]+[-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%=~_|]</a>)')
  6. spaces_around_anchor_re = re.compile(r'<a href\s*=\s+"?([^">]+)"?\s*>', re.IGNORECASE)
  7. no_quotes_on_anchor_re = re.compile(r'<a href=([^>\'"]+)>([^<]+)</a>', re.IGNORECASE)
  8. single_quotes_on_anchor_re = re.compile(r'<a href=\'([^>]+)\'>')
  9. well_formed_uppercase_re = re.compile(r'<A [Hh][Rr][Ee][Ff]="([^"]+)">([^<]+)</A>')
  10. def sub_group_1(match):
  11. return "<a href=\"%s\">" % match.group(1)
  12. def sub_with_end_tag(match):
  13. return "<a href=\"%s\">%s</a>" % (match.group(1), match.group(2))
  14. def wrap_bare_link(match):
  15. return '%s<a href="%s">http://lesswrong.com%s</a>%s' % (match.group(1), match.group(2), match.group(2), match.group(3))
  16. def add_host_to_linked_bare_link(match):
  17. return match.group(1) + 'http://lesswrong.com' + match.group(2)
  18. def rewrite_bare_links(content):
  19. # Tidy up strange HTML first
  20. content = spaces_around_anchor_re.sub(sub_group_1, content)
  21. content = no_quotes_on_anchor_re.sub(sub_with_end_tag, content)
  22. content = single_quotes_on_anchor_re.sub(sub_group_1, content)
  23. content = well_formed_uppercase_re.sub(sub_with_end_tag, content)
  24. # Fix bare links
  25. content = bare_link_re.sub(wrap_bare_link, content)
  26. content = linked_bare_link_re.sub(add_host_to_linked_bare_link, content)
  27. return content
  28. def fix_bare_links(apply=False):
  29. from r2.models import Comment
  30. from r2.lib.db.thing import NotFound
  31. fbefore = codecs.open('fix_bare_links_before.txt', 'w', 'utf-8')
  32. fafter = codecs.open('fix_bare_links_after.txt', 'w', 'utf-8')
  33. comment_id = 1
  34. try:
  35. # The comments are retrieved like this to prevent the API from
  36. # attempting to load all comments at once and then iterating over them
  37. while True:
  38. comment = Comment._byID(comment_id, data=True)
  39. if (hasattr(comment, 'ob_imported') and comment.ob_imported) and (hasattr(comment, 'is_html') and comment.is_html):
  40. body = comment.body
  41. if isinstance(body, str):
  42. try:
  43. body = body.decode('utf-8')
  44. except UnicodeDecodeError:
  45. print >>sys.stderr, "UnicodeDecodeError, using 'ignore' error mode, comment: %d" % comment._id
  46. body = body.decode('utf-8', errors='ignore')
  47. new_content = rewrite_bare_links(body)
  48. if new_content != body:
  49. print >>fbefore, body
  50. print >>fafter, new_content
  51. if apply:
  52. comment.body = new_content
  53. comment._commit()
  54. try:
  55. print >>sys.stderr, "Rewrote comment %s" % comment.make_permalink_slow().encode('utf-8')
  56. except UnicodeError:
  57. print >>sys.stderr, "Rewrote comment with id: %d" % comment._id
  58. comment_id += 1
  59. except NotFound:
  60. # Assumes that comment ids are sequential and never deleted
  61. # (which I believe to true) -- wjm
  62. print >>sys.stderr, "Comment %d not found, exiting" % comment_id
  63. return