# Source: /scripts/fix_imported_content_with_images.py
# Repository: https://github.com/wangmxf/lesswrong
# Possible license(s): MPL-2.0-no-copyleft-exception, LGPL-2.1
import re

try:
    import urlparse  # Python 2 module name
except ImportError:
    # Python 3 provides the same functions under urllib.parse.
    from urllib import parse as urlparse
  3. def print_change(old, new):
  4. print " %s => %s" % (old, new)
  5. interesting_hosts = set(['www.overcomingbias.com', 'robinhanson.typepad.com'])
  6. funny_imgs = {
  7. 'http://robinhanson.typepad.com/.a/6a00d8341c6a2c53ef010536c21d63970b-800wi': 'http://lesswrong.com/static/imported/6a00d8341c6a2c53ef010536c21d63970b-800wi.jpg',
  8. }
  9. ext_re = re.compile(r'.*\.(jpg|gif|png)$', re.IGNORECASE)
  10. path_re = re.compile(r'/(images|uncategorized)/(\d{4}/\d{2}/\d{2}/[^/]+)$')
  11. def substitute_ob_url(url):
  12. if url in funny_imgs:
  13. # Special case
  14. print_change(url, funny_imgs[url])
  15. return funny_imgs[url]
  16. (scheme, host, path, query, fragment) = urlparse.urlsplit(url)
  17. if host not in interesting_hosts:
  18. return url
  19. # Check if this is an image URL at OB
  20. match = ext_re.search(path) or ext_re.search(query)
  21. if match:
  22. match = path_re.search(match.group())
  23. if match:
  24. # Translate to new path
  25. host = 'lesswrong.com'
  26. path = '/static/imported/%s' % match.group(2)
  27. old_url = url
  28. url = urlparse.urlunsplit((scheme, host, path, '', ''))
  29. print_change(old_url, url)
  30. else:
  31. print " Got unexpected image url: %s" % url
  32. return url
  33. # Borrowed from the importer
  34. url_re = re.compile(r"""(?:https?|ftp|file)://[-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%=~_|]""", re.IGNORECASE)
  35. def process_content(html):
  36. if html:
  37. # if isinstance(text, str):
  38. # text = text.decode('utf-8')
  39. #
  40. # # Double decode needed to handle some wierd characters
  41. # text = text.encode('utf-8')
  42. html = url_re.sub(lambda match: substitute_ob_url(match.group()), html)
  43. return html
  44. # Main function
  45. def fix_images(dryrun=True):
  46. from r2.models import Link, Comment
  47. links = Link._query(Link.c.ob_permalink != None, data = True)
  48. for link in links:
  49. ob_url = link.ob_permalink.strip()
  50. print "Processing %s" % ob_url
  51. new_content = process_content(link.article)
  52. if not dryrun:
  53. link.article = new_content
  54. link._commit()
  55. comments = Comment._query(Comment.c.link_id == link._id, data = True)
  56. for comment in comments:
  57. new_content = process_content(comment.body)
  58. if not dryrun:
  59. comment.body = new_content
  60. comment._commit()