PageRenderTime 46ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/scripts/add_links_to_ob_export.py

https://github.com/wangmxf/lesswrong
Python | 100 lines | 99 code | 1 blank | 0 comment | 1 complexity | 4bc455ae17b6cf59f8ad28b80775d286 MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-2.1
  1. import os
  2. import sys
  3. import yaml
  4. import re
  5. kill_whitespace_re = re.compile('\s')
  6. kill_entities_re = re.compile('&#?[a-z0-9]{1,4};')
  7. def kill_whitespace(body):
  8. body = kill_whitespace_re.sub('', body)
  9. body = kill_entities_re.sub('', body)
  10. body = body.replace('<p>', '')
  11. body = body.replace('</p>', '')
  12. body = body.replace('<br/>', '')
  13. return body
  14. if __name__ == '__main__':
  15. if len(sys.argv) <= 4:
  16. print 'Usage: %s <export_file> <api_file> <user_map> <outputfile>' % os.path.basename(sys.argv[0])
  17. print
  18. print ' Uses the api_file to supplement the export_file with permalinks.'
  19. print ' Writes the result to outputfile.'
  20. sys.exit(-1)
  21. export_file = open(sys.argv[1])
  22. api_file = open(sys.argv[2])
  23. mapfile = open(sys.argv[3])
  24. output_file = open(sys.argv[4], 'w')
  25. mappings = yaml.load(api_file, Loader=yaml.CLoader)
  26. export = yaml.load(export_file, Loader=yaml.CLoader)
  27. # Load the user mapping dict
  28. user_map = yaml.load(mapfile, Loader=yaml.CLoader)
  29. # Turn the mappings into a lookup table on title and content
  30. post_mapping = {}
  31. title_mapping = {}
  32. for post in mappings:
  33. title = post['title']
  34. body = post['description'] + post['mt_text_more']
  35. if not isinstance(body, unicode):
  36. body = unicode(body, 'utf-8')
  37. if not isinstance(title, unicode):
  38. title = unicode(title, 'utf-8')
  39. key = (kill_whitespace(body), kill_whitespace(title))
  40. post_mapping[key] = post
  41. title_mapping[kill_whitespace(title)] = key
  42. # Scan the export file
  43. new_export = []
  44. for entry in export:
  45. if 'Eliezer Yudkowsky' not in entry['author']:
  46. continue
  47. if entry['status'] != 'Publish':
  48. continue
  49. # Get the title and do a lookup on the permalink
  50. body = entry['description'] + entry['mt_text_more']
  51. body = body.decode('utf-8')
  52. title = entry['title']
  53. title = title.decode('utf-8')
  54. print title
  55. key = (kill_whitespace(body), kill_whitespace(title))
  56. try:
  57. api_post = post_mapping[key]
  58. except KeyError:
  59. print title_mapping[kill_whitespace(title)]
  60. print
  61. print key
  62. print
  63. import difflib
  64. d = difflib.Differ()
  65. diff = d.compare(title_mapping[kill_whitespace(title)], key)
  66. import pprint
  67. pprint.pprint(list(diff))
  68. raise
  69. new_entry = entry
  70. new_entry['permalink'] = api_post['permaLink']
  71. new_entry['description'] = api_post['description']
  72. new_entry['mt_text_more'] = api_post['mt_text_more']
  73. new_entry['authorEmail'] = user_map.get(new_entry['author'], '').lower()
  74. # Process comments
  75. comments = new_entry.get('comments', [])
  76. for comment in comments:
  77. if not comment['authorEmail']:
  78. comment['authorEmail'] = user_map.get(comment['author'], '').lower()
  79. else:
  80. comment['authorEmail'] = comment['authorEmail'].lower()
  81. new_export.append(new_entry)
  82. # Print out the result
  83. yaml.dump(new_export, output_file, Dumper=yaml.CDumper)