PageRenderTime 44ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/apps/import_wxp/parseWp.py

http://ihere-blog.googlecode.com/
Python | 331 lines | 262 code | 34 blank | 35 comment | 37 complexity | 76b7fb94f333002be365faefa30deb76 MD5 | raw file
Possible License(s): LGPL-2.0, LGPL-3.0
  1. #encoding=UTF-8
  2. #http://www.eriksmartt.com/blog/archives/306
  3. #In Part 1 of this series, I described some of the motivation, and the components being used to build a new blog for myself. In this (lengthy) post, I'll address the solution I used to move my content archives from WordPress to the new app.
  4. #encoding=UTF-8
  5. import xml.etree.ElementTree as ET
  6. from google.appengine.ext import db
  7. from google.appengine.api import users
  8. from blog.models import *
  9. import datetime,logging,re,urllib,os,time
  10. from django.utils.encoding import force_unicode,smart_str
  11. from django.db.models import signals
  12. from optparse import OptionParser
  class Import(object):
      """Read a WordPress export XML file into plain dicts and lists.

      The file is parsed once in __init__; generate() then walks the tree and
      returns the converted items together with the channel-level category
      and tag lists.

      Attributes:
          tree: ElementTree parsed from the export file.
          wpns: "{uri}" prefix for elements in the WordPress export namespace.
          contentns: "{uri}" prefix for elements in the RSS content namespace.
          results: scratch dict describing the item currently being converted.
          items: converted items collected by generate().
      """

      def __init__(self, wordpress_xml_file):
          """Parse the export file.

          wordpress_xml_file -- file name or file object, handed to ET.parse().
          """
          self.tree = ET.parse(wordpress_xml_file)
          self.wpns = '{http://wordpress.org/export/1.0/}'
          self.contentns = "{http://purl.org/rss/1.0/modules/content/}"
          # register_namespace() expects the bare URI, so strip the braces of
          # the "{uri}" form that is used for tag lookups.
          ET.register_namespace('wp', self.wpns[1:-1])
          ET.register_namespace('content', self.contentns[1:-1])
          self.results = {}
          self.items = []

      @staticmethod
      def _child_text(parent, path):
          """Return the .text of the first *path* match, or None if there is none."""
          node = parent.find(path)
          if node is None:
              return None
          return node.text

      def make_comments(self, comments, post):
          """Store the approved comments of one item in results['comments'].

          comments -- iterable of wp:comment elements.
          post     -- unused; kept so existing callers keep working.

          A comment counts as approved when its wp:comment_approved text is a
          non-zero integer. Returns self.results.
          """
          approved = []
          for com in comments:
              try:
                  is_approved = int(com.findtext(self.wpns + 'comment_approved'))
              except (TypeError, ValueError):
                  # TypeError: element missing (None); ValueError: non-numeric
                  # text. Both are treated as "not approved".
                  is_approved = 0
              if not is_approved:
                  continue
              approved.append(dict(
                  author=com.findtext(self.wpns + 'comment_author'),
                  content=com.findtext(self.wpns + 'comment_content'),
                  email=com.findtext(self.wpns + 'comment_author_email'),
                  weburl=com.findtext(self.wpns + 'comment_author_url'),
                  author_IP=com.findtext(self.wpns + 'comment_author_IP'),
                  date=com.findtext(self.wpns + 'comment_date'),
                  date_gmt=com.findtext(self.wpns + 'comment_date_gmt'),
              ))
          self.results['comments'] = approved
          return self.results

      def make_categories_and_tags(self, categories):
          """Sort an item's <category> elements into tags and categories.

          categories -- iterable of <category> elements of one item.

          * no "domain" attribute: the text becomes results['category'];
          * domain == "tag": the stripped text is appended to results['tags'];
          * any other domain: a {'nicename', 'name'} dict is appended to
            results['categories'], read from the wp:category_nicename /
            wp:cat_name children, falling back to the "nicename" attribute
            and the element text when those children are absent.

          Elements without usable text are skipped instead of raising.
          Returns self.results.
          """
          tags = []
          named = []
          for cat in categories:
              text = (cat.text or '').strip()
              domain = cat.get('domain')
              if domain is None:
                  if text:
                      self.results['category'] = force_unicode(
                          urllib.unquote(smart_str(text)))
              elif domain == 'tag':
                  if text:
                      tags.append(text)
              else:
                  nicename = cat.findtext(self.wpns + 'category_nicename')
                  if nicename is None:
                      nicename = cat.get('nicename', '')
                  name = cat.findtext(self.wpns + 'cat_name')
                  if name is None:
                      name = text
                  named.append({'nicename': nicename.strip(),
                                'name': name.strip()})
          self.results['categories'] = named
          self.results['tags'] = tags
          return self.results

      def get_slug(self, linkstr):
          """Derive a slug from a permalink.

          The link is URL-unquoted; everything from the last ".html" on is
          dropped if present, otherwise trailing slashes are dropped, and the
          last path segment of what remains is returned. An empty or missing
          link yields ''.
          """
          if not linkstr:
              return ''
          linkstr = force_unicode(urllib.unquote(smart_str(linkstr.strip())))
          html_at = linkstr.rfind('.html')
          if html_at == -1:
              # Only strip a slash that is really trailing, so a permalink
              # without one still yields its own last segment.
              linkstr = linkstr.rstrip('/')
          else:
              linkstr = linkstr[:html_at]
          return os.path.basename(linkstr)

      def make_post(self, item):
          """Copy the basic fields of one <item> into self.results.

          Missing child elements produce None values. Returns self.results.
          """
          link = self._child_text(item, "link")
          self.results['link'] = link
          self.results['slug'] = self.get_slug(link)
          self.results['title'] = self._child_text(item, "title")
          self.results['pubDate'] = self._child_text(item, "pubDate")
          self.results['summary'] = self._child_text(item, "description")
          self.results['body'] = self._child_text(item, self.contentns + "encoded")
          self.results['post_date'] = self._child_text(item, self.wpns + "post_date")
          self.results['post_date_gmt'] = self._child_text(
              item, self.wpns + "post_date_gmt")
          return self.results

      def make_category_list(self):
          """Return [{'nicename', 'name'}, ...] for every channel-level wp:category."""
          return [
              {'nicename': category.findtext(self.wpns + 'category_nicename'),
               'name': category.findtext(self.wpns + 'cat_name')}
              for category in self.tree.findall("channel/%scategory" % self.wpns)
          ]

      def make_tag_list(self):
          """Return [{'slug', 'name'}, ...] for channel-level wp:tag elements.

          Tags lacking either a slug or a name are left out.
          """
          tag_list = []
          for tag in self.tree.findall("channel/%stag" % self.wpns):
              slug = tag.findtext(self.wpns + 'tag_slug')
              name = tag.findtext(self.wpns + 'tag_name')
              if slug and name:
                  tag_list.append({'slug': slug, 'name': name})
          return tag_list

      def generate(self):
          """Convert the whole export.

          Returns a tuple (items, category_list, tag_list) where items holds
          one dict per channel <item> with the keys filled in by make_post(),
          make_categories_and_tags() and make_comments().
          """
          category_list = self.make_category_list()
          tag_list = self.make_tag_list()
          # Start from a clean list so calling generate() again does not
          # duplicate the items of the previous run.
          self.items = []
          for item in self.tree.findall("channel/item"):
              self.results = {}
              post = self.make_post(item)
              self.make_categories_and_tags(item.findall("category"))
              self.make_comments(item.findall(self.wpns + 'comment'), post)
              self.items.append(self.results)
          self.results = {}
          return (self.items, category_list, tag_list)
  class Importer(object):
      """Store the data extracted by Import in the datastore models.

      Attributes:
          generater: the Import instance that parsed the export file.
          tree: the ElementTree held by that Import instance.
          items, category_list, tag_list: the result of Import.generate().
      """

      # Layout of the post/comment dates handled by this class.
      _DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

      def __init__(self, wordpress_xml_file):
          """Parse *wordpress_xml_file* and convert it right away."""
          # Import.__init__ registers the wp/content namespace prefixes, so
          # that is not repeated here.
          self.generater = Import(wordpress_xml_file)
          self.tree = self.generater.tree
          self.wpns = '{http://wordpress.org/export/1.0/}'
          self.contentns = "{http://purl.org/rss/1.0/modules/content/}"
          (self.items, self.category_list, self.tag_list) = self.generater.generate()

      @staticmethod
      def _decode(value):
          """URL-unquote *value* and return it as unicode text."""
          return force_unicode(urllib.unquote(smart_str(value)))

      def import2Gae(self):
          """Import categories, tags and then every post with its comments.

          The log_event_on_post_save handlers for Comment and Post are
          disconnected while the import runs and are reconnected afterwards,
          also when the import fails part-way.
          """
          signals.post_save.disconnect(log_event_on_post_save, sender=Comment)
          signals.post_save.disconnect(log_event_on_post_save, sender=Post)
          try:
              logging.info('importing categories...')
              category_list = self.__import_category_list()
              logging.info('%s categories imported...', len(category_list))
              logging.info('importing tags...')
              tag_list = self.__import_tag_list()
              logging.info('%s tags imported...', len(tag_list))
              logging.info('importing posts...')
              for item in self.items:
                  # make_tag_counts()/make_category_counts() are not called
                  # here; the entry counters stay at their initial value.
                  self.__import_post(item)
                  logging.info('post:%s imported...', item['title'])
              logging.info('%s posts imported...', len(self.items))
          finally:
              signals.post_save.connect(log_event_on_post_save, sender=Comment)
              signals.post_save.connect(log_event_on_post_save, sender=Post)

      def make_tag_counts(self, post):
          """Add one to the entry counter of every tag key in post.tags."""
          for tagkey in post.tags:
              db.run_in_transaction(self.increment_counter, tagkey, post.key(), 1, True)

      def make_category_counts(self, post):
          """Add one to the entry counter of the post's category."""
          category = post.category
          db.run_in_transaction(self.increment_counter, category.key(), post.key(), 1, True)

      def __import_tag_list(self):
          """Get or create a Tag for every channel-level tag; return the Tags."""
          tag_list = []
          for tag_dict in self.tag_list:
              tag = Tag.get_or_insert(
                  key_name='tag_' + self._decode(tag_dict['name']),
                  parent=None,
                  name=tag_dict['name'],
                  slug=tag_dict['slug'],
                  entrycount=0)
              tag_list.append(tag)
          return tag_list

      def __import_category_list(self):
          """Get or create a Category for every channel-level category."""
          category_list = []
          for category_dict in self.category_list:
              category = Category.get_or_insert(
                  key_name='category_' + self._decode(category_dict['name']),
                  parent=None,
                  name=category_dict['name'],
                  slug=category_dict['nicename'],
                  entrycount=0)
              category_list.append(category)
          return category_list

      def __import_site_info(self, results):
          """Placeholder; does nothing."""
          pass

      def __import_post(self, item):
          """Get or create the Post for one converted item, then its comments.

          item -- dict produced by Import.generate(); a missing or empty
                  'category' is replaced by 'Uncategoried' (in place).

          Returns (post, created); created is True when no Post with the
          derived key existed before this call.
          """
          if not item.get('category'):
              item['category'] = 'Uncategoried'
          category = Category.get_or_insert(
              key_name='category_' + self._decode(item['category']),
              parent=None,
              name=item['category'],
              slug=item['category'],
              entrycount=0)
          key_name = 'post_' + self.get_key_from_time(item['post_date'])
          created = not db.get(db.Key.from_path(Post.kind(), key_name))
          # The post's tag list holds the keys of its tags followed by the
          # keys of the Tag entities created for its categories.
          tags = self.__import_tags_of_post(item)
          tags += self.__import_categories_of_post(item)
          # NOTE(review): .email() fails with AttributeError if
          # get_current_user() returns None -- confirm a signed-in user is
          # guaranteed wherever this runs.
          current_user = users.get_current_user()
          post = Post.get_or_insert(
              key_name=key_name,
              title=item['title'],
              content=item['body'],
              date=self.get_date_from_string(item['post_date'], self._DATE_FORMAT),
              author=current_user,
              authorEmail=current_user.email(),
              slug=item['slug'],
              tags=tags,
              isPublished=True,
          )
          if not post.category:
              post.category = category
              post.put()
          self.__import_comments(item, post)
          return post, created

      def __import_categories_of_post(self, item):
          """Get or create a Tag per named category of the item; return their keys."""
          keys = []
          for cat in item['categories']:
              if cat['name'] is None:
                  continue
              tag = Tag.get_or_insert(
                  key_name='tag_' + self._decode(cat['name']),
                  parent=None,
                  name=cat['name'],
                  slug=cat['nicename'],
                  entrycount=0)
              keys.append(tag.key())
          return keys

      def __import_tags_of_post(self, item):
          """Get or create a Tag per tag name of the item; return their keys."""
          keys = []
          for raw_name in item['tags']:
              tag_name = self._decode(raw_name)
              tag = Tag.get_or_insert(
                  key_name='tag_' + tag_name,
                  parent=None,
                  name=tag_name,
                  slug=tag_name,
                  entrycount=0)
              keys.append(tag.key())
          return keys

      def get_key_from_time(self, strtime):
          """Return the whole-second timestamp of *strtime* as a string.

          strtime -- date text in 'YYYY-MM-DD HH:MM:SS' layout (URL-quoted
                     input is unquoted first).

          The value comes from time.mktime(), i.e. the date is interpreted in
          the process's local time zone. Raises ValueError for text that is
          not a date.
          """
          moment = self.get_date_from_string(self._decode(strtime), self._DATE_FORMAT)
          # int() instead of slicing the float's repr: the slice kept a
          # trailing "." for timestamps shorter than ten digits.
          return str(int(time.mktime(moment.timetuple())))

      def __import_comments(self, item, post):
          """Get or create a Comment (child of *post*) per comment of the item.

          Returns the list of Comment entities.
          """
          comment_list = []
          for comment_dict in item['comments']:
              # NOTE(review): the key only has one-second resolution, so two
              # comments of one post written in the same second share a key
              # and get_or_insert would hand back the first -- confirm this
              # is acceptable.
              key_name = 'comment_' + self.get_key_from_time(comment_dict['date'])
              comment = Comment.get_or_insert(
                  key_name=key_name,
                  post=post,
                  user=comment_dict['author'],
                  date=self.get_date_from_string(comment_dict['date'], self._DATE_FORMAT),
                  authorEmail=comment_dict['email'] or 'none@dumy.com',
                  authorWebsite=comment_dict['weburl'],
                  userIp=comment_dict['author_IP'],
                  content=comment_dict['content'],
                  parent=post,
              )
              comment_list.append(comment)
          return comment_list

      def increment_counter(self, key, post_key, amount, created):
          """Add *amount* to the entrycount of the entity stored under *key*.

          post_key is unused. When *created* is true the entity is updated and
          the result of db.put() is returned; otherwise the unchanged entity
          is returned.
          """
          obj = db.get(key)
          if not created:
              return obj
          obj.entrycount += amount
          return db.put(obj)

      def get_date_from_string(self, t, format):
          """Parse *t* with the strptime *format* into a datetime.

          When *format* contains %S, a trailing fraction of a second is also
          accepted: ".d" to ".dddddd" (datetime.isoformat() style) or ",ddd"
          (logging style); it ends up in the microsecond field.

          Raises ValueError when *t* cannot be parsed.
          """
          try:
              return datetime.datetime(*time.strptime(t, format)[0:6])
          except ValueError:
              fraction = None
              if "%S" in format:
                  fraction = re.search(r"(?:\.([0-9]{1,6})|,([0-9]{3}))\Z", t)
              if fraction is None:
                  raise
          digits = fraction.group(1) or fraction.group(2)
          whole = datetime.datetime(
              *time.strptime(t[:fraction.start()], format)[0:6])
          # Pad to six digits and convert as an integer; going through float
          # could truncate the value by one microsecond.
          return whole.replace(microsecond=int(digits.ljust(6, '0')))
  def parseoptions(args):
      """Build the option parser and return the parsed option values.

      args -- list of command line words to parse.

      Only the options object is returned; positional arguments are dropped.
      """
      option_specs = (
          (("-d", "--database"),
           dict(default="",
                help="The name of the database you want to connect to.")),
          (("-s", "--server"),
           dict(default="localhost",
                help="The name of the server you want to connect to.")),
          (("-u", "--username"),
           dict(default="",
                help="The username to connect to the database with.")),
          (("-p", "--password"),
           dict(help="The password to connect to the database with.")),
          (("-o", "--out"),
           dict(help="The filename where you want the output stored.")),
      )
      parser = OptionParser()
      for flags, settings in option_specs:
          parser.add_option(*flags, type="string", **settings)
      options, _positional = parser.parse_args(args)
      return options
  if __name__ == '__main__':
      # sys is not among the imports at the top of this file, so the
      # sys.argv lookup below needs it brought into scope here.
      import sys

      options = parseoptions(sys.argv)
      # exporter = Exporter(options)
      # exporter.export()