PageRenderTime 175ms CodeModel.GetById 33ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/jekyll-import/importers/wordpress.rb

https://gitlab.com/jbwhips883/jekyll-import
Ruby | 362 lines | 285 code | 35 blank | 42 comment | 30 complexity | e476e0200c487e9618a91379d9783792 MD5 | raw file
  1. module JekyllImport
  2. module Importers
  3. class WordPress < Importer
  # Loads every library this importer depends on by handing the list of
  # names to JekyllImport.require_with_fallback.
  def self.require_deps
    dependencies = %w[rubygems sequel fileutils safe_yaml unidecode]
    JekyllImport.require_with_fallback(dependencies)
  end
  # Registers the importer's command-line options on the given command.
  #
  # c:: The command object being configured; only its +option+ method is
  #     called here, once per supported option.
  def self.specify_options(c)
    c.option 'dbname', '--dbname DB', 'Database name (default: "")'
    c.option 'socket', '--socket SOCKET', 'Database socket (default: "")'
    c.option 'user', '--user USER', 'Database user name (default: "")'
    # The inner double quotes must be escaped: left bare, Ruby reads three
    # adjacent string literals and the help text loses the "" it means to show.
    c.option 'password', '--password PW', "Database user's password (default: \"\")"
    c.option 'host', '--host HOST', 'Database host name (default: "localhost")'
    c.option 'table_prefix', '--table_prefix PREFIX', 'Table prefix name (default: "wp_")'
    c.option 'clean_entities', '--clean_entities', 'Whether to clean entities (default: true)'
    c.option 'comments', '--comments', 'Whether to import comments (default: true)'
    c.option 'categories', '--categories', 'Whether to import categories (default: true)'
    c.option 'tags', '--tags', 'Whether to import tags (default: true)'
    c.option 'more_excerpt', '--more_excerpt', 'Whether to use more excerpt (default: true)'
    c.option 'more_anchor', '--more_anchor', 'Whether to use more anchor (default: true)'
    c.option 'status', '--status STATUS,STATUS2', Array, 'Array of allowed statuses (default: ["publish"], other options: "draft", "private", "revision")'
  end
  # Main migrator function. Call this to perform the migration.
  #
  # opts:: A hash of configuration options with string keys.
  #
  # Supported options are:
  #
  # 'dbname'::         The name of the database. Default: ''.
  # 'user'::           The database user name. Default: ''.
  # 'password'::       The database user's password. Default: ''.
  # 'host'::           The address of the MySQL database host.
  #                    Default: 'localhost'.
  # 'socket'::         The database socket's path. Default: nil.
  # 'table_prefix'::   Prefix of database tables used by WordPress.
  #                    Default: 'wp_'
  # 'clean_entities':: If true, convert non-ASCII characters to HTML
  #                    entities in the posts, comments, titles, and
  #                    names. Requires the 'htmlentities' gem to
  #                    work. Default: true.
  # 'comments'::       If true, migrate post comments too. Comments
  #                    are saved in the post's YAML front matter.
  #                    Default: true.
  # 'categories'::     If true, save the post's categories in its
  #                    YAML front matter. Default: true.
  # 'tags'::           If true, save the post's tags in its
  #                    YAML front matter. Default: true.
  # 'more_excerpt'::   If true, when a post has no excerpt but
  #                    does have a <!-- more --> tag, use the
  #                    preceding post content as the excerpt.
  #                    Default: true.
  # 'more_anchor'::    If true, convert a <!-- more --> tag into
  #                    two HTML anchors with ids "more" and
  #                    "more-NNN" (where NNN is the post number).
  #                    Default: true.
  # 'status'::         Array of allowed post statuses. Only
  #                    posts with matching status will be migrated.
  #                    Known statuses are "publish", "draft", "private",
  #                    and "revision". If this is nil or an empty
  #                    array, all posts are migrated regardless of
  #                    status. Default: ["publish"].
  #
  def self.process(opts)
    options = {
      :user           => opts.fetch('user', ''),
      :pass           => opts.fetch('password', ''),
      :host           => opts.fetch('host', 'localhost'),
      :socket         => opts.fetch('socket', nil),
      :dbname         => opts.fetch('dbname', ''),
      :table_prefix   => opts.fetch('table_prefix', 'wp_'),
      :clean_entities => opts.fetch('clean_entities', true),
      :comments       => opts.fetch('comments', true),
      :categories     => opts.fetch('categories', true),
      :tags           => opts.fetch('tags', true),
      :more_excerpt   => opts.fetch('more_excerpt', true),
      :more_anchor    => opts.fetch('more_anchor', true),
      # Array() tolerates an explicit nil (documented above as "migrate
      # everything") instead of raising NoMethodError on nil.map.
      :status         => Array(opts.fetch('status', ["publish"])).map(&:to_sym) # :draft, :private, :revision
    }

    # Loaded lazily on purpose: the gem is optional and the option is simply
    # switched off when it is missing.
    if options[:clean_entities]
      begin
        require 'htmlentities'
      rescue LoadError
        STDERR.puts "Could not require 'htmlentities', so the " +
                    ":clean_entities option is now disabled."
        options[:clean_entities] = false
      end
    end

    # An empty status list means "no filter", so drafts may be imported too.
    all_statuses = options[:status].empty?

    FileUtils.mkdir_p("_posts")
    FileUtils.mkdir_p("_drafts") if all_statuses || options[:status].include?(:draft)

    db = Sequel.mysql2(options[:dbname], :user => options[:user], :password => options[:pass],
                       :socket => options[:socket], :host => options[:host], :encoding => 'utf8')

    px = options[:table_prefix]

    # Map of page id => slug/parent, used later to build nested page paths.
    page_name_list = {}

    page_name_query = "
      SELECT
        posts.ID AS `id`,
        posts.post_title AS `title`,
        posts.post_name AS `slug`,
        posts.post_parent AS `parent`
      FROM #{px}posts AS `posts`
      WHERE posts.post_type = 'page'"

    db[page_name_query].each do |page|
      slug = page[:slug]
      slug = sluggify(page[:title]) if !slug || slug.empty?
      page_name_list[page[:id]] = {
        :slug   => slug,
        :parent => page[:parent]
      }
    end

    posts_query = "
      SELECT
        posts.ID AS `id`,
        posts.guid AS `guid`,
        posts.post_type AS `type`,
        posts.post_status AS `status`,
        posts.post_title AS `title`,
        posts.post_name AS `slug`,
        posts.post_date AS `date`,
        posts.post_date_gmt AS `date_gmt`,
        posts.post_content AS `content`,
        posts.post_excerpt AS `excerpt`,
        posts.comment_count AS `comment_count`,
        users.display_name AS `author`,
        users.user_login AS `author_login`,
        users.user_email AS `author_email`,
        users.user_url AS `author_url`
      FROM #{px}posts AS `posts`
      LEFT JOIN #{px}users AS `users`
        ON posts.post_author = users.ID"

    unless all_statuses
      # Status values come from the command line; let the database adapter
      # quote them rather than splicing raw text into the SQL.
      conditions = options[:status].map do |allowed|
        "posts.post_status = #{db.literal(allowed.to_s)}"
      end
      posts_query = "#{posts_query}
      WHERE #{conditions.join(" OR\n        ")}"
    end

    db[posts_query].each do |post|
      process_post(post, db, options, page_name_list)
    end
  end
  # Converts one row of the posts query into a Jekyll source file.
  #
  # post::           Row hash from the posts query, with symbol keys such as
  #                  :id, :type, :status, :title, :slug, :date, :content,
  #                  :excerpt and :comment_count.
  # db::             Database handle; queried as db[sql].
  # options::        Symbol-keyed options hash as built by process.
  # page_name_list:: Hash of page id => { :slug, :parent }, used to build
  #                  the directory of a page.
  #
  # Pages are written to "<page path>index.markdown", drafts to
  # "_drafts/<slug>.md" and everything else to "_posts/<date>-<slug>.markdown".
  # Each file holds the YAML front matter, a "---" line and the post body
  # passed through Util.wpautop.
  def self.process_post(post, db, options, page_name_list)
    title = post[:title]
    title = clean_entities(title) if options[:clean_entities]

    slug = post[:slug]
    slug = sluggify(title) if !slug || slug.empty?

    date = post[:date] || Time.now
    name = "%02d-%02d-%02d-%s.markdown" % [date.year, date.month, date.day, slug]

    content = post[:content].to_s
    content = clean_entities(content) if options[:clean_entities]

    excerpt = post[:excerpt].to_s

    more_tag = /<!-- *more *-->/
    more_index = content.index(more_tag)
    more_anchor = nil

    if more_index
      if options[:more_excerpt] && excerpt.empty?
        excerpt = content[0...more_index]
      end

      if options[:more_anchor]
        # Record the anchor id so it reaches the front matter; previously the
        # value was assigned to an unused local and this key was never written.
        more_anchor = "more"
        # Non-destructive sub: the string may still be the one owned by the row.
        content = content.sub(more_tag,
                              "<a id=\"more\"></a>" +
                              "<a id=\"more-#{post[:id]}\"></a>")
      end
    end

    categories, tags = post_terms(post[:id], db, options)
    comments = post_comments(post, db, options)

    # Get the relevant fields as a hash, delete empty fields and
    # convert to YAML for the header.
    data = {
      'layout'        => post[:type].to_s,
      'status'        => post[:status].to_s,
      'published'     => post[:status].to_s == 'draft' ? nil : (post[:status].to_s == 'publish'),
      'title'         => title.to_s,
      'author'        => {
        'display_name' => post[:author].to_s,
        'login'        => post[:author_login].to_s,
        'email'        => post[:author_email].to_s,
        'url'          => post[:author_url].to_s,
      },
      'author_login'  => post[:author_login].to_s,
      'author_email'  => post[:author_email].to_s,
      'author_url'    => post[:author_url].to_s,
      'excerpt'       => excerpt,
      'more_anchor'   => more_anchor,
      'wordpress_id'  => post[:id],
      'wordpress_url' => post[:guid].to_s,
      'date'          => date.to_s,
      'date_gmt'      => post[:date_gmt].to_s,
      'categories'    => options[:categories] ? categories : nil,
      'tags'          => options[:tags] ? tags : nil,
      'comments'      => options[:comments] ? comments : nil,
    }.delete_if { |_key, value| value.nil? || value == '' }.to_yaml

    filename =
      if post[:type] == 'page'
        page_path(post[:id], page_name_list) + 'index.markdown'
      elsif post[:status] == 'draft'
        "_drafts/#{slug}.md"
      else
        "_posts/#{name}"
      end

    # Create the target directory for every kind of file, not only pages, so
    # writing a draft or post never fails on a missing _drafts/_posts folder.
    FileUtils.mkdir_p(File.dirname(filename))

    # Write out the data and content to file
    File.open(filename, "w") do |f|
      f.puts data
      f.puts "---"
      f.puts Util.wpautop(content)
    end
  end

  # Loads the category and tag names attached to a post.
  # Returns a two-element array: [categories, tags].
  def self.post_terms(post_id, db, options)
    categories = []
    tags = []
    return [categories, tags] unless options[:categories] || options[:tags]

    px = options[:table_prefix]
    query =
      "SELECT
         terms.name AS `name`,
         ttax.taxonomy AS `type`
       FROM
         #{px}terms AS `terms`,
         #{px}term_relationships AS `trels`,
         #{px}term_taxonomy AS `ttax`
       WHERE
         trels.object_id = '#{post_id}' AND
         trels.term_taxonomy_id = ttax.term_taxonomy_id AND
         terms.term_id = ttax.term_id"

    db[query].each do |term|
      bucket =
        if options[:categories] && term[:type] == "category"
          categories
        elsif options[:tags] && term[:type] == "post_tag"
          tags
        end
      next unless bucket

      bucket << (options[:clean_entities] ? clean_entities(term[:name]) : term[:name])
    end

    [categories, tags]
  end

  # Loads the comments of a post that are not marked as spam, ordered by
  # comment id. Returns an array of string-keyed hashes; empty when comments
  # are disabled or the post reports no comments.
  def self.post_comments(post, db, options)
    return [] unless options[:comments] && post[:comment_count].to_i > 0

    px = options[:table_prefix]
    query =
      "SELECT
         comment_ID AS `id`,
         comment_author AS `author`,
         comment_author_email AS `author_email`,
         comment_author_url AS `author_url`,
         comment_date AS `date`,
         comment_date_gmt AS `date_gmt`,
         comment_content AS `content`
       FROM #{px}comments
       WHERE
         comment_post_ID = '#{post[:id]}' AND
         comment_approved != 'spam'"

    comments = db[query].map do |comment|
      # dup before force_encoding so the row's own string is left untouched.
      body = comment[:content].to_s.dup
      body.force_encoding("UTF-8") if body.respond_to?(:force_encoding)
      body = clean_entities(body) if options[:clean_entities]

      author = comment[:author].to_s
      author = clean_entities(author) if options[:clean_entities]

      {
        'id'           => comment[:id].to_i,
        'author'       => author,
        'author_email' => comment[:author_email].to_s,
        'author_url'   => comment[:author_url].to_s,
        'date'         => comment[:date].to_s,
        'date_gmt'     => comment[:date_gmt].to_s,
        'content'      => body,
      }
    end

    comments.sort_by { |comment| comment['id'] }
  end
  # Encodes +text+ with HTMLEntities (named entities) and then restores the
  # characters HTML markup itself is made of, so tags survive the encoding.
  #
  # text:: The string to encode. It is not modified; a new string is returned.
  #
  # Requires the HTMLEntities class (from the 'htmlentities' gem) to be loaded.
  def self.clean_entities(text)
    # Work on a copy: force_encoding mutates its receiver and would raise on
    # a frozen string.
    text = text.dup.force_encoding("UTF-8") if text.respond_to?(:force_encoding)
    text = HTMLEntities.new.encode(text, :named)
    # We don't want to convert these, it would break all
    # HTML tags in the post and comments.
    text.gsub!("&amp;", "&")
    text.gsub!("&lt;", "<")
    text.gsub!("&gt;", ">")
    text.gsub!("&quot;", '"')
    text.gsub!("&apos;", "'")
    # Slashes must stay literal as well: turning "/" into "&#47;" would break
    # every closing tag and URL, so the entity is mapped back to the character.
    text.gsub!("&#47;", "/")
    text
  end
  # Builds a URL slug from +title+: the title is transliterated with
  # String#to_ascii, lowercased, every run of characters other than digits
  # and ASCII letters becomes a single separator, and the remaining words
  # are joined with hyphens.
  def self.sluggify(title)
    lowered = title.to_ascii.downcase
    words = lowered.gsub(/[^0-9A-Za-z]+/, " ").strip
    words.gsub(" ", "-")
  end
  # Returns the directory path of a page, built from the slugs of its
  # ancestors followed by its own slug, each terminated by "/".
  #
  # page_id::        Id of the page to resolve.
  # page_name_list:: Hash of page id => { :slug, :parent }.
  #
  # An id that is not a key of page_name_list yields "", which also ends
  # the walk up the parent chain.
  def self.page_path(page_id, page_name_list)
    return "" unless page_name_list.key?(page_id)

    entry = page_name_list[page_id]
    parent_path = page_path(entry[:parent], page_name_list)
    "#{parent_path}#{entry[:slug]}/"
  end
  320. end
  321. end
  322. end