PageRenderTime 64ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/src/proboard_saver.coffee

https://gitlab.com/thallian/proboards-saver
CoffeeScript | 377 lines | 263 code | 109 blank | 5 comment | 45 complexity | bbf0e3c96961090c78a25b64c0336a2f MD5 | raw file
  1. # 2015 by Sebastian Hugentobler <shugentobler@vanwa.ch>
  2. # To the extent possible under law, the author(s) have dedicated all copyright
  3. # and related and neighboring rights to this software to the public domain
  4. # worldwide. This software is distributed without any warranty.
  5. # See http://creativecommons.org/publicdomain/zero/1.0/ for a description of CC0.
  # Settings for the CasperJS instance that drives the whole crawl.
  # NOTE(review): `webSecurityEnabled: false` presumably lifts the browser's
  # cross-origin restrictions so images on other hosts can be fetched - confirm.
  casperSettings =
    verbose: false
    logLevel: 'info'
    pageSettings:
      webSecurityEnabled: false

  casper = require('casper').create casperSettings
  utils = require 'utils'
  fs = require 'fs'

  # Shared handler: script errors and in-page errors are printed the same way.
  # It is invoked by the emitter, so `this` is whatever the emitter binds
  # (the original handlers relied on the same `@echo`).
  printError = (message) ->
    @echo "Error: #{message}", "ERROR"

  casper.on 'error', printError
  casper.on 'page.error', printError

  # Forward console output of the visited page, except for the
  # "Unsafe JavaScript attempt" messages, which are filtered out.
  casper.on 'remote.message', (message) ->
    unless /Unsafe JavaScript attempt/.test message
      @echo "remote log: #{message}", "INFO"
  # Downloads every image referenced by an `[img]url[/img]` tag and rewrites
  # the tag so it points at the local copy.
  #
  # searchString - text that may contain `[img]...[/img]` tags.
  #
  # Each image is saved as `data/images/<last path segment of the url>` and its
  # tag becomes `[img]{{baseurl}}/images/<name>[/img]`.
  #
  # The rewrite is done with a replacer callback on the match itself. Building
  # a RegExp from the url broke on urls containing regex metacharacters
  # (`?`, `+`, `(` ...), and a plain replacement string would expand `$&`,
  # `$'` etc. found in the file name.
  #
  # Returns the rewritten text; empty or missing input is returned unchanged.
  loadImages = (searchString) ->
    return searchString unless searchString

    searchString.replace /\[img\](.*?)\[\/img\]/g, (tag, imageUrl) ->
      urlParts = imageUrl.split '/'
      imageName = urlParts[urlParts.length - 1]

      console.log "\t\tdownloading image '#{imageName}'..."
      casper.download imageUrl, "data/images/#{imageName}"

      "[img]{{baseurl}}/images/#{imageName}[/img]"
  # Converts the markup inside `element` (a post body or a signature box) into
  # BBCode-style plain text: images, YouTube iframes, italic/bold/underline,
  # coloured text, quotes and links become `[img]`, `[video]`, `[i]`/`[b]`/`[u]`,
  # `[colour=...]`, `[quote=...]` and `[url=...]` tags.
  #
  # This function is handed to `casper.evaluate` and therefore runs inside the
  # page: it must stay self-contained and may only use its argument and
  # browser globals.
  #
  # element - DOM element whose content is converted. Its markup is rewritten
  #           in place as a side effect.
  #
  # Returns the converted text, with smiley images reduced to their alt text
  # and anything from the "Attachments:" footer onwards cut off.
  replaceHtml = (element) ->
    # Replaces the first match of `pattern` in the element's markup with
    # `text`. The replacer callback inserts `text` literally, so `$&`, `$'`,
    # `$$` ... coming from forum content are not expanded.
    swapFirst = (pattern, text) ->
      element.innerHTML = element.innerHTML.replace pattern, -> text

    # For each kind of tag the values are collected from the DOM first and then
    # substituted one by one into the first remaining match, so the n-th value
    # has to meet the n-th tag. The patterns use [\s\S] so that content
    # spanning line breaks is matched as well and the pairing stays aligned.
    images = Array::map.call element.querySelectorAll('img'), (img) ->
      src: img.src, alt: if img.hasAttribute('alt') then img.alt else ''
    for image in images
      swapFirst(/<img[^>]*>/, "[img#{if image.alt then '=' + image.alt else ''}]#{image.src}[/img]")

    # The video id is the path segment after the host and one further segment,
    # without the query string.
    videos = Array::map.call element.querySelectorAll("iframe[title='YouTube video player']"), (video) ->
      video.src.split('/')[4].split('?')[0]
    for video in videos
      swapFirst(/<iframe title="YouTube video player"[^>]*>[\s\S]*?<\/iframe>/, "[video]https://www.youtube.com/watch?v=#{video}[/video]")

    # Italic, bold, underline - in this order; each query runs after the
    # previous kind has been rewritten so nested content is already converted.
    for tag in ['i', 'b', 'u']
      contents = Array::map.call element.querySelectorAll(tag), (node) -> node.innerHTML
      for content in contents
        swapFirst(new RegExp("<#{tag}>[\\s\\S]*?</#{tag}>"), "[#{tag}]#{content}[/#{tag}]")

    # Coloured text. The previous pattern (`".*"` and `[^<\/font>]*`) skipped
    # any text containing the letters f, o, n or t and could swallow everything
    # between two coloured spans; match one attribute value and the shortest
    # body instead.
    colourElements = Array::map.call element.querySelectorAll('font[color]'), (colour) ->
      name: colour.attributes['color'].value.toLowerCase(), innerHTML: colour.innerHTML
    for colour in colourElements
      swapFirst(/<font color="[^"]*">[\s\S]*?<\/font>/, "[colour=#{colour.name}]#{colour.innerHTML}[/colour]")

    # Quotes, outermost first: strip the decoration, then replace the quote
    # container with a placeholder span holding the [quote] text.
    quote = element.querySelector 'div.quote_body'
    while quote
      quoteHeaderNode = quote.querySelector 'div.quote_header'
      registeredUserNode = if quoteHeaderNode then quoteHeaderNode.querySelector('span[itemprop="name"]') else null

      user = null
      if registeredUserNode
        user = registeredUserNode.textContent
      else if quote.parentNode.attributes['author']
        user = quote.parentNode.attributes['author'].value
        # The author attribute may carry a leading '@'.
        if user.substr(0, 1) == '@'
          user = user.substr 1

      for selector in ['div.quote_header', 'div.quote_avatar_container', 'div.quote_clear']
        decoration = quote.querySelector selector
        if decoration then decoration.parentNode.removeChild decoration

      message = quote.innerHTML

      dummySpan = document.createElement 'span'
      dummySpan.setAttribute 'class', 'dummytag'
      dummySpan.innerHTML = "[quote#{if user then '=' + user else ''}]#{message}[/quote]"
      quote.parentNode.parentNode.replaceChild dummySpan, quote.parentNode

      quote = element.querySelector 'div.quote_body'

    # Unwrap the placeholder spans through the DOM. The former greedy regex
    # dropped everything between two sibling quotes of the same post.
    dummy = element.querySelector 'span.dummytag'
    while dummy
      while dummy.firstChild
        dummy.parentNode.insertBefore dummy.firstChild, dummy
      dummy.parentNode.removeChild dummy
      dummy = element.querySelector 'span.dummytag'

    # `<a\s` instead of `<a` so that tags such as <abbr> are not taken for links.
    linkElements = Array::map.call element.querySelectorAll('a[href]'), (link) ->
      target: link.attributes['href'].value, name: link.innerText
    for link in linkElements
      swapFirst(/<a\s[^>]*>[\s\S]*?<\/a>/, "[url=#{link.target}]#{link.name}[/url]")

    # Remove the remaining font tags and quote clearing divs.
    element.innerHTML = element.innerHTML.replace /<font [^>]*>/g, ''
    element.innerHTML = element.innerHTML.replace /<\/font>/g, ''
    element.innerHTML = element.innerHTML.replace /<div class="quote_clear"><\/div>/g, ''

    finalText = element.innerText
    finalText = finalText.replace /<br>/g, '\n'
    # Smiley images are reduced to their alt text.
    finalText = finalText.replace /\[img=([^\]]*)\]http:\/\/images\.proboards\.com\/v5\/images\/smiley\/.*?\[\/img\]/g, '$1'
    finalText = finalText.replace /\[img=([^\]]*)\]http:\/\/images\.proboards\.com\/v5\/smiley\/.*?\[\/img\]/g, '$1'

    # Everything from the attachments footer on is dropped from the text.
    attachmentIndex = finalText.indexOf('\n\n[b]Attachments:[/b]\n\n')
    if attachmentIndex > -1
      finalText = finalText.substring 0, attachmentIndex

    return finalText
  # Lists the boards shown on the current page. Runs inside the page (it is
  # passed to `casper.evaluate`), so it only uses browser globals.
  #
  # Titles/links and descriptions come from two separate queries and are
  # paired by position.
  #
  # Returns an array of `{ title, description, link }` objects.
  findBoards = ->
    anchors = document.querySelectorAll 'tr.board.item td:nth-child(2) > span > a'
    descriptionNodes = document.querySelectorAll 'tr.board.item td:nth-child(2) > p.description'
    descriptions = Array::map.call descriptionNodes, (node) -> node.textContent

    Array::map.call anchors, (anchor, position) ->
      {
        title: anchor.textContent
        description: descriptions[position]
        link: anchor.href
      }
  # Builds the urls of all pages of the listing currently shown. Runs inside
  # the page (it is passed to `casper.evaluate`).
  #
  # The last pagination link is expected to look like `<base>?page=<n>`; the
  # result is `<base>?page=1` .. `<base>?page=<n>`.
  #
  # When the page has no pagination links, or the last link carries no page
  # number, the current location is returned as the only page. Previously this
  # case raised a TypeError on the failed match.
  #
  # Returns an array of url strings, never empty.
  findPages = ->
    shownPages = document.querySelectorAll('ul.ui-pagination > li.ui-pagination-page.ui-pagination-slot > a[href]')
    lastPage = shownPages[shownPages.length - 1]
    pageInfo = if lastPage then /(.*\?page=)(\d+)/.exec(lastPage.href) else null
    return [document.location.href] unless pageInfo

    pageBase = pageInfo[1]
    # Parse the count so the range below works on a number; at least one page.
    maxPage = Math.max 1, parseInt(pageInfo[2], 10)
    ("#{pageBase}#{pageNr}" for pageNr in [1..maxPage])
  # Lists the threads shown on the current board page. Runs inside the page
  # (it is passed to `casper.evaluate`).
  #
  # The thread id is the number following `/thread/` in the thread link.
  #
  # Returns an array of `{ id, title, link }` objects.
  findThreads = ->
    anchors = document.querySelectorAll 'tr.item.thread > td:nth-child(3) a.thread-link'

    Array::map.call anchors, (anchor) ->
      {
        id: /.*\/thread\/(\d*)\/.*/.exec(anchor.href)[1]
        title: anchor.textContent
        link: anchor.href
      }
  # Collects the posts shown on the current thread page. Runs inside the page
  # (it is passed to `casper.evaluate`).
  #
  # replaceHtml - function converting a message element into text.
  #
  # Returns an array of `{ id, message, attachments, timestamp, user }`, where
  # `attachments` is a list of `{ name, url }`, `timestamp` is the
  # `data-timestamp` attribute divided by 1000, and `user` is `{ link, name }`
  # (`link` is empty for posters without a profile link).
  findPosts = (replaceHtml) ->
    Array::map.call document.querySelectorAll('tr.item.post'), (row) ->
      body = row.querySelector 'td.content div.message'
      # Queried before the conversion below and read afterwards - keep this
      # order: the converter defined in this file rewrites the markup of `body`.
      attachmentLinks = body.querySelectorAll 'div.post_attachments blockquote a'
      timeNode = row.querySelector 'td.content span.date > abbr.time'
      authorNode = row.querySelector 'td.left-panel a.user-link,td.left-panel > div.mini-profile.guest-mini-profile'

      postId = /post-(\d*)/.exec(row.id)[1]
      text = replaceHtml(body)

      # A link wrapping a child element is named after that child's alt text,
      # any other link after its own text.
      files = Array::map.call attachmentLinks, (anchor) ->
        label = if anchor.childElementCount > 0 then anchor.children[0].alt else anchor.text
        { name: label, url: anchor.href }

      seconds = parseInt(timeNode.attributes['data-timestamp'].value, 10) / 1000

      if authorNode.href
        segments = authorNode.href.split '/'
        author = { link: segments[segments.length - 1], name: authorNode.textContent }
      else
        author = { link: '', name: authorNode.firstChild.data.replace('\n\t', '') }

      {
        id: postId
        message: text
        attachments: files
        timestamp: seconds
        user: author
      }
  # Returns the profile urls of all members listed on the current page.
  # Runs inside the page (it is passed to `casper.evaluate`).
  findUserLinks = ->
    memberAnchors = document.querySelectorAll 'div.container.members a.user-link'
    (anchor.href for anchor in memberAnchors)
  # Reads name, signature, status and registration time from the profile page
  # currently shown. Runs inside the page (it is passed to `casper.evaluate`).
  #
  # replaceHtml - function converting an element into text.
  #
  # Returns `{ name, signature, status, registered }`; `registered` is the
  # `data-timestamp` attribute of the first time element divided by 1000.
  getUser = (replaceHtml) ->
    profile = {}
    profile.name = document.querySelectorAll('span.big_username')[0].textContent

    profile.signature = ''
    signatureBox = document.querySelector('td#center-column > div.content-box:last-child')
    if signatureBox
      converted = replaceHtml signatureBox
      # The last content box counts as a signature only when its text contains
      # the "Signature" heading; the heading itself is removed.
      profile.signature = if /Signature\n/.test(converted) then converted.replace('Signature\n', '') else ''

    statusNodes = document.querySelectorAll('form.form_user_status div.content-box tr span.personal-text')
    profile.status = if statusNodes.length then statusNodes[0].textContent else ''

    registeredNode = document.querySelectorAll('td#center-column > div.content-box abbr.time')[0]
    profile.registered = parseInt(registeredNode.attributes['data-timestamp'].value, 10) / 1000

    profile
  # Reports a missing command line option and ends the script.
  #
  # argument - name of the option that was not supplied.
  #
  # Exits with status 1 so that callers can tell a usage error from a
  # successful run (the exit status used to be the default).
  missingArgumentError = (argument) ->
    console.log "missing the #{ argument } argument"
    casper.exit 1

  # Returns the value of the command line option `name`; a missing or empty
  # option is reported through missingArgumentError.
  requiredOption = (name) ->
    value = casper.cli.options[name]
    missingArgumentError name unless value
    value

  # Required options: --board-nr, --board-name, --user, --password
  proboardNr = requiredOption 'board-nr'
  proboardName = requiredOption 'board-name'
  user = requiredOption 'user'
  password = requiredOption 'password'

  # Root of the forum and its member list.
  proboardUrl = "http://#{ proboardName }.proboards.com/"
  proboardUserUrl = "#{ proboardUrl }members"
  # Set the user agent string sent with the requests.
  casper.userAgent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:30.0) Gecko/20100101 Firefox/30.0'

  # First step: open the forum's start page.
  casper.start proboardUrl, ->

  # Log in by POSTing the forum number and the credentials given on the
  # command line to the login endpoint. The step callback is intentionally
  # empty.
  casper.thenOpen 'https://login.proboards.com/forum_submit/login',
    method: 'post'
    data:
      forum: proboardNr
      email: user
      password: password
      continue: 'Continue'
  , ->
  # Queues the Casper steps that crawl one board: its sub-boards (recursively),
  # the threads listed on each of its pages and the posts of every thread.
  # The results are stored on `board` itself: `board.boards`, `board.threads`
  # and, per thread, `thread.posts` and `thread.poll`.
  #
  # board - object with `link` and `title`, as returned by findBoards.
  readBoard = (board) ->
    casper.thenOpen board.link, ->
      # Sub-boards are crawled with the same routine.
      board.boards = @evaluate findBoards
      @each board.boards, (casper, subboard) ->
        readBoard subboard
      # Open the board once more before reading its pagination.
      # NOTE(review): presumably needed because the sub-board steps queued
      # above navigate away from this page - confirm.
      @thenOpen board.link, ->
        @echo "getting threads for board '#{ board.title }'..."
        board.threads = []
        boardPages = @evaluate findPages
        # Collect the threads of every page of the board.
        @each boardPages, (casper, boardPage) ->
          @thenOpen boardPage, ->
            board.threads = board.threads.concat @evaluate findThreads
        # Queued after the page steps, so board.threads is complete here.
        @then ->
          @each board.threads, (casper, thread) ->
            thread.posts = []
            @thenOpen thread.link, ->
              @echo "\tgetting posts for thread '#{ thread.title }'..."
              # A poll is saved as a screenshot named after the last segment
              # of the thread link; thread.poll stays null without a poll.
              pollName = null
              if @exists 'div.poll.show.ui-poll'
                console.log '\t\tsaving poll...'
                linkParts = thread.link.split '/'
                pollName = "#{linkParts[linkParts.length - 1]}.png"
                @captureSelector "data/images/polls/#{pollName}", 'div.poll.show.ui-poll'
              thread.poll = pollName
              threadPages = @evaluate findPages
              @each threadPages, (casper, threadPage) ->
                @thenOpen threadPage, ->
                  posts = @evaluate findPosts, replaceHtml
                  # Download embedded images and attachments and point the
                  # post at the local copies.
                  @each posts, (casper, post) ->
                    post.message = loadImages post.message
                    for attachment in post.attachments
                      casper.download attachment.url, "data/attachments/#{attachment.name}"
                      attachment.url = "{{baseurl}}/attachments/#{attachment.name}"
                  thread.posts = thread.posts.concat posts
              # Once all pages are read, put the poll screenshot in front of
              # the first post.
              @then ->
                if thread.poll and thread.posts[0]
                  thread.posts[0].message = "[img]{{baseurl}}/images/polls/#{thread.poll}[/img]\n\n#{thread.posts[0].message}"
                # A poll without any post to attach it to.
                if thread.poll and not thread.posts[0]
                  console.log "how the fuck did you manage that?"
  # Everything that is scraped ends up in this object and is dumped as JSON
  # at the end: `proboard.boards` and `proboard.users`.
  proboard = {}

  # Boards: start at the forum root and crawl every board found there.
  casper.thenOpen proboardUrl, ->
    proboard.boards = @evaluate findBoards
    @each proboard.boards, (casper, board) ->
      readBoard board

  # Members: walk all pages of the member list and read every profile.
  casper.thenOpen proboardUserUrl, ->
    proboard.users = []
    userPages = @evaluate findPages
    @each userPages, (casper, userPage) ->
      @thenOpen userPage, ->
        userlinks = @evaluate findUserLinks
        @each userlinks, (casper, userlink) ->
          @thenOpen userlink, ->
            @echo "getting userinfo for '#{ userlink }'..."
            # Deliberately not named `user`: assigning to that name here
            # would overwrite the module-level login name, as CoffeeScript
            # never shadows an outer variable on assignment.
            member = @evaluate getUser, replaceHtml
            if member
              member.signature = loadImages member.signature
              proboard.users.push member
            else
              # evaluate yields nothing when the in-page code fails; skip the
              # profile instead of crashing the whole run.
              @echo "could not read userinfo from '#{ userlink }'", "WARNING"

  # Last step: write the collected data, tab-indented, to data/<board-name>.json
  casper.then ->
    json = JSON.stringify(proboard, null, '\t')
    fs.write "data/#{ proboardName }.json", json, 'w'

  casper.run()