/src/proboard_saver.coffee
CoffeeScript | 377 lines | 263 code | 109 blank | 5 comment | 45 complexity | bbf0e3c96961090c78a25b64c0336a2f MD5 | raw file
- # 2015 by Sebastian Hugentobler <shugentobler@vanwa.ch>
- # To the extent possible under law, the author(s) have dedicated all copyright
- # and related and neighboring rights to this software to the public domain
- # worldwide. This software is distributed without any warranty.
- # See http://creativecommons.org/publicdomain/zero/1.0/ for a description of CC0.
# Shared CasperJS instance that drives the whole scrape. Web security is
# switched off in the page settings.
casper = require('casper').create
  verbose: false
  logLevel: 'info'
  pageSettings:
    webSecurityEnabled: false

utils = require('utils')
fs = require('fs')

# Errors raised by the script itself and errors raised inside the loaded page
# are reported in exactly the same way.
for failureEvent in ['error', 'page.error']
  casper.on failureEvent, (msg, trace) ->
    @echo "Error: #{msg}", "ERROR"

# Relay console output of the remote page, except for the
# "Unsafe JavaScript attempt" messages, which are filtered out.
casper.on 'remote.message', (msg, trace) ->
  unless /Unsafe JavaScript attempt/.test msg
    @echo "remote log: #{msg}", "INFO"
# Downloads every image referenced by a bare `[img]url[/img]` tag in
# `searchString` into data/images/ and rewrites the tag so that it points at
# `{{baseurl}}/images/<file name>` instead.
#
# searchString - text containing BBCode-style image tags.
#
# Returns the text with all bare image tags rewritten. Tags that carry an
# alternative text (`[img=alt]...`) are not matched and stay untouched.
loadImages = (searchString) ->
  # One pass with a callback: the URL is never turned into a regular
  # expression, so characters such as `?`, `(` or `+` in the URL can neither
  # break the pattern nor prevent the tag from being rewritten, and `$`
  # sequences in the file name are not expanded in the replacement.
  searchString.replace /\[img\](.*?)\[\/img\]/g, (tag, imageUrl) ->
    urlParts = imageUrl.split '/'
    # The local file name is the last path segment of the URL.
    imageName = urlParts[urlParts.length - 1]
    console.log "\t\tdownloading image '#{imageName}'..."
    casper.download imageUrl, "data/images/#{imageName}"
    "[img]{{baseurl}}/images/#{imageName}[/img]"
# Converts the markup inside `element` into BBCode-style plain text.
#
# The element is modified in place: its innerHTML is rewritten step by step
# and the resulting innerText is post-processed and returned.
#
# element - DOM element holding a post body or a signature.
#
# Returns the converted text. Everything from the "[b]Attachments:[/b]"
# marker onwards is cut off.
#
# NOTE(review): callers hand this function to casper's `evaluate` as an
# argument, so it presumably has to stay self-contained (no references to
# other helpers of this file) - confirm before extracting helpers.
replaceHtml = (element) ->
  # Every loop below replaces the FIRST remaining match once per collected
  # element, so the n-th replacement is assumed to hit the n-th element.

  # <img> -> [img=alt]src[/img]
  images = Array::map.call element.querySelectorAll('img'), (img) ->
    src: img.src, alt: if img.hasAttribute('alt') then img.alt else ''
  for image in images
    element.innerHTML = element.innerHTML.replace /<img[^>]*>/, "[img#{if image.alt then '=' + image.alt else ''}]#{image.src}[/img]"

  # YouTube iframes -> [video]watch-url[/video]; the video id is the fifth
  # '/'-separated part of the iframe src, without its query string.
  videos = Array::map.call element.querySelectorAll("iframe[title='YouTube video player']"), (video) -> video.src.split('/')[4].split('?')[0]
  for video in videos
    element.innerHTML = element.innerHTML.replace /<iframe title="YouTube video player"[^>]*>.*?<\/iframe>/, "[video]https://www.youtube.com/watch?v=#{video}[/video]"

  # <i>, <b>, <u> -> [i], [b], [u]
  cursiveElements = Array::map.call element.querySelectorAll('i'), (cursive) -> cursive.innerHTML
  for cursive in cursiveElements
    element.innerHTML = element.innerHTML.replace /<i>.*?<\/i>/, "[i]#{cursive}[/i]"
  boldElements = Array::map.call element.querySelectorAll('b'), (bold) -> bold.innerHTML
  for bold in boldElements
    element.innerHTML = element.innerHTML.replace /<b>.*?<\/b>/, "[b]#{bold}[/b]"
  underlinedElements = Array::map.call element.querySelectorAll('u'), (underlined) -> underlined.innerHTML
  for underlined in underlinedElements
    element.innerHTML = element.innerHTML.replace /<u>.*?<\/u>/, "[u]#{underlined}[/u]"

  # <font color="x">...</font> -> [colour=x]...[/colour]
  # The attribute value stops at the closing quote and the body is matched
  # lazily up to the next </font>. The previous pattern used a negated
  # character class for the body, which rejected any text containing one of
  # the characters of "</font>" and therefore dropped most colours.
  colourElements = Array::map.call element.querySelectorAll('font[color]'), (colour) -> name: colour.attributes['color'].value.toLowerCase(), innerHTML: colour.innerHTML
  for colour in colourElements
    element.innerHTML = element.innerHTML.replace /<font color="[^"]*">[\s\S]*?<\/font>/, "[colour=#{colour.name}]#{colour.innerHTML}[/colour]"

  # Quotes: each quote block is swapped for a temporary <span class="dummytag">
  # holding "[quote=user]...[/quote]". The loop runs until no quote body is
  # left, which also picks up quotes nested inside an already converted one.
  quote = element.querySelector 'div.quote_body'
  while quote
    quoteHeaderNode = quote.querySelector 'div.quote_header'
    registeredUserNode = if quoteHeaderNode then quote.querySelector('div.quote_header').querySelector('span[itemprop="name"]') else null
    user = null
    if registeredUserNode
      user = registeredUserNode.textContent
    else if quote.parentNode.attributes['author']
      # Fall back to the wrapper's author attribute, minus a leading '@'.
      user = quote.parentNode.attributes['author'].value
      if user.substr(0, 1) == '@'
        user = user.substr 1
    # Strip the decoration so that only the quoted message remains.
    quoteHeader = quote.querySelector 'div.quote_header'
    if quoteHeader then quoteHeader.parentNode.removeChild quoteHeader
    quoteAvatar = quote.querySelector 'div.quote_avatar_container'
    if quoteAvatar then quoteAvatar.parentNode.removeChild quoteAvatar
    quoteClear = quote.querySelector 'div.quote_clear'
    if quoteClear then quoteClear.parentNode.removeChild quoteClear
    message = quote.innerHTML
    dummySpan = document.createElement 'span'
    dummySpan.setAttribute 'class', 'dummytag'
    dummySpan.innerHTML = "[quote#{if user then '=' + user else ''}]#{message}[/quote]"
    # The quote body's parent (the whole quote wrapper) is what gets replaced.
    quote.parentNode.parentNode.replaceChild dummySpan, quote.parentNode
    quote = element.querySelector 'div.quote_body'

  # Unwrap the temporary spans. The match is deliberately greedy: with nested
  # quotes the outer span has to be matched up to its own closing tag.
  dummyElements = Array::map.call element.querySelectorAll('span.dummytag'), (dummy) -> dummy.innerHTML
  for dummyContent in dummyElements
    element.innerHTML = element.innerHTML.replace /<span class="dummytag">.*<\/span>/, dummyContent

  # <a href="x">text</a> -> [url=x]text[/url]
  linkElements = Array::map.call element.querySelectorAll('a[href]'), (link) -> target: link.attributes['href'].value, name: link.innerText
  for link in linkElements
    element.innerHTML = element.innerHTML.replace /<a[^>]*>.*?<\/a>/, "[url=#{link.target}]#{link.name}[/url]"

  # Drop whatever font tags and quote spacers are left over.
  element.innerHTML = element.innerHTML.replace /<font [^>]*>/g, ''
  element.innerHTML = element.innerHTML.replace /<\/font>/g, ''
  element.innerHTML = element.innerHTML.replace /<div class="quote_clear"><\/div>/g, ''

  finalText = element.innerText
  finalText = finalText.replace /<br>/g, '\n'
  # Smiley images are reduced to their alternative text.
  finalText = finalText.replace /\[img=([^\]]*)\]http:\/\/images\.proboards\.com\/v5\/images\/smiley\/.*?\[\/img\]/g, '$1'
  finalText = finalText.replace /\[img=([^\]]*)\]http:\/\/images\.proboards\.com\/v5\/smiley\/.*?\[\/img\]/g, '$1'
  # The attachment list is not part of the message text.
  attachmentIndex = finalText.indexOf('\n\n[b]Attachments:[/b]\n\n')
  if attachmentIndex > -1
    finalText = finalText.substring 0, attachmentIndex
  return finalText
# Collects the boards listed on the current page.
#
# Returns an array of { title, description, link } objects. Descriptions are
# paired with boards purely by position in the document.
findBoards = ->
  boardAnchors = document.querySelectorAll('tr.board.item td:nth-child(2) > span > a')
  descriptionNodes = document.querySelectorAll('tr.board.item td:nth-child(2) > p.description')
  Array::map.call boardAnchors, (anchor, position) ->
    title: anchor.textContent
    description: descriptionNodes[position]?.textContent
    link: anchor.href
# Builds the URLs of all pages of the listing that is currently open.
#
# The highest page number is taken from the last pagination link on the page;
# its href is expected to look like "<base>?page=<number>".
#
# Returns an array of page URLs, numbered from 1 up to the highest page. When
# the page has no usable pagination link, the current URL is returned as the
# only page, so callers still visit the listing once instead of failing.
findPages = ->
  shownPages = document.querySelectorAll('ul.ui-pagination > li.ui-pagination-page.ui-pagination-slot > a[href]')
  lastPage = shownPages[shownPages.length - 1]
  # An anchor element stringifies to its href.
  pageInfo = if lastPage then /(.*\?page=)(\d+)/.exec(String(lastPage)) else null
  # Parse the page number so the range below counts numerically.
  maxPage = if pageInfo then parseInt(pageInfo[2], 10) else 0
  return [document.location.href] if maxPage < 1
  pageBase = pageInfo[1]
  ("#{pageBase}#{pageNr}" for pageNr in [1..maxPage])
# Collects the threads listed on the current board page.
#
# Returns an array of { id, title, link } objects; the id is the number that
# follows "/thread/" in the thread link.
findThreads = ->
  threadAnchors = document.querySelectorAll('tr.item.thread > td:nth-child(3) a.thread-link')
  Array::map.call threadAnchors, (anchor) ->
    id: /.*\/thread\/(\d*)\/.*/.exec(anchor.href)[1]
    title: anchor.textContent
    link: anchor.href
# Collects the posts shown on the current thread page.
#
# replaceHtml - function that turns a message element into plain text.
#
# Returns an array of { id, message, attachments, timestamp, user } objects.
findPosts = (replaceHtml) ->
  Array::map.call document.querySelectorAll('tr.item.post'), (row) ->
    messageNode = row.querySelector('td.content div.message')
    # Looked up before the message is converted, as in every other lookup here.
    attachmentLinks = messageNode.querySelectorAll('div.post_attachments blockquote a')
    dateNode = row.querySelector('td.content span.date > abbr.time')
    authorNode = row.querySelector('td.left-panel a.user-link,td.left-panel > div.mini-profile.guest-mini-profile')
    postId = /post-(\d*)/.exec(row.id)[1]
    convertedMessage = replaceHtml(messageNode)

    # A link with a child element is named after that child's alt text,
    # any other link after its own text.
    attachments = for attachmentLink in attachmentLinks
      {
        name: if attachmentLink.childElementCount > 0 then attachmentLink.children[0].alt else attachmentLink.text
        url: attachmentLink.href
      }

    # The data-timestamp attribute is divided by 1000 before it is stored.
    timestamp = parseInt(dateNode.attributes['data-timestamp'].value, 10) / 1000

    # Nodes with an href are profile links; the others carry the name in
    # their first child node.
    if authorNode.href
      hrefParts = authorNode.href.split '/'
      author = { link: hrefParts[hrefParts.length - 1], name: authorNode.textContent }
    else
      author = { link: '', name: authorNode.firstChild.data.replace('\n\t', '') }

    {
      id: postId
      message: convertedMessage
      attachments: attachments
      timestamp: timestamp
      user: author
    }
# Collects the profile URLs of all members listed on the current page.
#
# Returns an array of href strings.
findUserLinks = ->
  memberAnchors = document.querySelectorAll('div.container.members a.user-link')
  (memberAnchor.href for memberAnchor in memberAnchors)
# Reads the member profile that is currently open.
#
# replaceHtml - function that turns an element into plain text.
#
# Returns { name, signature, status, registered }. The signature is empty
# unless the converted text of the last content box contains the
# "Signature" heading line, which is then removed.
getUser = (replaceHtml) ->
  profile =
    name: document.querySelectorAll('span.big_username')[0].textContent
    signature: ''

  signatureBox = document.querySelector('td#center-column > div.content-box:last-child')
  if signatureBox
    converted = replaceHtml signatureBox
    if /Signature\n/.test(converted)
      profile.signature = converted.replace 'Signature\n', ''

  statusNodes = document.querySelectorAll('form.form_user_status div.content-box tr span.personal-text')
  profile.status = if statusNodes.length > 0 then statusNodes[0].textContent else ''

  # The data-timestamp attribute is divided by 1000 before it is stored.
  registrationNode = document.querySelectorAll('td#center-column > div.content-box abbr.time')[0]
  profile.registered = parseInt(registrationNode.attributes['data-timestamp'].value, 10) / 1000
  return profile
# Reports a missing command-line option and asks casper to exit.
#
# argument - name of the option that was not supplied.
missingArgumentError = (argument) ->
  complaint = "missing the #{ argument } argument"
  console.log complaint
  casper.exit()
# Looks up a required command-line option. A missing (or falsy) value is
# reported through missingArgumentError and yields undefined.
requiredOption = (optionName) ->
  optionValue = casper.cli.options[optionName]
  return optionValue if optionValue
  missingArgumentError optionName
  undefined

# All four options are mandatory.
proboardNr = requiredOption 'board-nr'
proboardName = requiredOption 'board-name'
user = requiredOption 'user'
password = requiredOption 'password'

# Entry points of the forum: the front page and the member list.
proboardUrl = "http://#{ proboardName }.proboards.com/"
proboardUserUrl = "#{ proboardUrl }members"
# Identify as a desktop Firefox for every request.
casper.userAgent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:30.0) Gecko/20100101 Firefox/30.0'

# Open the forum front page first; the callback does nothing.
# NOTE(review): the flattened source does not show whether the login step
# below was nested inside this callback - it is reconstructed here as a
# separate top-level step; confirm against the original file.
casper.start proboardUrl, ->

# Log in by posting the forum number and the credentials given on the command
# line to the central login form; the callback does nothing.
casper.thenOpen 'https://login.proboards.com/forum_submit/login',
  method: 'post'
  data:
    forum: proboardNr
    email: user
    password: password
    continue: 'Continue'
, ->
# Schedules the casper steps that read one board: its sub-boards
# (recursively), all of its threads and all posts of every thread.
#
# board - object with `link` and `title`; `boards` and `threads` are filled
#         in here, and each thread receives `posts` and `poll`.
#
# Images and attachments referenced by the posts are downloaded below data/
# and their URLs are rewritten to `{{baseurl}}` placeholders.
#
# NOTE(review): the nesting below is reconstructed from a flattened listing -
# confirm against the original file.
readBoard = (board) ->
  casper.thenOpen board.link, ->
    # Sub-boards are handled first, by recursion.
    board.boards = @evaluate findBoards
    @each board.boards, (casper, subboard) ->
      readBoard subboard
    # The board is opened again before its own threads are read.
    @thenOpen board.link, ->
      @echo "getting threads for board '#{ board.title }'..."
      board.threads = []
      boardPages = @evaluate findPages
      # Collect the threads of every page of the board.
      @each boardPages, (casper, boardPage) ->
        @thenOpen boardPage, ->
          board.threads = board.threads.concat @evaluate findThreads
      # Scheduled as its own step so that board.threads is read only after
      # the page steps above have filled it.
      @then ->
        @each board.threads, (casper, thread) ->
          thread.posts = []
          @thenOpen thread.link, ->
            @echo "\tgetting posts for thread '#{ thread.title }'..."
            # A poll is stored as a screenshot named after the last segment
            # of the thread link.
            pollName = null
            if @exists 'div.poll.show.ui-poll'
              console.log '\t\tsaving poll...'
              linkParts = thread.link.split '/'
              pollName = "#{linkParts[linkParts.length - 1]}.png"
              @captureSelector "data/images/polls/#{pollName}", 'div.poll.show.ui-poll'
            thread.poll = pollName
            threadPages = @evaluate findPages
            # Collect the posts of every page of the thread.
            @each threadPages, (casper, threadPage) ->
              @thenOpen threadPage, ->
                posts = @evaluate findPosts, replaceHtml
                @each posts, (casper, post) ->
                  # Download inline images and attachments, then point the
                  # post at the local copies.
                  post.message = loadImages post.message
                  for attachment in post.attachments
                    casper.download attachment.url, "data/attachments/#{attachment.name}"
                    attachment.url = "{{baseurl}}/attachments/#{attachment.name}"
                thread.posts = thread.posts.concat posts
            # Once all pages are read, the poll screenshot is prepended to
            # the first post of the thread.
            @then ->
              if thread.poll and thread.posts[0]
                thread.posts[0].message = "[img]{{baseurl}}/images/polls/#{thread.poll}[/img]\n\n#{thread.posts[0].message}"
              if thread.poll and not thread.posts[0]
                console.log "how the fuck did you manage that?"
# Everything that gets scraped ends up in this object: `boards` (nested, with
# threads and posts) and `users`.
proboard = {}

# Read all boards, starting from the forum front page.
casper.thenOpen proboardUrl, ->
  proboard.boards = @evaluate findBoards
  @each proboard.boards, (casper, board) ->
    readBoard board

# Read the profile of every member listed on the member pages.
casper.thenOpen proboardUserUrl, ->
  proboard.users = []
  userPages = @evaluate findPages
  @each userPages, (casper, userPage) ->
    @thenOpen userPage, ->
      userlinks = @evaluate findUserLinks
      @each userlinks, (casper, userlink) ->
        @thenOpen userlink, ->
          @echo "getting userinfo for '#{ userlink }'..."
          # Deliberately not called `user`: CoffeeScript has no shadowing, so
          # assigning to `user` here would overwrite the top-level variable
          # that holds the login e-mail.
          profile = @evaluate getUser, replaceHtml
          # Skip profiles that could not be read instead of failing on the
          # property access below.
          unless profile
            @echo "could not read userinfo for '#{ userlink }'", "ERROR"
            return
          profile.signature = loadImages profile.signature
          proboard.users.push profile

# Write the collected data as tab-indented JSON.
casper.then ->
  json = JSON.stringify(proboard, null, '\t')
  fs.write "data/#{ proboardName }.json", json, 'w'

casper.run()