
/src/Tools/offlinedoc/downloadwiki.py

https://github.com/emagdalena/FreeCAD
#!/usr/bin/env python
#***************************************************************************
#*                                                                         *
#*   Copyright (c) 2009 Yorik van Havre <yorik@uncreated.net>              *
#*                                                                         *
#*   This program is free software; you can redistribute it and/or modify *
#*   it under the terms of the GNU Lesser General Public License (LGPL)   *
#*   as published by the Free Software Foundation; either version 2 of    *
#*   the License, or (at your option) any later version.                  *
#*   for detail see the LICENCE text file.                                *
#*                                                                         *
#*   This program is distributed in the hope that it will be useful,      *
#*   but WITHOUT ANY WARRANTY; without even the implied warranty of       *
#*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        *
#*   GNU Library General Public License for more details.                 *
#*                                                                         *
#*   You should have received a copy of the GNU Library General Public    *
#*   License along with this program; if not, write to the Free Software  *
#*   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 *
#*   USA                                                                  *
#*                                                                         *
#***************************************************************************
__title__ = "downloadwiki"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://free-cad.sf.net"
"""
This script retrieves the contents of a wiki site from a pages list
"""
import sys, os, re, tempfile, getopt
from urllib2 import urlopen, HTTPError

#    CONFIGURATION      #################################################

DEFAULTURL = "http://sourceforge.net/apps/mediawiki/free-cad" # default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation','User_hub','Main_Page','About_this_site'] # pages that won't be fetched (kept online)
GETTRANSLATIONS = False # set to True if you want to get the translations too
MAXFAIL = 3 # max number of retries if a download fails
VERBOSE = True # display what's going on; otherwise, runs totally silent

#    END CONFIGURATION  ##############################################

FOLDER = "./localwiki"
LISTFILE = "wikifiles.txt"
URL = DEFAULTURL
wikiindex = "/index.php?title="
defaultfile = "<html><head><link type='text/css' href='wiki.css' rel='stylesheet'></head><body>&nbsp;</body></html>"
css = """
/* Basic CSS for offline wiki rendering */

body {
  font-family: Arial,Helvetica,sans-serif;
  font-size: 13px;
  text-align: justify;
  background: #ffffff;
  color: #000000;
}

h1 {
  font-size: 2.2em;
  font-weight: bold;
  background: #46A4D0;
  color: white;
  padding: 5px;
  border-radius: 5px;
}

pre {
  border: 1px solid #888888;
  text-align: left;
  background: #EEEEEE;
  padding: 5px;
  border-radius: 5px;
}

a:link, a:visited {
  font-weight: bold;
  text-decoration: none;
  color: #0084FF;
}

a:hover {
  text-decoration: underline;
}

.printfooter {
  font-size: 0.8em;
  color: #333333;
  border-top: 1px solid #333333;
}

.wikitable #toc {
  font-size: 0.8em;
}

#toc, .docnav {
  display: none;
}

.ct, .ctTitle, .ctOdd, .ctEven th {
  text-align: left;
  width: 200px;
  float: right;
  background: #eeeeee;
}
"""
def crawl():
    "downloads an entire wiki site"
    global processed # also appended to by fetchimage()
    processed = []
    if VERBOSE: print "crawling ", URL, ", saving in ", FOLDER
    if not os.path.isdir(FOLDER): os.mkdir(FOLDER)
    file = open(FOLDER + os.sep + "wiki.css",'wb')
    file.write(css)
    file.close()
    dfile = open(FOLDER + os.sep + "default.html",'wb')
    dfile.write(defaultfile)
    dfile.close()
    lfile = open(LISTFILE)
    global locallist
    locallist = []
    for l in lfile: locallist.append(l.replace("\n",""))
    lfile.close()
    todolist = locallist[:]
    print "getting",len(todolist),"files..."
    count = 1
    get(INDEX) # fetch the table of contents first
    while todolist:
        targetpage = todolist.pop()
        if VERBOSE: print count, ": Fetching ", targetpage
        get(targetpage)
        count += 1
    if VERBOSE: print "Fetched ", count - 1, " pages"
    if VERBOSE: print "All done!"
    return 0
def get(page):
    "downloads a single page or image and saves a cleaned local copy"
    if page[-4:] in [".png",".jpg",".svg",".gif","jpeg"]:
        print "getting image",page
        fetchimage(page)
    elif not exists(page):
        html = fetchpage(page)
        if not html:
            return # fetchpage() already printed the error
        html = cleanhtml(html)
        pages = getlinks(html)
        html = cleanlinks(html,pages)
        html = cleanimagelinks(html)
        output(html,page)
    else:
        if VERBOSE: print "skipping",page
def getlinks(html):
    "returns a list of wikipage links in html file"
    links = re.findall('<a[^>]*>.*?</a>',html)
    pages = []
    for l in links:
        # rg = re.findall('php\?title=(.*)\" title',l)
        rg = re.findall('href=.*?php\?title=(.*?)"',l)
        if rg:
            rg = rg[0]
            if "#" in rg:
                rg = rg.split('#')[0]
            if ":" in rg:
                NORETRIEVE.append(rg)
            if ";" in rg:
                NORETRIEVE.append(rg)
            if "&" in rg:
                NORETRIEVE.append(rg)
            if "/" in rg:
                if not GETTRANSLATIONS:
                    NORETRIEVE.append(rg)
            pages.append(rg)
    return pages
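# Titles containing ':', ';' or '&' (special pages and query strings), plus
# '/' (translated subpages) when GETTRANSLATIONS is off, are appended to
# NORETRIEVE above, so cleanlinks() keeps them pointing to the online wiki
# instead of a local file.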
def getimagelinks(html):
    "returns a list of image links found in an html file"
    return re.findall('<img.*?src="(.*?)"',html)
def cleanhtml(html):
    "cleans given html code from dirty script stuff"
    html = html.replace('\n','Wlinebreak') # removing linebreaks for regex processing
    html = re.compile('(.*)<div[^>]+column-content+[^>]+>').sub('',html) # stripping before content
    html = re.compile('<div[^>]+column-one+[^>]+>.*').sub('',html) # stripping after content
    html = re.compile('<!--[^>]+-->').sub('',html) # removing comment tags
    html = re.compile('<script[^>]*>.*?</script>').sub('',html) # removing script tags
    html = re.compile('<!--\[if[^>]*>.*?endif\]-->').sub('',html) # removing IE tags
    html = re.compile('<div id="jump-to-nav"[^>]*>.*?</div>').sub('',html) # removing nav div
    html = re.compile('<h3 id="siteSub"[^>]*>.*?</h3>').sub('',html) # removing print subtitle
    html = re.compile('Retrieved from').sub('Online version:',html) # changing online title
    html = re.compile('<div id="mw-normal-catlinks[^>]>.*?</div>').sub('',html) # removing catlinks
    html = re.compile('<div class="NavHead.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div class="NavContent.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div class="NavEnd.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div class="docnav.*?</div></div>').sub('',html) # removing docnav
    if not GETTRANSLATIONS:
        html = re.compile('<div class="languages.*?</div>').sub('',html) # removing translations links
    html = re.compile('Wlinebreak').sub('\n',html) # restoring original linebreaks
    return html
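# Note: replacing every newline with the "Wlinebreak" placeholder is what lets
# the '.*?' patterns above match across what were originally several lines,
# since none of these regexes are compiled with re.DOTALL.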
def cleanlinks(html, pages=None):
    "cleans page links found in html"
    if not pages: pages = getlinks(html)
    for page in pages:
        if page in NORETRIEVE:
            output = 'href="' + URL + wikiindex + page + '"'
        else:
            output = 'href="' + page.replace("/","-") + '.html"'
        # re.escape: page titles may contain regex metacharacters
        html = re.compile('href="[^"]+' + re.escape(page) + '"').sub(output,html)
    return html
def cleanimagelinks(html,links=None):
    "cleans image links in given html"
    if not links: links = getimagelinks(html)
    if links:
        for l in links:
            nl = re.findall('.*/(.*)',l)
            if nl: html = html.replace(l,nl[0])
            # fetchimage(l)
    return html
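# Image src attributes are reduced to their bare filename, so they resolve
# against the images saved next to the html files in FOLDER.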
def fetchpage(page):
    "retrieves given page from the wiki"
    failcount = 0
    while failcount < MAXFAIL:
        try:
            html = (urlopen(URL + wikiindex + page).read())
            return html
        except HTTPError:
            failcount += 1
    print 'Error: unable to fetch page ' + page
def fetchimage(imagelink):
    "retrieves given image from the wiki and saves it"
    if imagelink[0:5] == "File:":
        print "Skipping file page link"
        return
    filename = re.findall('.*/(.*)',imagelink)[0]
    print "saving",filename
    if not exists(filename,image=True):
        failcount = 0
        while failcount < MAXFAIL:
            try:
                if VERBOSE: print "Fetching " + filename
                data = (urlopen(webroot(URL) + imagelink).read())
                path = local(filename,image=True)
                file = open(path,'wb')
                file.write(data)
                file.close()
                processed.append(filename)
                return
            except:
                failcount += 1
        print 'Error: unable to fetch file ' + filename
def local(page,image=False):
    "returns a local path for a given page/image"
    if image:
        return FOLDER + os.sep + page
    else:
        return FOLDER + os.sep + page + '.html'
def exists(page,image=False):
    "checks if given page/image already exists"
    path = local(page.replace("/","-"),image)
    if os.path.exists(path): return True
    return False
def webroot(url):
    "returns the scheme and host part of the given url"
    return re.findall('(http://.*?)/',url)[0]
def output(html,page):
    "encapsulates raw html code into nice html body"
    header = "<html><head>"
    header += "<title>"
    header += page
    header += "</title>"
    header += "<link type='text/css' href='wiki.css' rel='stylesheet'>"
    header += "</head><body>"
    footer = "</body></html>"
    html = header+html+footer
    filename = local(page.replace("/","-"))
    print "saving",filename
    file = open(filename,'wb')
    file.write(html)
    file.close()
if __name__ == "__main__":
    crawl()