/src/Tools/wiki2qhelp.py
Python | 622 lines | 557 code | 22 blank | 43 comment | 37 complexity | 89fc4b01d50408b093fa8b1b8cf2e680 MD5 | raw file
Possible License(s): LGPL-2.0, BSD-3-Clause
- #!/usr/bin/env python
- #***************************************************************************
- #* *
- #* Copyright (c) 2009 Yorik van Havre <yorik@uncreated.net> *
- #* *
- #* This program is free software; you can redistribute it and/or modify *
- #* it under the terms of the GNU Library General Public License (LGPL) *
- #* as published by the Free Software Foundation; either version 2 of *
- #* the License, or (at your option) any later version. *
- #* for detail see the LICENCE text file. *
- #* *
- #* This program is distributed in the hope that it will be useful, *
- #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
- #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
- #* GNU Library General Public License for more details. *
- #* *
- #* You should have received a copy of the GNU Library General Public *
- #* License along with this program; if not, write to the Free Software *
- #* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
- #* USA *
- #* *
- #***************************************************************************
"""
This script retrieves the contents of a wiki site and saves it locally,
then calls the Qt help compiler to produce a qhelp-assistant help file.
The script can be called without arguments, in which case it uses the
default url defined in the configuration section, or by passing it a url
and optionally a TOC name.
"""

# The description above must be the first statement of the module so that
# Python stores it as the module docstring (__doc__).
__title__ = "wiki2qhelp"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://yorik.uncreated.net"
- import sys, os, re, tempfile, getopt
- from urllib2 import urlopen, HTTPError
# CONFIGURATION #################################################

# Wiki address used when no URL is passed.
DEFAULTURL = "http://sourceforge.net/apps/mediawiki/free-cad"

# Start page from where the wiki is crawled.
INDEX = "Online_Help_Toc"

# Pages that are never fetched; links to them keep pointing online.
NORETRIEVE = [
    'Manual',
    'Developer_hub',
    'Power_users_hub',
    'Users_hub',
    'Source_documentation',
    'User_hub',
    'Main_Page',
    'About_this_site',
]

GETTRANSLATIONS = True  # also get the translated pages
MAXFAIL = 3             # maximum number of retries when a download fails
VERBOSE = True          # report progress; otherwise run silently
COMPILE = True          # whether qt assistant is used to compile the help file

# Where the resulting qch file is stored.
OUTPUTPATH = os.path.expanduser("~") + os.sep + '.FreeCAD'

QHELPCOMPILER = 'qhelpgenerator'
QCOLLECTIOMGENERATOR = 'qcollectiongenerator'

PDFOUTPUT = False       # generate a pdf file instead of qhelp
REMOVE = True           # remove the temp html files after a successful run
PDFCONVERTOR = 'pisa'   # either 'pisa' or 'htmldoc'

# END CONFIGURATION ##############################################

# Runtime state shared by the functions of this script.
URL = DEFAULTURL
TMPFOLDER = tempfile.mkdtemp()
wikiindex = "/index.php?title="
processed = []
pisa = None
# Help text written to stderr for -h/--help and on option errors.
# The three %s slots receive the default url, index page and output path.
_USAGE_TEMPLATE = '''
wiki2qhelp [options] [url] [index page]
fetches wiki pages from the specified url, starting from specified
index page, and outputs a .qch file in the specified output path.
You must have qassistant installed.
If no url, index page or output path is specified, the following
default values will be used:
url: %s
index page: %s
output path: %s
Options:
-v: Verbose mode
-c filename or --helpcompiler-exe filename: Uses filename as qt help compiler
-g filename or --helpgenerator-exe filename: Uses filename as qt collection generator
-o path or --out-path path: Specifies an output path
-h or --help: Displays this help message
-p [convertor] or --pdf [convertor]: Outputs a pdf file instead of qhelp. Convertor
can be pisa (default) or htmldoc
-t path or --tempfolder path: Uses path as temp folder for storing html files
'''
usage = _USAGE_TEMPLATE % (DEFAULTURL, INDEX, OUTPUTPATH)
# Stylesheet text; crawl() writes it to wiki.css in the temp folder and
# every saved page links to that file.
css = "\n".join([
    "/* Basic CSS for offline wiki rendering */",
    "body {",
    "font-family: Arial,Helvetica,sans-serif;",
    "font-size: 13px;",
    "text-align: justify;",
    "}",
    "h1 {",
    "font-size: 2.2em;",
    "font-weight: bold;",
    "background: #46A4D0;",
    "color: white;",
    "padding: 5px;",
    "-moz-border-radius: 5px;",
    "-webkit-border-radius: 5px;",
    "}",
    "pre {",
    "border: 1px dashed #333333;",
    "text-align: left;",
    "background: #EEEEEE;",
    "padding: 5px;",
    "}",
    "a:link, a:visited {",
    "font-weight: bold;",
    "text-decoration: none;",
    "color: #0084FF;",
    "}",
    "a:hover {",
    "text-decoration: underline;",
    "}",
    ".printfooter {",
    "font-size: 0.8em;",
    "color: #333333;",
    "border-top: 1px solid #333333;",
    "}",
    ".wikitable #toc {",
    "font-size: 0.8em;",
    "}",
    "#toc,.docnav {",
    "display: none;",
    "}",
    "",  # keeps the trailing newline of the original text
])
# Running totals of the files and directories removed by rmall().
fcount = dcount = 0


def rmall(dirPath):
    """Delete ``dirPath`` and everything below it.

    Symbolic links are removed as plain entries and never followed, so a
    link pointing at a directory outside the tree cannot cause that
    directory's contents to be deleted.

    The module-level counters ``fcount`` and ``dcount`` are incremented for
    every removed file (or link) and directory respectively.

    Raises OSError if an entry cannot be listed or removed.
    """
    global fcount, dcount
    for name in os.listdir(dirPath):  # remove all contents first
        path = os.path.join(dirPath, name)
        if os.path.islink(path) or not os.path.isdir(path):
            os.remove(path)  # simple files and links
            fcount += 1
        else:
            rmall(path)  # recurse into real sub-directories only
    os.rmdir(dirPath)  # remove the now-empty directory
    dcount += 1
def crawl(site=DEFAULTURL):
    """Download an entire wiki site and build the configured output.

    ``site`` is the base url of the wiki; it is stored in the global ``URL``
    so that the page and image fetchers use it.

    Pages are fetched starting from ``INDEX``; every page linked from a
    fetched page is fetched too, unless it is listed in ``NORETRIEVE``.
    When ``PDFOUTPUT`` is set a pdf is produced instead of the qhelp file;
    otherwise, when ``COMPILE`` is set, the qhelp files are generated.

    Returns 0 on success and 1 when a required tool or module is missing or
    when the qhelp generation fails.
    """
    # The imports below are bound as globals because the pdf helpers of this
    # module look these names up at module level.
    global URL, pisa, PdfFileReader, PdfFileWriter

    # A pdf replaces the qhelp output, so the Qt tools are only needed (and
    # the qhelp build only runs) when no pdf was requested.
    build_qhelp = COMPILE and not PDFOUTPUT

    # tests ###############################################

    if build_qhelp and os.system(QHELPCOMPILER + ' -v'):
        print("Error: QAssistant not fully installed, exiting.")
        print(QHELPCOMPILER)
        return 1
    if build_qhelp and os.system(QCOLLECTIOMGENERATOR + ' -v'):
        print("Error: QAssistant not fully installed, exiting.")
        return 1
    if PDFOUTPUT:
        if PDFCONVERTOR == 'pisa':
            try:
                import ho.pisa as pisa
            except ImportError:
                print("Error: Python-pisa not installed, exiting.")
                return 1
        elif os.system('htmldoc --version'):
            print("Error: Htmldoc not found, exiting.")
            return 1
        try:
            from pyPdf import PdfFileReader, PdfFileWriter
        except ImportError:
            print("Error: Python-pypdf not installed, exiting.")
            return 1

    # run ########################################################

    URL = site
    if VERBOSE:
        print("crawling %s, saving in %s" % (URL, TMPFOLDER))
    if not os.path.isdir(TMPFOLDER):
        os.mkdir(TMPFOLDER)
    stylesheet = css
    if not isinstance(stylesheet, bytes):
        # the file is opened in binary mode, which needs bytes on Python 3
        stylesheet = stylesheet.encode('utf-8')
    with open(TMPFOLDER + os.sep + "wiki.css", 'wb') as cssfile:
        cssfile.write(stylesheet)

    todolist = []
    count = 1
    todolist.extend(get(INDEX))
    processed.append(INDEX)  # avoids downloading the index a second time
    while todolist:
        targetpage = todolist.pop()
        if targetpage in NORETRIEVE:
            continue
        if VERBOSE:
            print("%d: Fetching %s" % (count, targetpage))
        pages = get(targetpage)
        count += 1
        processed.append(targetpage)
        for p in pages:
            if p not in todolist and p not in processed:
                todolist.append(p)
    if VERBOSE:
        print("Fetched %d pages" % count)

    # The current values of the globals are passed explicitly so that
    # settings changed after import (temp folder, convertor) are honoured.
    if PDFOUTPUT:
        buildpdffiles(TMPFOLDER, PDFCONVERTOR)
        joinpdf(TMPFOLDER, INDEX)
        if REMOVE:
            if VERBOSE:
                print("Deleting temp files...")
            rmall(TMPFOLDER)
    elif COMPILE:
        qhp = buildtoc(TMPFOLDER, INDEX)
        qhcp = createCollProjectFile(TMPFOLDER)
        if generate(qhcp) or compile(qhp):
            print("Temp Folder %s has not been deleted." % TMPFOLDER)
            return 1
        if REMOVE:
            if VERBOSE:
                print("Deleting temp files...")
            rmall(TMPFOLDER)
    if VERBOSE:
        print("All done!")
    return 0
def buildpdffiles(folder=None, convertor=None):
    """Scan a folder for html files and convert them all to pdf.

    ``folder`` defaults to the current value of ``TMPFOLDER`` and
    ``convertor`` to the current value of ``PDFCONVERTOR``. Both are looked
    up at call time, so values changed after import (for example by the
    command-line options) are taken into account.

    Files are converted with pisa when ``convertor`` is 'pisa', and with
    htmldoc otherwise.
    """
    if folder is None:
        folder = TMPFOLDER
    if convertor is None:
        convertor = PDFCONVERTOR
    htmlfiles = [name for name in os.listdir(folder) if name[-5:] == '.html']
    for name in htmlfiles:
        pagename = name[:-5]  # strip the '.html' extension
        if convertor == 'pisa':
            createpdf_pisa(pagename, folder)
        else:
            createpdf_htmldoc(pagename, folder)
def fetch_resources(uri, rel):
    """Link callback handed to pisa so it can locate images, stylesheets, etc.

    ``uri`` is the href attribute of the html link element; every "./" it
    contains is dropped and the remainder is joined onto ``TMPFOLDER``.
    ``rel`` gives a relative path but is not used here.

    Note from Yorik: Not working!!
    """
    relative = uri.replace("./", "")
    return os.path.join(TMPFOLDER, relative)
def createpdf_pisa(pagename, folder=None):
    """Create a pdf file from a saved page using pisa (python module).

    ``pagename`` is the page name without extension; ``<pagename>.html`` is
    read from ``folder`` and ``<pagename>.pdf`` is written next to it.
    ``folder`` defaults to the current value of ``TMPFOLDER``.

    Returns pisa's error value when the conversion reports one, 0 otherwise.
    Raises ImportError if the pisa module is not installed.
    """
    global pisa
    if pisa is None:
        # the module-level placeholder is still unset: load pisa on demand
        import ho.pisa as pisa
    if folder is None:
        folder = TMPFOLDER
    htmlpath = folder + os.sep + pagename + '.html'
    pdfpath = folder + os.sep + pagename + '.pdf'
    if VERBOSE:
        print("Converting " + pagename + " to pdf...")
    # both files are closed even if the conversion raises
    with open(htmlpath, 'r') as infile:
        with open(pdfpath, 'wb') as outfile:
            pdf = pisa.CreatePDF(infile, outfile, folder,
                                 link_callback=fetch_resources)
    if pdf.err:
        return pdf.err
    return 0
def createpdf_htmldoc(pagename, folder=None):
    """Create a pdf file from a saved page using htmldoc.

    htmldoc is an external application, but it supports images.

    ``pagename`` is the page name without extension; ``folder`` defaults to
    the current value of ``TMPFOLDER``.

    Returns the exit status of htmldoc (0 on success), or 127 when htmldoc
    cannot be started.
    """
    import subprocess
    if folder is None:
        folder = TMPFOLDER
    infile = folder + os.sep + pagename + '.html'
    outfile = folder + os.sep + pagename + '.pdf'
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    try:
        return subprocess.call(['htmldoc', '--webpage', '-f', outfile, infile])
    except OSError:
        return 127
def joinpdf(folder=None, startpage=None, outputname='freecad.pdf'):
    """Create one pdf file from several others, following the order of startpage.

    The links found in the first-to-last <ul> span of ``<startpage>.html``
    give the page order. For each link whose html page exists in ``folder``
    the matching pdf is appended to the result, which is written to
    ``OUTPUTPATH`` under ``outputname``.

    ``folder`` defaults to the current ``TMPFOLDER`` and ``startpage`` to the
    current ``INDEX``. Nothing is written when the start page contains no
    list.
    """
    from pyPdf import PdfFileReader, PdfFileWriter
    if folder is None:
        folder = TMPFOLDER
    if startpage is None:
        startpage = INDEX
    if VERBOSE:
        print("Building table of contents...")
    with open(folder + os.sep + startpage + '.html') as f:
        html = f.read()
    html = html.replace("\n", " ")
    html = html.replace("> <", "><")
    lists = re.findall("<ul.*/ul>", html)
    if not lists:
        print('Error: no page list found in ' + startpage)
        return
    pages = re.findall('href="(.*?)"', lists[0])
    # NOTE(review): the start page is inserted as second entry, as in the
    # original code; presumably the first link is a cover page - confirm.
    pages.insert(1, startpage + ".html")
    result = PdfFileWriter()
    opened = []  # inputs stay open until the result has been written
    try:
        for p in pages:
            pagename = p[:-5]  # strip the '.html' extension
            if not os.path.exists(folder + os.sep + pagename + '.html'):
                continue
            if VERBOSE:
                print('Appending ' + p)
            try:
                handle = open(folder + os.sep + pagename + '.pdf', 'rb')
                opened.append(handle)
                inputfile = PdfFileReader(handle)
            except Exception as err:
                # best effort: a missing or unreadable pdf is skipped
                print('Unable to append %s (%s)' % (p, err))
            else:
                for i in range(inputfile.getNumPages()):
                    result.addPage(inputfile.getPage(i))
        with open(OUTPUTPATH + os.sep + outputname, 'wb') as outputfile:
            result.write(outputfile)
    finally:
        for handle in opened:
            handle.close()
    if VERBOSE:
        print('Successfully created ' + OUTPUTPATH + os.sep + outputname)
-
def compile(qhpfile, outputname='freecad.qch'):
    """Compile the whole html doc with the Qt help compiler.

    Runs ``QHELPCOMPILER`` on ``qhpfile`` and writes the result to
    ``OUTPUTPATH`` under ``outputname``.

    Returns the exit status of the compiler: 0 on success, non-zero on
    failure (127 when the compiler cannot be started), so that callers can
    detect a failed build.
    """
    import subprocess
    qchfile = OUTPUTPATH + os.sep + outputname
    # An argument list (no shell) keeps paths containing spaces intact.
    try:
        status = subprocess.call([QHELPCOMPILER, qhpfile, '-o', qchfile])
    except OSError:
        status = 127
    if status == 0 and VERBOSE:
        print("Successfully created " + qchfile)
    return status
def generate(qhcpfile):
    """Generate qassistant-specific settings like icon, title, ...

    Writes ``about.txt`` into ``TMPFOLDER``, then runs
    ``QCOLLECTIOMGENERATOR`` on ``qhcpfile`` to produce ``freecad.qhc`` in
    ``OUTPUTPATH``.

    Returns the exit status of the generator: 0 on success, non-zero on
    failure (127 when the generator cannot be started), so that callers can
    detect a failed build.
    """
    import subprocess
    txt = """
The help files for FreeCAD.
"""
    with open(TMPFOLDER + os.sep + "about.txt", "w") as about:
        about.write(txt)
    qhcfile = OUTPUTPATH + os.sep + "freecad.qhc"
    # An argument list (no shell) keeps paths containing spaces intact.
    try:
        status = subprocess.call([QCOLLECTIOMGENERATOR, qhcpfile, '-o', qhcfile])
    except OSError:
        status = 127
    if status == 0 and VERBOSE:
        print("Successfully created " + qhcfile)
    return status
def createCollProjectFile(folder=None):
    """Write the Qt help collection project file (freecad.qhcp).

    The file is written into ``folder``, which defaults to the current value
    of ``TMPFOLDER`` (looked up at call time, so a temp folder changed after
    import is honoured).

    Returns the path of the written file.
    """
    if folder is None:
        folder = TMPFOLDER
    qprojectfile = '''<?xml version="1.0" encoding="UTF-8"?>
<QHelpCollectionProject version="1.0">
<assistant>
<title>FreeCAD User Manual</title>
<applicationIcon>Crystal_Clear_app_tutorials.png</applicationIcon>
<cacheDirectory>freecad/freecad</cacheDirectory>
<startPage>qthelp://org.freecad.usermanual_0.9/doc/Online_Help_Startpage.html</startPage>
<aboutMenuText>
<text>About FreeCAD</text>
</aboutMenuText>
<aboutDialog>
<file>about.txt</file>
<!--
<icon>images/icon.png</icon>
-->
<icon>Crystal_Clear_app_tutorials.png</icon>
</aboutDialog>
<enableDocumentationManager>true</enableDocumentationManager>
<enableAddressBar>true</enableAddressBar>
<enableFilterFunctionality>true</enableFilterFunctionality>
</assistant>
<docFiles>
<generate>
<file>
<input>freecad.qhp</input>
<output>freecad.qch</output>
</file>
</generate>
<register>
<file>freecad.qch</file>
</register>
</docFiles>
</QHelpCollectionProject>
'''
    if VERBOSE:
        print("Building project file...")
    qfilename = folder + os.sep + "freecad.qhcp"
    with open(qfilename, 'w') as f:
        f.write(qprojectfile)
    if VERBOSE:
        print("Done writing qhcp file.")
    return qfilename
def buildtoc(folder=None, page=None):
    """Parse the table of contents page and write the Qt help project file.

    ``<page>.html`` is read from ``folder``; its list items become the
    <section> and <keyword> entries of ``freecad.qhp``, and every file
    present in ``folder`` is listed in the <files> section.

    ``folder`` defaults to the current ``TMPFOLDER`` and ``page`` to the
    current ``INDEX`` (both looked up at call time).

    Returns the path of the written qhp file.
    Raises ValueError when the page contains no <ul> list.
    """
    if folder is None:
        folder = TMPFOLDER
    if page is None:
        page = INDEX

    qhelpfile = '''<?xml version="1.0" encoding="UTF-8"?>
<QtHelpProject version="1.0">
<namespace>org.freecad.usermanual_0.9</namespace>
<virtualFolder>doc</virtualFolder>
<!--
<customFilter name="FreeCAD 0.10">
<filterAttribute>FreeCAD</filterAttribute>
<filterAttribute>0.10</filterAttribute>
</customFilter>
-->
<filterSection>
<!--
<filterAttribute>FreeCAD</filterAttribute>
<filterAttribute>0.10</filterAttribute>
-->
<toc>
<inserttoc>
</toc>
<keywords>
<insertkeywords>
</keywords>
<insertfiles>
</filterSection>
</QtHelpProject>
'''

    def getname(line):
        "returns the (title, link) pair of a list item; link is '' without an anchor"
        line = line.replace('<li>', '').replace('</li>', '')
        title = line.strip()
        link = ''
        if "<a" in line:
            titles = re.findall('<a[^>]*>(.*?)</a>', line)
            links = re.findall('href="(.*?)"', line)
            if titles:
                title = titles[0].strip()
            if links:
                link = links[0].strip()
        return title, link

    if VERBOSE:
        print("Building table of contents...")
    with open(folder + os.sep + page + '.html') as f:
        html = f.read()
    html = html.replace("\n", " ")
    html = html.replace("> <", "><")
    lists = re.findall("<ul.*/ul>", html)
    if not lists:
        raise ValueError("no <ul> list found in page " + page)
    html = lists[0]
    items = re.findall('<li[^>]*>.*?</li>|</ul></li>', html)

    inserttoc = '<section title="Table of Contents">\n'
    insertkeywords = ''
    for item in items:
        if "<ul>" not in item:
            if "</ul>" in item:
                # end of a nested list: close the enclosing section
                inserttoc += '</section>\n'
            else:
                title, link = getname(item)
                if link:
                    link = '" ref="' + link
                    insertkeywords += ('<keyword name="' + title + link + '"/>\n')
                inserttoc += ('<section title="' + title + link + '"></section>\n')
        else:
            # the item opens a nested list: its leading part is the parent
            # section (left open), only the last part is closed right away
            subitems = item.split("<ul>")
            last = len(subitems) - 1
            for i, subitem in enumerate(subitems):
                title, link = getname(subitem)
                if link:
                    link = '" ref="' + link
                    insertkeywords += ('<keyword name="' + title + link + '"/>\n')
                trail = '</section>' if i == last else ''
                inserttoc += ('<section title="' + title + link + '">' + trail + '\n')
    inserttoc += '</section>\n'

    insertfiles = "<files>\n"
    for fil in os.listdir(folder):
        insertfiles += ("<file>" + fil + "</file>\n")
    insertfiles += "</files>\n"

    # Plain string replacement: page titles may contain backslashes, which a
    # regular-expression substitution would treat as escapes or group
    # references.
    qhelpfile = qhelpfile.replace('<insertkeywords>', insertkeywords)
    qhelpfile = qhelpfile.replace('<inserttoc>', inserttoc)
    qhelpfile = qhelpfile.replace('<insertfiles>', insertfiles)

    qfilename = folder + os.sep + "freecad.qhp"
    data = qhelpfile
    if not isinstance(data, bytes):
        # the file is opened in binary mode, which needs bytes on Python 3
        data = data.encode('utf-8')
    with open(qfilename, 'wb') as f:
        f.write(data)
    if VERBOSE:
        print("Done writing qhp file.")
    return qfilename
def get(page):
    """Download a single page and return the other pages it links to.

    The page is fetched, cleaned, its page and image links are rewritten,
    and the result is saved locally.

    Returns the list of linked page names, or an empty list when the page
    could not be fetched (nothing is saved in that case).
    """
    html = fetchpage(page)
    if html is None:
        # fetchpage already reported the failure; skip this page instead of
        # crashing the whole crawl
        return []
    html = cleanhtml(html)
    pages = getlinks(html)
    html = cleanlinks(html, pages)
    html = cleanimagelinks(html)
    output(html, page)
    return pages
def cleanhtml(html):
    """Clean the given html code from scripts and wiki navigation markup.

    Everything up to and including the 'column-content' div tag and
    everything from the 'column-one' div tag onwards is dropped; comments,
    scripts, IE conditionals, navigation blocks and category links are
    removed, and 'Retrieved from' becomes 'Online version:'. The translation
    links are removed as well unless ``GETTRANSLATIONS`` is set.

    Returns the cleaned html string.
    """
    # (pattern, replacement) pairs, applied in this order. The patterns are
    # compiled with re.S so that '.' also matches line breaks.
    rules = [
        (r'(.*)<div[^>]+column-content+[^>]+>', ''),     # stripping before content
        (r'<div[^>]+column-one+[^>]+>.*', ''),           # stripping after content
        (r'<!--[^>]+-->', ''),                           # removing comment tags
        (r'<script[^>]*>.*?</script>', ''),              # removing script tags
        (r'<!--\[if[^>]*>.*?endif\]-->', ''),            # removing IE tags
        (r'<div id="jump-to-nav"[^>]*>.*?</div>', ''),   # removing nav div
        (r'<h3 id="siteSub"[^>]*>.*?</h3>', ''),         # removing print subtitle
        (r'Retrieved from', 'Online version:'),          # changing online title
        # [^>]* (not a single [^>]) so that a catlinks div carrying further
        # attributes is removed too
        (r'<div id="mw-normal-catlinks[^>]*>.*?</div>', ''),
        (r'<div class="NavHead.*?</div>', ''),           # removing nav stuff
        (r'<div class="NavContent.*?</div>', ''),        # removing nav stuff
        (r'<div class="NavEnd.*?</div>', ''),            # removing nav stuff
    ]
    if not GETTRANSLATIONS:
        rules.append((r'<div class="languages.*?</div>', ''))  # removing translations links
    for pattern, replacement in rules:
        html = re.compile(pattern, re.S).sub(replacement, html)
    return html
-
def getlinks(html):
    """Return a list of the wiki page links found in the html code.

    A link is any anchor whose href contains 'php?title='; the page name is
    the text after it, with a trailing '#fragment' removed.

    Side effect: page names containing ':', ';' or '&' (and, when
    ``GETTRANSLATIONS`` is off, names containing '/') are added to
    ``NORETRIEVE`` so that they are not downloaded. Each name is added only
    once.
    """
    pages = []
    for anchor in re.findall(r'<a[^>]*>.*?</a>', html):
        found = re.findall(r'href=.*?php\?title=(.*?)"', anchor)
        if not found:
            continue
        name = found[0]
        if "#" in name:
            name = name.split('#')[0]
        excluded = ":" in name or ";" in name or "&" in name
        if "/" in name and not GETTRANSLATIONS:
            excluded = True
        if excluded and name not in NORETRIEVE:
            # avoid growing the list with duplicates on every call
            NORETRIEVE.append(name)
        pages.append(name)
    return pages
def getimagelinks(html):
    """Return the list of src values of the <img> tags found in the html code."""
    image_src = re.compile('<img.*?src="(.*?)"')
    return image_src.findall(html)
def cleanlinks(html, pages=None):
    """Rewrite the wiki page links found in the html code.

    ``pages`` is the list of page names to handle; when empty or omitted it
    is obtained from the html itself. Links to pages listed in
    ``NORETRIEVE`` are turned into absolute links to the online wiki, all
    others into links to the local '<page>.html' file (with '/' replaced by
    '-' in the file name).

    Returns the modified html.
    """
    if not pages:
        pages = getlinks(html)
    for page in pages:
        if page in NORETRIEVE:
            target = 'href="' + URL + wikiindex + page + '"'
        else:
            target = 'href="' + page.replace("/", "-") + '.html"'
        # The page name is escaped because it may contain regex
        # metacharacters, and anchored on 'title=' so that a page named
        # 'Line' does not also rewrite links to 'Draft_Line'.
        pattern = re.compile('href="[^"]*title=' + re.escape(page) + '"')
        # a function replacement inserts the text literally (no backslash
        # processing)
        html = pattern.sub(lambda match, target=target: target, html)
    return html
def cleanimagelinks(html, links=None):
    """Rewrite the image links of the html code and download the images.

    ``links`` is the list of image links to handle; when empty or omitted
    it is taken from the html itself. Each link containing a '/' is
    replaced in the html by the part after its last '/', and every link is
    passed to fetchimage().

    Returns the modified html.
    """
    if not links:
        links = getimagelinks(html)
    if not links:
        return html
    for source in links:
        basename = re.findall('.*/(.*)', source)
        if basename:
            html = html.replace(source, basename[0])
        fetchimage(source)
    return html
def fetchpage(page):
    """Retrieve the given page from the wiki.

    The page is requested from ``URL + wikiindex + page``; a failed download
    is retried until ``MAXFAIL`` attempts have been made.

    Returns the html of the page as a string, or None (after printing an
    error message) when every attempt failed.
    """
    for _attempt in range(MAXFAIL):
        try:
            html = urlopen(URL + wikiindex + page).read()
        except IOError:
            # HTTPError and URLError both derive from IOError, so http
            # errors and network-level failures are retried alike
            continue
        if not isinstance(html, str):
            # Python 3 delivers bytes; the cleaning functions work on text
            html = html.decode('utf-8', 'replace')
        return html
    print('Error: unable to fetch page ' + page)
    return None
def fetchimage(imagelink):
    """Retrieve the given image from the wiki and save it locally.

    The file name is the part of ``imagelink`` after its last '/' (the whole
    link when it contains none). Files already listed in ``processed`` are
    not downloaded again. The image is requested from the web root of
    ``URL`` followed by ``imagelink``; a failed download is retried until
    ``MAXFAIL`` attempts have been made, after which an error message is
    printed.
    """
    filename = imagelink.rsplit('/', 1)[-1]
    if filename in processed:
        return
    reason = None
    for _attempt in range(MAXFAIL):
        try:
            if VERBOSE:
                print("Fetching " + filename)
            data = urlopen(webroot(URL) + imagelink).read()
            with open(local(filename, image=True), 'wb') as imagefile:
                imagefile.write(data)
        except Exception as err:
            # best effort: a missing image must not abort the crawl, but
            # the cause is kept for the error message
            reason = err
        else:
            processed.append(filename)
            return
    print('Error: unable to fetch file %s (%s)' % (filename, reason))
def local(page, image=False):
    """Return the local path for a given page or image.

    Images keep their name as is; pages get the '.html' extension appended.
    The path is located directly inside ``TMPFOLDER``.
    """
    extension = '' if image else '.html'
    return TMPFOLDER + os.sep + page + extension
def exists(page, image=False):
    """Tell whether the local file of the given page or image already exists.

    Returns True when the path given by local() exists, False otherwise.
    """
    return True if os.path.exists(local(page, image)) else False
def webroot(url):
    """Return the scheme and host part of a url, without trailing slash.

    Both http and https urls are accepted, with or without a path after the
    host: 'http://host/some/path' and 'http://host' both give 'http://host'.

    Raises ValueError when ``url`` contains no http(s) address.
    """
    match = re.search(r'(https?://[^/]+)', url)
    if match is None:
        raise ValueError("not an http(s) url: " + repr(url))
    return match.group(1)
def output(html, page):
    """Wrap raw html code into a complete html document and save it.

    The document gets ``page`` as title and links to the 'wiki.css'
    stylesheet. It is written to the local path of the page, with '/' in
    the page name replaced by '-'.
    """
    header = "<html><head>"
    header += "<title>"
    header += page
    header += "</title>"
    header += "<link type='text/css' href='wiki.css' rel='stylesheet'>"
    header += "</head><body>"
    footer = "</body></html>"
    document = header + html + footer
    if not isinstance(document, bytes):
        # the file is opened in binary mode, which needs bytes on Python 3
        document = document.encode('utf-8')
    filename = local(page.replace("/", "-"))
    with open(filename, 'wb') as htmlfile:
        htmlfile.write(document)
def main(arg):
    """Parse the command-line arguments and run the crawler.

    ``arg`` is the list of arguments without the program name; when it is
    None, ``sys.argv[1:]`` is used. The recognised options update the
    module-level settings:

    -v                      sets VERBOSE
    -p / --pdf conv         sets PDFOUTPUT (and PDFCONVERTOR when conv is
                            'pisa' or 'htmldoc')
    -t / --tempfolder path  sets TMPFOLDER
    -c / --helpcompiler-exe sets QHELPCOMPILER
    -g / --helpgenerator-exe sets QCOLLECTIOMGENERATOR
    -o / --out-path path    sets OUTPUTPATH
    --noremove              clears REMOVE (temp files are kept)
    -h / --help             prints the usage text and exits

    The first positional argument, when given, is used as the wiki url.
    Further positional arguments are accepted but not used.

    Returns the value returned by crawl(). Exits with status 2 on an
    invalid option.
    """
    global QHELPCOMPILER, QCOLLECTIOMGENERATOR, OUTPUTPATH, PDFOUTPUT, PDFCONVERTOR, TMPFOLDER
    global VERBOSE, REMOVE, URL
    if arg is None:
        arg = sys.argv[1:]
    try:
        opts, args = getopt.getopt(arg, "hvp:t:c:g:o:", ["help", "pdf=", "noremove", "tempfolder=", "helpcompiler-exe=", "out-path=", "helpgenerator-exe="])
    except getopt.GetoptError:
        # print help information and exit:
        sys.stderr.write(usage)
        sys.exit(2)

    # checking on the options
    for o, a in opts:
        if o == "-v":
            VERBOSE = True
        elif o in ("-p", "--pdf"):
            PDFOUTPUT = True
            if a in ['pisa', 'htmldoc']:
                print("using pdf converter: " + a)
                PDFCONVERTOR = a
        elif o in ("-t", "--tempfolder"):
            print("using tempfolder: " + a)
            TMPFOLDER = a
        elif o in ("-h", "--help"):
            sys.stderr.write(usage)
            sys.exit()
        elif o in ("-c", "--helpcompiler-exe"):
            QHELPCOMPILER = a
            print('Using: ' + QHELPCOMPILER)
        elif o in ("-g", "--helpgenerator-exe"):
            QCOLLECTIOMGENERATOR = a
        elif o in ("-o", "--out-path"):
            print("Using output path: " + a)
            OUTPUTPATH = a
        elif o == "--noremove":
            REMOVE = False

    if args:
        # the global is set here as well so that the fetchers, which read
        # URL at module level, use the requested site
        URL = args[0]
    return crawl(URL)
if __name__ == "__main__":
    # The crawler entry point, main(sys.argv[1:]), is deliberately not
    # invoked: running the script only prints the notice below.
    print("Warning! This script is obsolete. Use the scripts in the offlinedocs folder...")
-