wiki2qhelp.py - CONFIGURATION #############################…

/src/Tools/wiki2qhelp.py

https://github.com/emagdalena/FreeCAD · Python · 622 lines · 544 code · 30 blank · 48 comment · 75 complexity · 89fc4b01d50408b093fa8b1b8cf2e680 MD5 · raw file

#!/usr/bin/env python

#***************************************************************************
#*                                                                         *
#*   Copyright (c) 2009 Yorik van Havre <yorik@uncreated.net>              *  
#*                                                                         *
#*   This program is free software; you can redistribute it and/or modify  *
#*   it under the terms of the GNU Library General Public License (LGPL)   *
#*   as published by the Free Software Foundation; either version 2 of     *
#*   the License, or (at your option) any later version.                   *
#*   for detail see the LICENCE text file.                                 *
#*                                                                         *
#*   This program is distributed in the hope that it will be useful,       *
#*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
#*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
#*   GNU Library General Public License for more details.                  *
#*                                                                         *
#*   You should have received a copy of the GNU Library General Public     *
#*   License along with this program; if not, write to the Free Software   *
#*   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  *
#*   USA                                                                   *
#*                                                                         *
#***************************************************************************

__title__="wiki2qhelp"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://yorik.uncreated.net"

"""
This script retrieves the contents of a wiki site and saves it locally,
then calls qt help compiler to produce a qhelp-assistant help file.
The script can be called without arguments, in which case it uses the
default url below, or with a url and optionally a TOC name.
"""

import sys, os, re, tempfile, getopt
from urllib2 import urlopen, HTTPError

#    CONFIGURATION       #################################################

DEFAULTURL = "http://sourceforge.net/apps/mediawiki/free-cad" # default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site'] # pages that won't be fetched (kept online); getlinks() appends to this list at runtime
GETTRANSLATIONS = True # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
COMPILE = True # Whether qt assistant will be used to compile the final help file
OUTPUTPATH = os.path.expanduser("~")+os.sep+'.FreeCAD' # Where to store the qch file
QHELPCOMPILER = 'qhelpgenerator' # command used by compile() to build the .qch file
QCOLLECTIOMGENERATOR = 'qcollectiongenerator' # command used by generate() to build the .qhc file (NOTE(review): constant name is misspelt, kept because other code refers to it)
PDFOUTPUT = False # if true, a pdf file will be generated instead of qhelp.
REMOVE = True # if true, the temp html files are removed after successful operation
PDFCONVERTOR = 'pisa' # can be 'pisa' or 'htmldoc'

#    END CONFIGURATION      ##############################################

URL = DEFAULTURL # base url of the wiki that fetchpage()/fetchimage() download from
TMPFOLDER = tempfile.mkdtemp() # folder receiving the downloaded files; created as soon as this module is loaded
wikiindex = "/index.php?title=" # appended to URL, followed by a page name, to address a wiki page
processed = [] # names of the pages and image files that were already fetched
pisa = None # placeholder for the pisa module used by createpdf_pisa()
# help text that main() writes to stderr for -h/--help or on invalid options
usage='''
    wiki2qhelp [options] [url] [index page]

    fetches wiki pages from the specified url, starting from specified
    index page, and outputs a .qch file in the specified output path.
    You must have qassistant installed.

    If no url, index page or output path is specified, the following
    default values will be used:
    url: '''+DEFAULTURL+'''
    index page: '''+INDEX+'''
    output path: '''+OUTPUTPATH+'''

    Options:

    -v: Verbose mode
    -c filename or --helpcompiler-exe filename: Uses filename as qt help compiler
    -g filename or --helpgenerator-exe filename: Uses filename as qt collection generator
    -o path or --out-path path: Specifies an output path
    -h or --help: Displays this help message
    -p [convertor] or --pdf [convertor]: Outputs a pdf file instead of qhelp. Convertor
                                         can be pisa (default) or htmldoc
    -t path or --tempfolder path: Uses path as temp folder for storing html files

    '''
# stylesheet that crawl() writes to wiki.css in the temp folder; every page
# saved by output() links to it
css = """/* Basic CSS for offline wiki rendering */

body {
  font-family: Arial,Helvetica,sans-serif;
  font-size: 13px;
  text-align: justify;
  }

h1 {
  font-size: 2.2em;
  font-weight: bold;
  background: #46A4D0;
  color: white;
  padding: 5px;
  -moz-border-radius: 5px;
  -webkit-border-radius: 5px;
  }

pre {
  border: 1px dashed #333333;
  text-align: left;
  background: #EEEEEE;
  padding: 5px;
  }

a:link, a:visited {
  font-weight: bold;
  text-decoration: none;
  color: #0084FF;
  }

a:hover {
  text-decoration: underline;
  }

.printfooter {
  font-size: 0.8em;
  color: #333333;
  border-top: 1px solid #333333;
  }

.wikitable #toc {
  font-size: 0.8em;
  }

#toc,.docnav {
  display: none;
  }

"""
# number of files / directories deleted so far by rmall()
fcount = dcount = 0

def rmall(dirPath):
    """Recursively delete dirPath together with everything below it.

    The module-level counters are updated along the way: fcount is
    incremented for each file removed, dcount for each directory removed.
    """
    global fcount, dcount
    for entry in os.listdir(dirPath):
        target = os.path.join(dirPath, entry)
        if os.path.isdir(target):
            rmall(target)       # empty and remove the sub-directory
            continue
        os.remove(target)       # plain file
        fcount += 1
    os.rmdir(dirPath)           # the directory is empty by now
    dcount += 1

def crawl(site=DEFAULTURL):
    """Download an entire wiki site and build the configured output.

    site is the base url of the wiki. Starting from the INDEX page, every
    linked page that is not listed in NORETRIEVE is saved in TMPFOLDER.
    Depending on PDFOUTPUT and COMPILE the saved pages are then turned
    into a pdf file and/or a qt help file.

    Returns 0 on success, 1 if a required tool is missing or if the help
    file could not be built.
    """
    # These names are used by other functions of this module, so they
    # must be bound at module level rather than as locals of crawl().
    global URL, pisa, PdfFileReader, PdfFileWriter

    # tests ###############################################

    if COMPILE and os.system(QHELPCOMPILER +' -v'):
        print ("Error: QAssistant not fully installed, exiting.")
        print (QHELPCOMPILER)
        return 1
    if COMPILE and os.system(QCOLLECTIOMGENERATOR +' -v'):
        print ("Error: QAssistant not fully installed, exiting.")
        return 1
    if PDFOUTPUT:
        if PDFCONVERTOR == 'pisa':
            try:
                import ho.pisa as pisa
            except ImportError:
                print ("Error: Python-pisa not installed, exiting.")
                return 1
        else:
            if os.system('htmldoc --version'):
                print ("Error: Htmldoc not found, exiting.")
                return 1
        try:
            from pyPdf import PdfFileReader,PdfFileWriter
        except ImportError:
            print ("Error: Python-pypdf not installed, exiting.")
            return 1

    # run ########################################################

    URL = site
    if VERBOSE: print ("crawling " + URL + ", saving in " + TMPFOLDER)
    if not os.path.isdir(TMPFOLDER): os.mkdir(TMPFOLDER)
    with open(TMPFOLDER + os.sep + "wiki.css",'wb') as cssfile:
        cssfile.write(css)
    todolist = []
    count = 1
    indexpages = get(INDEX)
    processed.append(INDEX) # the index page must not be downloaded a second time
    todolist.extend(indexpages)
    while todolist:
        targetpage = todolist.pop()
        if targetpage in NORETRIEVE:
            continue
        if VERBOSE: print (str(count) + ": Fetching " + targetpage)
        pages = get(targetpage)
        count += 1
        processed.append(targetpage)
        for p in pages:
            if (p not in todolist) and (p not in processed):
                todolist.append(p)
    if VERBOSE: print ("Fetched " + str(count) + " pages")

    # The current values are passed explicitly: the default arguments of
    # the helpers were evaluated when they were defined and would ignore
    # a temp folder or converter selected afterwards.
    if PDFOUTPUT:
        buildpdffiles(TMPFOLDER,PDFCONVERTOR)
        joinpdf(TMPFOLDER,INDEX)
    if COMPILE:
        qhp = buildtoc(TMPFOLDER,INDEX)
        qhcp = createCollProjectFile(TMPFOLDER)
        # anything other than 0 (including None) is treated as a failure
        if generate(qhcp) != 0 or compile(qhp) != 0:
            print ("Temp Folder " + TMPFOLDER + " has not been deleted.")
            return 1
    # The temp files are removed only once every output has been built;
    # removing them right after the pdf step would break the qhelp step.
    if REMOVE and (PDFOUTPUT or COMPILE):
        if VERBOSE: print ("Deleting temp files...")
        rmall(TMPFOLDER)
    if VERBOSE: print ("All done!")
    return 0

def buildpdffiles(folder=TMPFOLDER,convertor=PDFCONVERTOR):
    """Convert every .html file found in folder to a pdf file.

    convertor selects the backend: 'pisa' uses createpdf_pisa, any other
    value uses createpdf_htmldoc.
    """
    htmlfiles = [name for name in os.listdir(folder) if name.endswith('.html')]
    for name in htmlfiles:
        pagename = name[:-5]            # file name without the '.html' suffix
        if convertor == 'pisa':
            createpdf_pisa(pagename,folder)
        else:
            createpdf_htmldoc(pagename,folder)

def fetch_resources(uri, rel):
    """
    Link callback handed to pisa so it can locate images, stylesheets, etc.

    'uri' is the href attribute found in the html; every "./" it contains
    is dropped and the remainder is joined onto the temp folder.
    'rel' is a relative path supplied by the caller; it is not used.

    Note from Yorik: Not working!!
    """
    relative = uri.replace("./", "")
    return os.path.join(TMPFOLDER, relative)

def createpdf_pisa(pagename,folder=TMPFOLDER):
    """Create a pdf file from a saved page using pisa (python module).

    Reads folder/pagename.html and writes folder/pagename.pdf.
    Returns 0 on success, otherwise the error value reported by pisa.
    Both files are closed even when the conversion raises.
    """
    basepath = folder + os.sep + pagename
    infile = open(basepath + '.html','r')
    try:
        outfile = open(basepath + '.pdf','wb')
        try:
            if VERBOSE: print ("Converting " + pagename + " to pdf...")
            pdf = pisa.CreatePDF(infile,outfile,folder,link_callback=fetch_resources)
        finally:
            outfile.close()
    finally:
        infile.close()
    if pdf.err: return pdf.err
    return 0

def createpdf_htmldoc(pagename,folder=TMPFOLDER):
    """Create a pdf file from a saved page using htmldoc (external app, but supports images).

    Reads folder/pagename.html and writes folder/pagename.pdf.
    Returns the exit code of htmldoc (0 on success), or 1 if htmldoc
    could not be started.
    """
    import subprocess
    infile = folder + os.sep + pagename+'.html'
    outfile = folder + os.sep + pagename+'.pdf'
    # Page names come from the wiki and may contain spaces or shell
    # metacharacters, so the command is run from an argument list
    # instead of a shell string.
    try:
        return subprocess.call(['htmldoc','--webpage','-f',outfile,infile])
    except OSError:
        print ("Error: unable to run htmldoc")
        return 1

def joinpdf(folder=TMPFOLDER,startpage=INDEX,outputname='freecad.pdf'):
    """Create one pdf file from several others, following the order of startpage.

    The link list is taken from the first <ul>...</ul> span of
    folder/startpage.html. For each linked page that exists() locally, the
    matching pdf file in folder is appended to the result, which is written
    to OUTPUTPATH/outputname. Pdf files that cannot be read are reported and
    skipped. Nothing is written if startpage contains no list.
    """
    if VERBOSE: print ("Building table of contents...")
    with open(folder+os.sep+startpage+'.html') as f:
        html = f.read()
    html = html.replace("\n"," ")
    html = html.replace("> <","><")
    lists = re.findall("<ul.*/ul>",html)
    if not lists:
        print ("Error: no table of contents found in " + startpage)
        return
    pages = re.findall('href="(.*?)"',lists[0])
    pages.insert(1,startpage+".html")
    result = PdfFileWriter()
    # The source files must remain open until the result has been written,
    # so they are collected here and closed together at the end.
    sources = []
    try:
        for p in pages:
            if not exists(p[:-5]):
                continue
            if VERBOSE: print ('Appending ' + p)
            try:
                source = open(folder+os.sep+p[:-5]+'.pdf','rb')
            except IOError:
                print ('Unable to append ' + p)
                continue
            sources.append(source)
            try:
                inputfile = PdfFileReader(source)
            except Exception:
                print ('Unable to append ' + p)
                continue
            for i in range(inputfile.getNumPages()):
                result.addPage(inputfile.getPage(i))
        outputfile = open(OUTPUTPATH + os.sep + outputname,'wb')
        try:
            result.write(outputfile)
        finally:
            outputfile.close()
    finally:
        for source in sources:
            source.close()
    if VERBOSE: print ('Successfully created ' + OUTPUTPATH + os.sep + outputname)
    
def compile(qhpfile,outputname='freecad.qch'):
    """Compile the whole html doc with the qt help compiler.

    Runs QHELPCOMPILER on qhpfile, writing OUTPUTPATH/outputname.
    Returns 0 on success, otherwise the non-zero status reported by
    os.system, so that callers can detect the failure.
    """
    qchfile = OUTPUTPATH + os.sep + outputname
    status = os.system(QHELPCOMPILER + ' '+qhpfile+' -o '+qchfile)
    if status:
        print ("Error: " + QHELPCOMPILER + " failed with status " + str(status))
        return status
    if VERBOSE: print ("Successfully created " + qchfile)
    return 0

def generate(qhcpfile):
    """Generate qassistant-specific settings like icon, title, ...

    Writes about.txt into TMPFOLDER, then runs QCOLLECTIOMGENERATOR on
    qhcpfile to produce OUTPUTPATH/freecad.qhc.
    Returns 0 on success, otherwise the non-zero status reported by
    os.system, so that callers can detect the failure.
    """
    txt="""
The help files for FreeCAD.
    """
    with open(TMPFOLDER + os.sep + "about.txt","w") as about:
        about.write(txt)
    qhcfile = OUTPUTPATH + os.sep + "freecad.qhc"
    status = os.system(QCOLLECTIOMGENERATOR+' '+qhcpfile+' -o '+qhcfile)
    if status:
        print ("Error: " + QCOLLECTIOMGENERATOR + " failed with status " + str(status))
        return status
    if VERBOSE: print ("Successfully created " + qhcfile)
    return 0

def createCollProjectFile(folder=TMPFOLDER):
    """Write the qt help collection project file (freecad.qhcp) into folder.

    Returns the path of the file that was written.
    """
    if VERBOSE: print ("Building project file...")
    target = os.sep.join([folder, "freecad.qhcp"])
    content = '''<?xml version="1.0" encoding="UTF-8"?>
<QHelpCollectionProject version="1.0">
    <assistant>
        <title>FreeCAD User Manual</title>
        <applicationIcon>Crystal_Clear_app_tutorials.png</applicationIcon>
        <cacheDirectory>freecad/freecad</cacheDirectory>
        <startPage>qthelp://org.freecad.usermanual_0.9/doc/Online_Help_Startpage.html</startPage>
        <aboutMenuText>
            <text>About FreeCAD</text>
        </aboutMenuText>
        <aboutDialog>
            <file>about.txt</file>
            <!--
            <icon>images/icon.png</icon>
            -->
            <icon>Crystal_Clear_app_tutorials.png</icon>
        </aboutDialog>
        <enableDocumentationManager>true</enableDocumentationManager>
        <enableAddressBar>true</enableAddressBar>
        <enableFilterFunctionality>true</enableFilterFunctionality>
    </assistant>
    <docFiles>
        <generate>
            <file>
                <input>freecad.qhp</input>
                <output>freecad.qch</output>
                </file>
            </generate>
        <register>
            <file>freecad.qch</file>
        </register>
    </docFiles>
</QHelpCollectionProject>
'''
    handle = open(target,'w')
    handle.write(content)
    handle.close()
    if VERBOSE: print ("Done writing qhcp file.")
    return target

def buildtoc(folder=TMPFOLDER,page=INDEX):
    """Build the qt help project file (freecad.qhp) from the saved index page.

    Reads folder/page.html, turns the first <ul>...</ul> span it contains
    into nested <section> entries plus a <keyword> entry for every linked
    item, lists every file found in folder, and writes the result to
    folder/freecad.qhp.

    Returns the path of the written file.
    Raises IndexError if the page contains no <ul>...</ul> span.
    """

    # Template of the project file; the <inserttoc>, <insertkeywords> and
    # <insertfiles> markers are replaced further down.
    qhelpfile = '''<?xml version="1.0" encoding="UTF-8"?>
<QtHelpProject version="1.0">
    <namespace>org.freecad.usermanual_0.9</namespace>
    <virtualFolder>doc</virtualFolder>
    <!--
    <customFilter name="FreeCAD 0.10">
        <filterAttribute>FreeCAD</filterAttribute>
        <filterAttribute>0.10</filterAttribute>
    </customFilter>
    -->
    <filterSection>
        <!--
        <filterAttribute>FreeCAD</filterAttribute>
        <filterAttribute>0.10</filterAttribute>
        -->
        <toc>
            <inserttoc>
        </toc>
        <keywords>
            <insertkeywords>
        </keywords>
        <insertfiles>
    </filterSection>
</QtHelpProject>
'''

    def getname(line):
        "returns (title, link) of a list item; link is '' when the item has no <a> tag"
        line = re.compile('<li>').sub('',line)
        line = re.compile('</li>').sub('',line)
        title = line.strip()
        link = ''
        if "<a" in line:
            title = re.findall('<a[^>]*>(.*?)</a>',line)[0].strip()
            link = re.findall('href="(.*?)"',line)[0].strip()
        return title,link

    if VERBOSE: print ("Building table of contents...")
    f = open(folder+os.sep+page+'.html')
    html = ''
    for line in f: html += line
    f.close()
    # flatten the page onto one line so the regular expressions below can
    # match across the original line breaks
    html = html.replace("\n"," ")
    html = html.replace("> <","><")
    # greedy match: everything from the first <ul to the last /ul>
    html = re.findall("<ul.*/ul>",html)[0]
    # each item is either a <li ...>...</li> span or a "</ul></li>" marker
    # that ends a nested list
    items = re.findall('<li[^>]*>.*?</li>|</ul></li>',html)
    inserttoc = '<section title="Table of Contents">\n'
    insertkeywords = ''
    for item in items:
        if not ("<ul>" in item):
            if ("</ul>" in item):
                # end of a nested list: close the section opened for its parent
                inserttoc += '</section>\n'
            else:
                # plain entry: a section that is opened and closed at once
                link = ''
                title,link=getname(item)
                if link:
                    # title and link are concatenated so that the output reads
                    # title="<title>" ref="<link>"
                    link='" ref="'+link
                    insertkeywords += ('<keyword name="'+title+link+'"/>\n')
                inserttoc += ('<section title="'+title+link+'"></section>\n')
        else:
            # the item opens one or more nested lists: every part but the last
            # becomes a section that stays open, the last part is its first
            # child and is closed immediately
            subitems = item.split("<ul>")
            for i in range(len(subitems)):
                link = ''
                title,link=getname(subitems[i])
                if link:
                    link='" ref="'+link
                    insertkeywords += ('<keyword name="'+title+link+'"/>\n')
                trail = ''
                if i == len(subitems)-1: trail = '</section>'
                inserttoc += ('<section title="'+title+link+'">'+trail+'\n')
    inserttoc += '</section>\n'

    # every entry of the folder is listed as part of the help project
    insertfiles = "<files>\n"
    for fil in os.listdir(folder):
        insertfiles += ("<file>"+fil+"</file>\n")
    insertfiles += "</files>\n"

    # NOTE(review): titles and links are inserted without XML escaping, and
    # the replacement strings go through re's escape processing - presumably
    # fine for the expected page names, confirm for titles holding quotes
    # or backslashes.
    qhelpfile = re.compile('<insertkeywords>').sub(insertkeywords,qhelpfile)
    qhelpfile = re.compile('<inserttoc>').sub(inserttoc,qhelpfile)
    qhelpfile = re.compile('<insertfiles>').sub(insertfiles,qhelpfile)
    qfilename = folder + os.sep + "freecad.qhp"
    f = open(qfilename,'wb')
    f.write(qhelpfile)
    f.close()
    if VERBOSE: print ("Done writing qhp file.")
    return qfilename

def get(page):
    """Download a single page, save it locally and return the pages it links to.

    The page is fetched, cleaned, has its page and image links rewritten
    for offline use and is then written to the temp folder.

    Returns the list of wiki page names linked from the page, or an empty
    list if the page could not be downloaded (nothing is saved then).
    """
    html = fetchpage(page)
    if html is None:
        # fetchpage has already reported the failure; skip this page
        # instead of aborting the whole crawl
        return []
    html = cleanhtml(html)
    # getlinks must run before cleanlinks: it also records in NORETRIEVE
    # which of the links have to stay online
    pages = getlinks(html)
    html = cleanlinks(html,pages)
    html = cleanimagelinks(html)
    output(html,page)
    return pages

def cleanhtml(html):
    """Clean the given html code from scripts, navigation and other page chrome.

    Keeps only what lies between the "column-content" and "column-one"
    divs, then strips comments, scripts, IE conditionals, navigation
    blocks and category links, and retitles the "Retrieved from" footer.
    The translation links are removed as well unless GETTRANSLATIONS is set.

    Returns the cleaned html.
    """
    # (pattern, replacement) pairs, applied in this order
    substitutions = [
        (r'(.*)<div[^>]+column-content+[^>]+>', ''),    # stripping before content
        (r'<div[^>]+column-one+[^>]+>.*', ''),          # stripping after content
        (r'<!--[^>]+-->', ''),                          # removing comment tags
        (r'<script[^>]*>.*?</script>', ''),             # removing script tags
        (r'<!--\[if[^>]*>.*?endif\]-->', ''),           # removing IE tags
        (r'<div id="jump-to-nav"[^>]*>.*?</div>', ''),  # removing nav div
        (r'<h3 id="siteSub"[^>]*>.*?</h3>', ''),        # removing print subtitle
        (r'Retrieved from', 'Online version:'),         # changing online title
        # [^>]* rather than [^>]: the div has to be removed as well when it
        # carries further attributes after its id
        (r'<div id="mw-normal-catlinks[^>]*>.*?</div>', ''),  # removing catlinks
        (r'<div class="NavHead.*?</div>', ''),          # removing nav stuff
        (r'<div class="NavContent.*?</div>', ''),       # removing nav stuff
        (r'<div class="NavEnd.*?</div>', ''),           # removing nav stuff
    ]
    if not GETTRANSLATIONS:
        substitutions.append((r'<div class="languages.*?</div>', ''))  # removing translations links

    # '.' does not match a line break, so line breaks are replaced by a
    # placeholder while the patterns are applied
    html = html.replace('\n','Wlinebreak')
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    return html.replace('Wlinebreak','\n') # restoring original linebreaks
    
def getlinks(html):
    """Return the list of wiki pages linked from the given html.

    Only links of the form ...php?title=<page> are considered and a
    '#anchor' suffix is dropped from the page name. Page names containing
    ':', ';' or '&' - and, unless GETTRANSLATIONS is set, names containing
    '/' - are also recorded in NORETRIEVE so that they stay online.

    The returned list keeps the order of the links and may contain
    duplicates.
    """
    pages = []
    for link in re.findall(r'<a[^>]*>.*?</a>',html):
        found = re.findall(r'href=.*?php\?title=(.*?)"',link)
        if not found:
            continue
        name = found[0].split('#')[0]
        keeponline = (":" in name) or (";" in name) or ("&" in name)
        if ("/" in name) and not GETTRANSLATIONS:
            keeponline = True
        # each name is recorded once only: NORETRIEVE is searched for every
        # link of every page and must not grow with duplicates
        if keeponline and name not in NORETRIEVE:
            NORETRIEVE.append(name)
        pages.append(name)
    return pages

def getimagelinks(html):
    """Return the src attribute of every <img> tag found in the given html."""
    imagesource = re.compile('<img.*?src="(.*?)"')
    return imagesource.findall(html)

def cleanlinks(html, pages=None):
    """Rewrite the page links found in html for offline use.

    pages is the list of page names to process; when it is empty or
    omitted, it is obtained from getlinks(html). A link to a page listed
    in NORETRIEVE is pointed at the online wiki, any other link at the
    local file '<page>.html' (with '/' replaced by '-').

    Returns the modified html.
    """
    if not pages: pages = getlinks(html)
    for page in pages:
        if not page:
            # an empty name would match unrelated links
            continue
        if page in NORETRIEVE:
            target = 'href="' + URL + wikiindex + page + '"'
        else:
            target = 'href="' + page.replace("/","-") + '.html"'
        # The page name must be the complete last component of the link
        # (preceded by '=' or '/'), so that cleaning "Line" leaves a link
        # to "Draft_Line" alone. The name is escaped because it may hold
        # regex metacharacters.
        pattern = re.compile('href="[^"]*[=/]' + re.escape(page) + '"')
        # a function is used as replacement so that the target is inserted
        # literally, without backslash processing
        html = pattern.sub(lambda match, target=target: target, html)
    return html

def cleanimagelinks(html,links=None):
    """Point the image links of html at local files and download the images.

    links is the list of image links to process; when it is empty or
    omitted, it is obtained from getimagelinks(html). A link containing
    a '/' is replaced in html by what follows its last '/'. Every link
    is handed to fetchimage.

    Returns the modified html.
    """
    if not links:
        links = getimagelinks(html)
    if not links:
        return html
    for source in links:
        basename = re.findall('.*/(.*)',source)
        if basename:
            html = html.replace(source,basename[0])
        fetchimage(source)
    return html

def fetchpage(page):
    """Retrieve the given page from the wiki.

    The download is attempted up to MAXFAIL times.
    Returns the html of the page, or None (after printing an error) if
    every attempt failed.
    """
    failcount = 0
    while failcount < MAXFAIL:
        try:
            return urlopen(URL + wikiindex + page).read()
        except IOError:
            # IOError covers HTTPError as well as the connection-level
            # errors raised by urlopen, so all of them are retried
            failcount += 1
    print ('Error: unable to fetch page ' + page)
    return None

def fetchimage(imagelink):
    """Retrieve the given image from the wiki and save it in the temp folder.

    The image is stored under the part of imagelink that follows its last
    '/' (or under imagelink itself if it contains no '/'). An image whose
    file name is already listed in processed is not downloaded again.
    The download is attempted up to MAXFAIL times; an error is printed
    if every attempt failed.
    """
    tail = re.findall('.*/(.*)',imagelink)
    filename = tail[0] if tail else imagelink
    if not filename or filename in processed:
        return
    failcount = 0
    while failcount < MAXFAIL:
        try:
            if VERBOSE: print ("Fetching " + filename)
            if re.match('https?://',imagelink):
                address = imagelink                 # already a complete url
            else:
                address = webroot(URL) + imagelink  # path relative to the server root
            data = urlopen(address).read()
            with open(local(filename,image=True),'wb') as imagefile:
                imagefile.write(data)
            processed.append(filename)
            return
        except Exception:
            # best effort: any failure counts as one failed attempt, while
            # KeyboardInterrupt is left free to stop the script
            failcount += 1
    print ('Error: unable to fetch file ' + filename)

def local(page,image=False):
    """Return the path inside the temp folder for the given page or image.

    Pages get a '.html' suffix appended; images are used as they are.
    """
    suffix = '' if image else '.html'
    return TMPFOLDER + os.sep + page + suffix

def exists(page,image=False):
    """Tell whether the given page or image has already been saved locally."""
    return os.path.exists(local(page,image))

def webroot(url):
    """Return the scheme and host part of url, without a trailing slash.

    Both http and https urls are handled, with or without a path:
    'http://host/some/path' and 'http://host' both give 'http://host'.

    Raises ValueError if url holds no http(s) address.
    """
    found = re.search(r'https?://[^/]+',url)
    if found is None:
        raise ValueError('not an http(s) url: ' + url)
    return found.group(0)

def output(html,page):
    """Wrap the raw html code in a complete html document and save it.

    The document is titled with the page name and links to wiki.css.
    It is written to the local path of the page, with every '/' of the
    page name replaced by '-'.
    """
    head = "".join([
        "<html><head>",
        "<title>",
        page,
        "</title>",
        "<link type='text/css' href='wiki.css' rel='stylesheet'>",
        "</head><body>",
    ])
    document = head + html + "</body></html>"
    target = open(local(page.replace("/","-")),'wb')
    target.write(document)
    target.close()

def main(arg):
    """Parse the command line arguments in arg and run the crawl.

    arg is the argument list without the program name (sys.argv[1:] is
    used when it is None). See the usage text for the supported options.
    An optional first positional argument replaces the default wiki url.

    Returns the result of crawl() (0 on success, 1 on failure). Exits
    with status 2 on invalid options, and after printing the usage text
    for -h/--help.
    """
    global QHELPCOMPILER,QCOLLECTIOMGENERATOR,OUTPUTPATH,PDFOUTPUT,PDFCONVERTOR,TMPFOLDER
    global VERBOSE,REMOVE,URL
    if arg is None:
        arg = sys.argv[1:]
    try:
        opts, args = getopt.getopt(arg, "hvp:t:c:g:o:", ["help", "pdf=", "noremove", "tempfolder=", "helpcompiler-exe=", "out-path=", "helpgenerator-exe="])
    except getopt.GetoptError:
        # print help information and exit:
        sys.stderr.write(usage)
        sys.exit(2)

    # checking on the options
    for o, a in opts:
        if o == "-v":
            VERBOSE = True
        if o == "--noremove":
            REMOVE = False
        if o in ("-p","--pdf"):
            # NOTE: getopt has no optional option arguments, so a value
            # must always follow -p/--pdf
            PDFOUTPUT = True
            if a in ['pisa','htmldoc']:
                print ("using pdf converter: " + a)
                PDFCONVERTOR = a
        if o in ("-t","--tempfolder"):
            print ("using tempfolder: " + a)
            TMPFOLDER = a
        if o in ("-h", "--help"):
            sys.stderr.write(usage)
            sys.exit()
        if o in ("-c", "--helpcompiler-exe"):
            QHELPCOMPILER = a
            print ('Using: ' + QHELPCOMPILER)
        if o in ("-g", "--helpgenerator-exe"):
            QCOLLECTIOMGENERATOR = a
        if o in ("-o", "--out-path"):
            print ("Using output path: " + a)
            OUTPUTPATH = a

    # NOTE: the index page announced in the usage text is not honoured
    # here, because buildtoc() and joinpdf() take INDEX as a default
    # argument that is fixed when they are defined.
    if args:
        URL = args[0]

    # the crawl runs exactly once, after all options have been applied
    return crawl(URL)

if __name__ == "__main__":
    # The script is deliberately disabled: it only tells the user where
    # the replacement scripts are.
    # main(sys.argv[1:])
    print ("Warning! This script is obsolete. Use the scripts in the offlinedocs folder...")
Tech Fingerprint

Alerts (44)

'def' Ensure functions have docstrings for documentation
139 152 221 243 253 259 285 292 348 377 437 447 490 494
'global' Avoid global variables; use function parameters or class attributes for better scope management
140
Complexity hotspot; lines 164 to 166 (total complexity: 3)
164 165 166
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
168 176 276
'open(' Use 'with open()' to ensure Files are properly closed
184 262 297 342 388 431
Complexity hotspot; lines 198 to 199 (total complexity: 3)
198 199
Complexity hotspot; lines 214 to 216 (total complexity: 3)
214 215 216
Complexity hotspot; lines 228 to 230 (total complexity: 3)
228 229 230
Complexity hotspot; lines 496 to 498 (total complexity: 3)
496 497 498
Complexity hotspot; lines 507 to 509 (total complexity: 3)
507 508 509
Complexity hotspot; lines 531 to 533 (total complexity: 3)
531 532 533