parsing.py | searchcode

/jornada/json/parsing.py

https://bitbucket.org/lsjcp/jornada-api · Python · 250 lines · 240 code · 4 blank · 6 comment · 0 complexity · 88180314f313c591a9d5367b4635aa0e MD5 · raw file

# -*- coding: utf-8 -*-
'''
Created on 13/05/2012

@author: lcammx
'''


import re
from htmlgen import HTMLgen

class parsing(object):
    
    def __init__(self):
        '''constructor'''
            
    def dumpJsonItems(self, jItems):
        '''method'''
        
    def dumpErrorLog(self, error):
        '''method'''
    
    def getText(self, nodelist):
            rc = []
            try:
                mem = ""
                for node in nodelist: 
                    if node.nodeType == node.TEXT_NODE:
                        val = node.nodeValue
                        if val != None:
                            val = val.strip()
                            if val != "" and not val.isspace(): 
                                if len(val)<2: 
                                    mem = val
                                else:                                
                                    rc.append(mem+val)
                                    mem = ""
            except:
                rci = nodelist.nodeValue
                return rci
            str = ''.join(rc)
            return str.strip() 
        
    def getRecursiveText(self, nodelist):
        rc = ""
        if hasattr(nodelist, 'hasChildNodes'):
            if nodelist.hasChildNodes():
                for node in nodelist.childNodes: 
                    if node.nodeName == "p" and not node.hasAttributes():
                        rc+= "\r"
                    val = self.getRecursiveText(node)
                    if node.nodeName == "p" and not node.hasAttributes():
                        rc+= "\r"
                    if val != None:
                        val = val.strip()
                        if val != "" and not val.isspace():                              
                            rc+=val + " "
            else:
                if nodelist.nodeType == nodelist.TEXT_NODE:
                    val = self.getLastText(nodelist)                   
                    if val != None:
                        val = val.strip()
                        if val != "" and not val.isspace():                              
                            rc+= val + " "    
        else:
            for nodeitem in nodelist:
                val = self.getRecursiveText(nodeitem)
                if val != None:
                    val = val.strip()
                    if val != "" and not val.isspace():                              
                        rc += val + " "
        return rc
        
    def getRecursiveText2(self, nodelist):
            rc = []
            if hasattr(nodelist, 'hasChildNodes'):
                if nodelist.hasChildNodes():
                    for node in nodelist.childNodes: 
                        val = self.getRecursiveText(node)
                        if val != None:
                            val = val.strip()
                            if val != "" and not val.isspace():                              
                                rc.append(val+ "  ")
                else:
                    if nodelist.nodeType == nodelist.TEXT_NODE:
                        val = self.getLastText(nodelist)                   
                        if val != None:
                            val = val.strip()
                            if val != "" and not val.isspace():                              
                                rc.append(val+ "  ")            
            else:
                for nodeitem in nodelist:
                    val = self.getRecursiveText(nodeitem)
                    if val != None:
                        val = val.strip()
                        if val != "" and not val.isspace():                              
                            rc.append(val+ "  ")
            return self.joinLines(rc, "\r \r", 1)
        
    def getSingleLineText(self, nodelist):
            r = ""
            if hasattr(nodelist, 'hasChildNodes'):
                if nodelist.hasChildNodes():
                    for node in nodelist.childNodes: 
                        val = self.getSingleLineText(node)
                        if val != None:
                            val = val.strip()
                            if val != "" and not val.isspace():                              
                                r += " " + val
                else:
                    if nodelist.nodeType == nodelist.TEXT_NODE:
                        val = self.getLastText(nodelist)                   
                        if val != None:
                            val = val.strip()
                            if val != "" and not val.isspace():                              
                                r += " " + val      
            else:
                for nodeitem in nodelist:
                    val = self.getRecursiveText(nodeitem)
                    if val != None:
                        val = val.strip()
                        if val != "" and not val.isspace():                              
                            r += " " + val
            return r
    
    def getIntFromText(self, text):
        text = text.strip()
        text = re.sub("[^0-9]", "", text)
        return int(text)
        
    def getLastText(self, node):
            rc = ""
            try:
                val = node.nodeValue
                if val != None:
                    val = val.strip()
                    if val != "" and not val.isspace(): 
                        rc+=val                    
            except Exception as e:
                rci = e.__str__()
                return rci
            return rc
    
    def getArray(self, nodelist):
            rc = []
            try:
                mem = ""
                for node in nodelist: 
                    if node.hasChildNodes():
                        for nnode in node.childNodes:
                            val = self.getRecursiveText(nnode)
                            if val != None:
                                val = val.strip()
                                if val != "" and not val.isspace(): 
                                    if len(val)<2: 
                                        mem = val
                                    else:                                
                                        rc.append(mem+val)
                                        mem = ""
                    else:
                        val = node.nodeValue
                        if val != None:
                            val = val.strip()
                            if val != "" and not val.isspace(): 
                                if len(val)<2: 
                                    mem = val
                                else:                                
                                    rc.append(mem+val)
                                    mem = ""                   
            except Exception as e:
                rci = e.__str__()
                return [rci]
            return rc

    def getHtmlFromParragraphs(self, content):
        html = HTMLgen('utf-8')
        text = ""
        if isinstance(content, list):
            for item in content:
                if hasattr(item, 'nodeName'):
                    val =  item.toxml()
                    val = html.sanHTML(val)
                    if val != None:
                        val = val.strip()
                        if val != "" and not val.isspace():   
                            text += val
        else:
            if hasattr(content, 'hasChildNodes'):
                if content.hasChildNodes():
                    for child in content.childNodes:
                        val =  item.toxml()
                        val = html.sanHTML(val)
                        if val != None:
                            val = val.strip()
                            if val != "" and not val.isspace():   
                                text += val
            else:
                val =  item.toxml()
                val = html.sanHTML(val)
                if val != None:
                    val = val.strip()
                    if val != "" and not val.isspace():   
                        text += val
        return text
            
    def joinLines(self, lst, breaker, minimus):
        txt = ""
        for i in range(len(lst)):
            txt += lst[i]
            if i<(len(lst)-1):
                if len(lst[i])>1:
                    txt+= breaker
        return txt
    
    def getListItems(self, content):
        rlist = []
        
        if isinstance(content, list):
            mem = ""
            for item in content:
                itxt = ""
                itxt = self.getRecursiveText(item)
                if itxt != None:
                    itxt = itxt.strip()
                    if itxt != "" and not itxt.isspace():
                        if len(itxt)>2:
                            rlist.append(mem+itxt)
                            mem = ""
                        else:
                            mem = itxt
            if len(mem)>0: rlist.append(mem)
        else :
            rlist.append(self.getRecursiveText(content))
        return rlist
    
    
                        
    def appendNodeToHeuristics(self, heuristics, node, words, oddness,keywords, abstracted):
        try: 
            for item in node:
                if item.nodeName=='p':
                    inp = self.getRecursiveText(item)
                    if inp != None:
                        if inp != '':
                            heuristics._matchTextToList(inp, words)
                            heuristics._proccessOddness(inp, oddness)
                            heuristics._matchTextToDict(inp, keywords)
                            heuristics._processAbstraction(inp, abstracted)
        except:
            pass
Alerts (14)

'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
38 249
'def' Ensure functions have docstrings for documentation
44 74 100 126 131 144 175 206 215 238
'isinstance(' Overuse may indicate design issues; consider polymorphism
178 218