/jornada/json/parsing.py

https://bitbucket.org/lsjcp/jornada-api · Python · 250 lines · 240 code · 4 blank · 6 comment · 0 complexity · 88180314f313c591a9d5367b4635aa0e MD5 · raw file

  1. # -*- coding: utf-8 -*-
  2. '''
  3. Created on 13/05/2012
  4. @author: lcammx
  5. '''
  6. import re
  7. from htmlgen import HTMLgen
  8. class parsing(object):
  9. def __init__(self):
  10. '''constructor'''
  11. def dumpJsonItems(self, jItems):
  12. '''method'''
  13. def dumpErrorLog(self, error):
  14. '''method'''
  15. def getText(self, nodelist):
  16. rc = []
  17. try:
  18. mem = ""
  19. for node in nodelist:
  20. if node.nodeType == node.TEXT_NODE:
  21. val = node.nodeValue
  22. if val != None:
  23. val = val.strip()
  24. if val != "" and not val.isspace():
  25. if len(val)<2:
  26. mem = val
  27. else:
  28. rc.append(mem+val)
  29. mem = ""
  30. except:
  31. rci = nodelist.nodeValue
  32. return rci
  33. str = ''.join(rc)
  34. return str.strip()
  35. def getRecursiveText(self, nodelist):
  36. rc = ""
  37. if hasattr(nodelist, 'hasChildNodes'):
  38. if nodelist.hasChildNodes():
  39. for node in nodelist.childNodes:
  40. if node.nodeName == "p" and not node.hasAttributes():
  41. rc+= "\r"
  42. val = self.getRecursiveText(node)
  43. if node.nodeName == "p" and not node.hasAttributes():
  44. rc+= "\r"
  45. if val != None:
  46. val = val.strip()
  47. if val != "" and not val.isspace():
  48. rc+=val + " "
  49. else:
  50. if nodelist.nodeType == nodelist.TEXT_NODE:
  51. val = self.getLastText(nodelist)
  52. if val != None:
  53. val = val.strip()
  54. if val != "" and not val.isspace():
  55. rc+= val + " "
  56. else:
  57. for nodeitem in nodelist:
  58. val = self.getRecursiveText(nodeitem)
  59. if val != None:
  60. val = val.strip()
  61. if val != "" and not val.isspace():
  62. rc += val + " "
  63. return rc
  64. def getRecursiveText2(self, nodelist):
  65. rc = []
  66. if hasattr(nodelist, 'hasChildNodes'):
  67. if nodelist.hasChildNodes():
  68. for node in nodelist.childNodes:
  69. val = self.getRecursiveText(node)
  70. if val != None:
  71. val = val.strip()
  72. if val != "" and not val.isspace():
  73. rc.append(val+ " ")
  74. else:
  75. if nodelist.nodeType == nodelist.TEXT_NODE:
  76. val = self.getLastText(nodelist)
  77. if val != None:
  78. val = val.strip()
  79. if val != "" and not val.isspace():
  80. rc.append(val+ " ")
  81. else:
  82. for nodeitem in nodelist:
  83. val = self.getRecursiveText(nodeitem)
  84. if val != None:
  85. val = val.strip()
  86. if val != "" and not val.isspace():
  87. rc.append(val+ " ")
  88. return self.joinLines(rc, "\r \r", 1)
  89. def getSingleLineText(self, nodelist):
  90. r = ""
  91. if hasattr(nodelist, 'hasChildNodes'):
  92. if nodelist.hasChildNodes():
  93. for node in nodelist.childNodes:
  94. val = self.getSingleLineText(node)
  95. if val != None:
  96. val = val.strip()
  97. if val != "" and not val.isspace():
  98. r += " " + val
  99. else:
  100. if nodelist.nodeType == nodelist.TEXT_NODE:
  101. val = self.getLastText(nodelist)
  102. if val != None:
  103. val = val.strip()
  104. if val != "" and not val.isspace():
  105. r += " " + val
  106. else:
  107. for nodeitem in nodelist:
  108. val = self.getRecursiveText(nodeitem)
  109. if val != None:
  110. val = val.strip()
  111. if val != "" and not val.isspace():
  112. r += " " + val
  113. return r
  114. def getIntFromText(self, text):
  115. text = text.strip()
  116. text = re.sub("[^0-9]", "", text)
  117. return int(text)
  118. def getLastText(self, node):
  119. rc = ""
  120. try:
  121. val = node.nodeValue
  122. if val != None:
  123. val = val.strip()
  124. if val != "" and not val.isspace():
  125. rc+=val
  126. except Exception as e:
  127. rci = e.__str__()
  128. return rci
  129. return rc
  130. def getArray(self, nodelist):
  131. rc = []
  132. try:
  133. mem = ""
  134. for node in nodelist:
  135. if node.hasChildNodes():
  136. for nnode in node.childNodes:
  137. val = self.getRecursiveText(nnode)
  138. if val != None:
  139. val = val.strip()
  140. if val != "" and not val.isspace():
  141. if len(val)<2:
  142. mem = val
  143. else:
  144. rc.append(mem+val)
  145. mem = ""
  146. else:
  147. val = node.nodeValue
  148. if val != None:
  149. val = val.strip()
  150. if val != "" and not val.isspace():
  151. if len(val)<2:
  152. mem = val
  153. else:
  154. rc.append(mem+val)
  155. mem = ""
  156. except Exception as e:
  157. rci = e.__str__()
  158. return [rci]
  159. return rc
  160. def getHtmlFromParragraphs(self, content):
  161. html = HTMLgen('utf-8')
  162. text = ""
  163. if isinstance(content, list):
  164. for item in content:
  165. if hasattr(item, 'nodeName'):
  166. val = item.toxml()
  167. val = html.sanHTML(val)
  168. if val != None:
  169. val = val.strip()
  170. if val != "" and not val.isspace():
  171. text += val
  172. else:
  173. if hasattr(content, 'hasChildNodes'):
  174. if content.hasChildNodes():
  175. for child in content.childNodes:
  176. val = item.toxml()
  177. val = html.sanHTML(val)
  178. if val != None:
  179. val = val.strip()
  180. if val != "" and not val.isspace():
  181. text += val
  182. else:
  183. val = item.toxml()
  184. val = html.sanHTML(val)
  185. if val != None:
  186. val = val.strip()
  187. if val != "" and not val.isspace():
  188. text += val
  189. return text
  190. def joinLines(self, lst, breaker, minimus):
  191. txt = ""
  192. for i in range(len(lst)):
  193. txt += lst[i]
  194. if i<(len(lst)-1):
  195. if len(lst[i])>1:
  196. txt+= breaker
  197. return txt
  198. def getListItems(self, content):
  199. rlist = []
  200. if isinstance(content, list):
  201. mem = ""
  202. for item in content:
  203. itxt = ""
  204. itxt = self.getRecursiveText(item)
  205. if itxt != None:
  206. itxt = itxt.strip()
  207. if itxt != "" and not itxt.isspace():
  208. if len(itxt)>2:
  209. rlist.append(mem+itxt)
  210. mem = ""
  211. else:
  212. mem = itxt
  213. if len(mem)>0: rlist.append(mem)
  214. else :
  215. rlist.append(self.getRecursiveText(content))
  216. return rlist
  217. def appendNodeToHeuristics(self, heuristics, node, words, oddness,keywords, abstracted):
  218. try:
  219. for item in node:
  220. if item.nodeName=='p':
  221. inp = self.getRecursiveText(item)
  222. if inp != None:
  223. if inp != '':
  224. heuristics._matchTextToList(inp, words)
  225. heuristics._proccessOddness(inp, oddness)
  226. heuristics._matchTextToDict(inp, keywords)
  227. heuristics._processAbstraction(inp, abstracted)
  228. except:
  229. pass