PageRenderTime 68ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/InfoSlicer-15/infoslicer/processing/Article_Builder.py

#
Python | 242 lines | 218 code | 10 blank | 14 comment | 24 complexity | 03004970179253fd183aedf1222e4c8c MD5 | raw file
Possible License(s): GPL-2.0
  1. # Copyright (C) IBM Corporation 2008
  2. from BeautifulSoup import Tag
  3. from NewtifulSoup import NewtifulStoneSoup as BeautifulStoneSoup
  4. from Article_Data import *
  5. import re
  6. import os
  7. import logging
  8. logger = logging.getLogger('infoslicer')
  9. """
  10. Created by Christopher Leonard.
  11. ID descriptions:
  12. 0 - picture
  13. 1 - heading
  14. > 1 - anything
  15. This class converts between DITA and article_data representation of articles. Badly in need of refactoring!
  16. """
  17. def get_article_from_dita(image_path, dita):
  18. """
  19. This method takes an article in DITA format as input, parses the DITA, and outputs the corresponding article_data object
  20. """
  21. has_shortdesc = False
  22. input = BeautifulStoneSoup(dita)
  23. article_id = input.resourceid['id']
  24. current_section_id = ""
  25. current_p_id = ""
  26. sentence_data_list = []
  27. paragraph_data_list = []
  28. section_data_list = []
  29. if input.find("shortdesc") != None:
  30. paragraph_data=[]
  31. for ph in input.shortdesc.findAll("ph"):
  32. id = ph['id']
  33. source_sentence_id = id
  34. source_paragraph_id = "shortdesc"
  35. source_section_id = "shortdesc"
  36. source_article_id = article_id
  37. text = ph.renderContents().replace("\n", "").replace(" ", "").strip() + " "
  38. if text[0:5] == "Satur":
  39. logger.debug(unicode(text))
  40. sentence_data = Sentence_Data(id, source_article_id, source_section_id, source_paragraph_id, source_sentence_id, text)
  41. sentence_data_list.append(sentence_data)
  42. paragraph_data.append(Paragraph_Data("shortdesc", article_id, "shortdesc", "shortdesc", sentence_data_list))
  43. section_data = Section_Data("shortdesc", article_id, "shortdesc", paragraph_data)
  44. section_data_list.append(section_data)
  45. sentence_data_list = []
  46. input.shortdesc.extract()
  47. has_shortdesc = True
  48. taglist = input.findAll(re.compile("refbody|section|p|ph|image"))
  49. for i in xrange(len(taglist)):
  50. tag = taglist[len(taglist) - i - 1]
  51. if tag.name == "ph":
  52. id = tag['id']
  53. source_sentence_id = id
  54. source_paragraph_id = current_p_id
  55. source_section_id = current_section_id
  56. source_article_id = article_id
  57. text = tag.renderContents().replace("\n", "").replace(" ", "").strip() + " "
  58. sentence_data = Sentence_Data(id, source_article_id, source_section_id, source_paragraph_id, source_sentence_id, text)
  59. sentence_data_list.insert(0, sentence_data)
  60. elif tag.name == "p":
  61. if not tag.has_key("id"):
  62. id = -1
  63. else:
  64. id = tag['id']
  65. source_paragraph_id = id
  66. source_section_id = current_section_id
  67. source_article_id = article_id
  68. paragraph_data = Paragraph_Data(id, source_article_id, source_section_id, source_paragraph_id, sentence_data_list)
  69. paragraph_data_list.insert(0, paragraph_data)
  70. sentence_data_list = []
  71. current_p_id = id
  72. elif tag.name == "refbody" :
  73. if tag.findParent("reference").has_key("id"):
  74. id = "r" + tag.findParent("reference")['id']
  75. else:
  76. id = "r90000"
  77. source_section_id = id
  78. source_article_id = article_id
  79. section_data = Section_Data(id, source_article_id, source_section_id, paragraph_data_list)
  80. if has_shortdesc:
  81. section_data_list.insert(1,section_data)
  82. else:
  83. section_data_list.insert(0,section_data)
  84. if tag.findChild("title", recursive=False) != None:
  85. heading = tag.findChild('title').renderContents().replace("\n", "").replace(" ", "").strip()
  86. sen = Sentence_Data(1, source_article_id, source_section_id, 1, 1, heading)
  87. par = Paragraph_Data(1, source_article_id, source_section_id, 1, [sen])
  88. headingdata = Section_Data(1, source_article_id, source_section_id, [par])
  89. if has_shortdesc:
  90. section_data_list.insert(1,headingdata)
  91. else:
  92. section_data_list.insert(0,headingdata)
  93. paragraph_data_list = []
  94. current_section_id = tag.name[0] + id
  95. elif tag.name == "section":
  96. id = "s" + tag['id']
  97. source_section_id = id
  98. source_article_id = article_id
  99. section_data = Section_Data(id, source_article_id, source_section_id, paragraph_data_list)
  100. if has_shortdesc:
  101. section_data_list.insert(1,section_data)
  102. else:
  103. section_data_list.insert(0,section_data)
  104. if tag.findChild("title", recursive=False) != None:
  105. heading = tag.findChild('title').renderContents().replace("\n", "").replace(" ", "").strip()
  106. sen = Sentence_Data(1, source_article_id, source_section_id, 1, 1, heading)
  107. par = Paragraph_Data(1, source_article_id, source_section_id, 1, [sen])
  108. headingdata = Section_Data(1, source_article_id, source_section_id, [par])
  109. if has_shortdesc:
  110. section_data_list.insert(1,headingdata)
  111. else:
  112. section_data_list.insert(0,headingdata)
  113. paragraph_data_list = []
  114. current_section_id = id
  115. elif tag.name == "image":
  116. if tag.parent.name == "p":
  117. source_article_id = article_id
  118. text = image_path + '/' + tag['href']
  119. if not os.path.exists(text):
  120. logger.info('cannot find image %s' % text)
  121. else:
  122. picture_data = Picture_Data(source_article_id, text,
  123. tag['orig_href'])
  124. sentence_data_list.insert(0, picture_data)
  125. article_title = input.find("title").renderContents().replace("\n", "").strip()
  126. image_list = []
  127. imglist_tag = input.find(True, attrs={"id" : "imagelist"})
  128. if imglist_tag != None:
  129. for img in imglist_tag.findAll("image"):
  130. caption = img.findChild("alt")
  131. if caption != None:
  132. caption = caption.renderContents().replace("\n", "").strip()
  133. else:
  134. caption = ""
  135. if not os.path.exists(os.path.join(image_path, img['href'])):
  136. logger.info('cannot find image %s' % img['href'])
  137. else:
  138. image_list.append((img['href'], caption, img['orig_href']))
  139. data = Article_Data(article_id, article_id, article_title, "theme", section_data_list, image_list)
  140. return data
  141. def get_dita_from_article(image_path, article):
  142. """
  143. This method takes as input an instance of the Article class.
  144. It calls the getData method of the article class to get the article_data representation of the article.
  145. It then constructs the corresponding DITA representation of the article.
  146. """
  147. article_data = article.getData()
  148. output = BeautifulStoneSoup("<?xml version='1.0' encoding='utf-8'?><!DOCTYPE reference PUBLIC \"-//IBM//DTD DITA IBM Reference//EN\" \"ibm-reference.dtd\"><reference><title>%s</title><prolog></prolog></reference>" % article_data.article_title)
  149. current_ref = output.reference
  150. current_title = None
  151. for section in article_data.sections_data:
  152. #headings check
  153. if len(section.paragraphs_data) == 1 and len(section.paragraphs_data[0].sentences_data) == 1 and section.paragraphs_data[0].sentences_data[0].id == 1:
  154. paragraph = section.paragraphs_data[0]
  155. current_title = paragraph.sentences_data[0].text
  156. elif str(section.id).startswith("r"):
  157. reference_tag = _tag_generator(output, "reference", attrs=[("id", section.id.replace("r", ""))])
  158. if current_title != None:
  159. reference_tag.append(_tag_generator(output, "title", contents=current_title))
  160. current_title = None
  161. reference_tag.append(_tag_generator(output, "refbody"))
  162. for paragraph in section.paragraphs_data:
  163. if paragraph.id == "shortdesc":
  164. paragraph_tag = _tag_generator(output, "shortdesc")
  165. else:
  166. paragraph_tag = _tag_generator(output, "p", attrs=[("id", str(paragraph.id))])
  167. for sentence in paragraph.sentences_data:
  168. ph_tag = _tag_generator(output, "ph", attrs=[("id", str(sentence.id))], contents = sentence.text)
  169. paragraph_tag.append(ph_tag)
  170. reference_tag.refbody.append(paragraph_tag)
  171. output.reference.append(reference_tag)
  172. current_ref = reference_tag.refbody
  173. else:
  174. if section.id == "shortdesc":
  175. section_tag = _tag_generator(output, "section", attrs=[("id", "shortdesc")])
  176. else:
  177. section_tag = _tag_generator(output, "section", attrs=[("id", str(section.id).replace("s", ""))])
  178. if current_title != None:
  179. section_tag.append(_tag_generator(output, "title", contents=current_title))
  180. current_title = None
  181. for paragraph in section.paragraphs_data:
  182. paragraph_tag = _tag_generator(output, "p", attrs=[("id", str(paragraph.id))])
  183. for sentence in paragraph.sentences_data:
  184. if sentence.type == "sentence":
  185. ph_tag = _tag_generator(output, "ph", attrs=[("id", str(sentence.id))], contents = sentence.text)
  186. paragraph_tag.append(ph_tag)
  187. elif sentence.type == "picture":
  188. # switch image to relative path
  189. text = sentence.text.replace(image_path, '') \
  190. .lstrip('/')
  191. image_tag = _tag_generator(output,
  192. "image", attrs=[("href", text),
  193. ('orig_href', sentence.orig)])
  194. paragraph_tag.append(image_tag)
  195. else:
  196. logger.ebiug(sentence.type)
  197. section_tag.append(paragraph_tag)
  198. current_ref.append(section_tag)
  199. if current_title != None:
  200. current_ref.append('<section id="56756757"><p id="6875534"><ph id="65657657">%s</ph></p></section>' % current_title)
  201. current_title = None
  202. if article_data.image_list != []:
  203. for unnecessary_tag in output.findAll(True, attrs={"id" : "imagelist"}):
  204. unnecessary_tag.extract()
  205. image_list = _tag_generator(output, "reference", [("id", "imagelist")])
  206. output.reference.append(image_list)
  207. image_list_body = _tag_generator(output, "refbody")
  208. image_list.append(image_list_body)
  209. for image in article_data.image_list:
  210. image_tag = _tag_generator(output, "image", [("href", image[0]), ("orig_href", image[2])], "<alt>" + image[-1] + "</alt>")
  211. image_list_body.append(image_tag)
  212. dita = output.prettify()
  213. return dita
  214. def _tag_generator(soup, name, attrs=[], contents=None):
  215. if attrs != []:
  216. new_tag = Tag(soup, name, attrs)
  217. else:
  218. new_tag = Tag(soup, name)
  219. if contents != None:
  220. new_tag.insert(0, contents)
  221. return new_tag