/InfoSlicer-15/infoslicer/processing/Article_Builder.py
Python | 242 lines | 218 code | 10 blank | 14 comment | 24 complexity | 03004970179253fd183aedf1222e4c8c MD5 | raw file
Possible License(s): GPL-2.0
- # Copyright (C) IBM Corporation 2008
-
- from BeautifulSoup import Tag
- from NewtifulSoup import NewtifulStoneSoup as BeautifulStoneSoup
- from Article_Data import *
- import re
- import os
- import logging
-
- logger = logging.getLogger('infoslicer')
-
- """
- Created by Christopher Leonard.
-
- ID descriptions:
- 0 - picture
- 1 - heading
- > 1 - anything
-
- This module converts between the DITA and article_data representations of articles. Badly in need of refactoring!
- """
def _clean_dita_text(tag):
    """Return the rendered contents of *tag* with newlines removed and edges stripped."""
    # NOTE(review): confirm the second replace() target -- a plain space here
    # would also strip every word separator from the text.
    return tag.renderContents().replace("\n", "").replace(" ", "").strip()


def _insert_section_with_heading(section_data_list, insert_at, tag,
                                 section_id, article_id, paragraphs):
    """Insert a Section_Data for *tag* and, if it has a direct <title>, a heading section before it."""
    section_data_list.insert(
        insert_at,
        Section_Data(section_id, article_id, section_id, paragraphs))
    if tag.findChild("title", recursive=False) is not None:
        heading = _clean_dita_text(tag.findChild('title'))
        # Headings are encoded as a section whose single paragraph holds a
        # single sentence, all with id 1 (see "ID descriptions" at module top).
        sen = Sentence_Data(1, article_id, section_id, 1, 1, heading)
        par = Paragraph_Data(1, article_id, section_id, 1, [sen])
        # Inserting at the same index puts the heading just before its section.
        section_data_list.insert(
            insert_at, Section_Data(1, article_id, section_id, [par]))


def get_article_from_dita(image_path, dita):
    """
    Parse an article in DITA format and return the corresponding Article_Data.

    image_path -- directory that the image hrefs in the DITA are relative to;
                  images that do not exist under it are logged and skipped.
    dita       -- the DITA markup to parse (handed to BeautifulStoneSoup).

    The result is built as Article_Data(article_id, article_id, title,
    "theme", sections, image_list), where image_list holds
    (href, caption, orig_href) tuples taken from the element whose id is
    "imagelist".

    Raises KeyError (from the soup) when a required attribute such as a
    <ph> id or an image href/orig_href is missing.
    """
    soup = BeautifulStoneSoup(dita)
    article_id = soup.resourceid['id']
    section_data_list = []

    # The short description, when present, always becomes the first section
    # and is removed from the tree so the main walk below does not see it.
    has_shortdesc = soup.find("shortdesc") is not None
    if has_shortdesc:
        shortdesc_sentences = []
        for ph in soup.shortdesc.findAll("ph"):
            sentence_id = ph['id']
            text = _clean_dita_text(ph) + " "
            if text[0:5] == "Satur":
                # Debugging leftover kept as a plain debug log; the former
                # unicode() conversion could raise on non-ASCII byte strings.
                logger.debug(text)
            shortdesc_sentences.append(
                Sentence_Data(sentence_id, article_id, "shortdesc",
                              "shortdesc", sentence_id, text))
        shortdesc_paragraphs = [
            Paragraph_Data("shortdesc", article_id, "shortdesc", "shortdesc",
                           shortdesc_sentences)]
        section_data_list.append(
            Section_Data("shortdesc", article_id, "shortdesc",
                         shortdesc_paragraphs))
        soup.shortdesc.extract()

    # New sections go after the shortdesc section if there is one.
    insert_at = 1 if has_shortdesc else 0

    current_section_id = ""
    current_p_id = ""
    sentence_data_list = []
    paragraph_data_list = []

    # The tags are walked back to front: sentences/pictures are collected
    # first, then claimed by the <p> that follows them in this reverse walk,
    # and paragraphs are in turn claimed by the next refbody/section.
    # Inserting at the front of each list restores the forward order.
    # NOTE(review): because of the reverse walk, current_p_id and
    # current_section_id hold the ids of the previously visited paragraph /
    # section when a sentence is created -- confirm this is intended.
    taglist = soup.findAll(re.compile("refbody|section|p|ph|image"))
    for tag in reversed(taglist):
        name = tag.name
        if name == "ph":
            sentence_id = tag['id']
            text = _clean_dita_text(tag) + " "
            sentence_data_list.insert(
                0,
                Sentence_Data(sentence_id, article_id, current_section_id,
                              current_p_id, sentence_id, text))
        elif name == "p":
            # Paragraphs without an id get the placeholder id -1.
            paragraph_id = tag['id'] if tag.has_key("id") else -1
            paragraph_data_list.insert(
                0,
                Paragraph_Data(paragraph_id, article_id, current_section_id,
                               paragraph_id, sentence_data_list))
            # Rebind (do not clear): the paragraph keeps the old list object.
            sentence_data_list = []
            current_p_id = paragraph_id
        elif name == "refbody":
            parent_reference = tag.findParent("reference")
            if parent_reference.has_key("id"):
                section_id = "r" + parent_reference['id']
            else:
                section_id = "r90000"
            _insert_section_with_heading(section_data_list, insert_at, tag,
                                         section_id, article_id,
                                         paragraph_data_list)
            paragraph_data_list = []
            # NOTE(review): section_id already starts with "r", so this
            # yields a double "rr" prefix; kept as in the original.
            current_section_id = tag.name[0] + section_id
        elif name == "section":
            section_id = "s" + tag['id']
            _insert_section_with_heading(section_data_list, insert_at, tag,
                                         section_id, article_id,
                                         paragraph_data_list)
            paragraph_data_list = []
            current_section_id = section_id
        elif name == "image":
            # Only inline images (direct children of a paragraph) become
            # picture entries; the image list is handled separately below.
            if tag.parent.name == "p":
                picture_path = image_path + '/' + tag['href']
                if not os.path.exists(picture_path):
                    logger.info('cannot find image %s', picture_path)
                else:
                    sentence_data_list.insert(
                        0,
                        Picture_Data(article_id, picture_path,
                                     tag['orig_href']))

    article_title = soup.find("title").renderContents().replace("\n", "").strip()

    image_list = []
    imglist_tag = soup.find(True, attrs={"id": "imagelist"})
    if imglist_tag is not None:
        for img in imglist_tag.findAll("image"):
            alt_tag = img.findChild("alt")
            if alt_tag is not None:
                caption = alt_tag.renderContents().replace("\n", "").strip()
            else:
                caption = ""
            if not os.path.exists(os.path.join(image_path, img['href'])):
                logger.info('cannot find image %s', img['href'])
            else:
                image_list.append((img['href'], caption, img['orig_href']))

    return Article_Data(article_id, article_id, article_title, "theme",
                        section_data_list, image_list)
-
-
def get_dita_from_article(image_path, article):
    """
    Build the DITA representation of an article and return it as a string.

    image_path -- image directory prefix; it is removed from picture paths so
                  that the emitted image hrefs are relative.
    article    -- an object whose getData() returns the article_data
                  representation (article_title, sections_data, image_list).

    Sections are emitted in order. A section consisting of exactly one
    paragraph with exactly one sentence whose id is 1 is treated as a heading:
    it is not emitted itself but becomes the <title> of the next reference or
    section. Sections whose id starts with "r" become nested <reference>
    elements; every other section becomes a <section> inside the current
    reference body. Returns the prettified document.
    """
    article_data = article.getData()
    output = BeautifulStoneSoup("<?xml version='1.0' encoding='utf-8'?><!DOCTYPE reference PUBLIC \"-//IBM//DTD DITA IBM Reference//EN\" \"ibm-reference.dtd\"><reference><title>%s</title><prolog></prolog></reference>" % article_data.article_title)
    current_ref = output.reference
    current_title = None

    for section in article_data.sections_data:
        paragraphs = section.paragraphs_data
        # headings check
        if (len(paragraphs) == 1
                and len(paragraphs[0].sentences_data) == 1
                and paragraphs[0].sentences_data[0].id == 1):
            # Remember the heading text; it is attached to the next section.
            current_title = paragraphs[0].sentences_data[0].text
        elif str(section.id).startswith("r"):
            reference_tag = _tag_generator(output, "reference", attrs=[("id", section.id.replace("r", ""))])
            if current_title is not None:
                reference_tag.append(_tag_generator(output, "title", contents=current_title))
                current_title = None
            reference_tag.append(_tag_generator(output, "refbody"))
            for paragraph in paragraphs:
                if paragraph.id == "shortdesc":
                    paragraph_tag = _tag_generator(output, "shortdesc")
                else:
                    paragraph_tag = _tag_generator(output, "p", attrs=[("id", str(paragraph.id))])
                for sentence in paragraph.sentences_data:
                    ph_tag = _tag_generator(output, "ph", attrs=[("id", str(sentence.id))], contents=sentence.text)
                    paragraph_tag.append(ph_tag)
                reference_tag.refbody.append(paragraph_tag)
            output.reference.append(reference_tag)
            # Following plain sections are appended to this reference's body.
            current_ref = reference_tag.refbody
        else:
            if section.id == "shortdesc":
                section_tag = _tag_generator(output, "section", attrs=[("id", "shortdesc")])
            else:
                section_tag = _tag_generator(output, "section", attrs=[("id", str(section.id).replace("s", ""))])
            if current_title is not None:
                section_tag.append(_tag_generator(output, "title", contents=current_title))
                current_title = None
            for paragraph in paragraphs:
                paragraph_tag = _tag_generator(output, "p", attrs=[("id", str(paragraph.id))])
                for sentence in paragraph.sentences_data:
                    if sentence.type == "sentence":
                        ph_tag = _tag_generator(output, "ph", attrs=[("id", str(sentence.id))], contents=sentence.text)
                        paragraph_tag.append(ph_tag)
                    elif sentence.type == "picture":
                        # switch image to relative path
                        text = sentence.text.replace(image_path, '') \
                                            .lstrip('/')
                        image_tag = _tag_generator(output,
                                "image", attrs=[("href", text),
                                                ('orig_href', sentence.orig)])
                        paragraph_tag.append(image_tag)
                    else:
                        # Unknown sentence kinds are skipped; the previous
                        # misspelled logger call raised AttributeError here.
                        logger.debug(sentence.type)

                section_tag.append(paragraph_tag)
            current_ref.append(section_tag)

    # A heading that was never followed by a section is emitted as a
    # stand-alone section (with fixed ids) so its text is not lost.
    if current_title is not None:
        current_ref.append('<section id="56756757"><p id="6875534"><ph id="65657657">%s</ph></p></section>' % current_title)
        current_title = None

    if article_data.image_list != []:
        # Drop any existing image list before writing the fresh one.
        for unnecessary_tag in output.findAll(True, attrs={"id": "imagelist"}):
            unnecessary_tag.extract()
        image_list = _tag_generator(output, "reference", [("id", "imagelist")])
        output.reference.append(image_list)
        image_list_body = _tag_generator(output, "refbody")
        image_list.append(image_list_body)
        for image in article_data.image_list:
            # NOTE(review): image[-1] is the last tuple element; the tuples
            # built by get_article_from_dita are (href, caption, orig_href),
            # so for those this writes orig_href rather than the caption into
            # <alt> -- confirm whether image[1] was intended.
            image_tag = _tag_generator(output, "image", [("href", image[0]), ("orig_href", image[2])], "<alt>" + image[-1] + "</alt>")
            image_list_body.append(image_tag)

    return output.prettify()
-
def _tag_generator(soup, name, attrs=None, contents=None):
    """
    Create a new Tag called *name* that belongs to *soup*.

    attrs    -- optional list of (attribute, value) pairs; when omitted, None
                or an empty list, the tag is created without attributes.
    contents -- optional content inserted as the tag's first child.

    Returns the new, not yet attached, Tag.
    """
    # None replaces the former mutable [] default; an explicit empty list is
    # still treated the same way as "no attributes".
    if attrs is not None and attrs != []:
        new_tag = Tag(soup, name, attrs)
    else:
        new_tag = Tag(soup, name)
    if contents is not None:
        new_tag.insert(0, contents)
    return new_tag