/ftw/pdfgenerator/html2latex/subconverters/listing.py

https://github.com/4teamwork/ftw.pdfgenerator · Python · 178 lines · 168 code · 4 blank · 6 comment · 1 complexity · c1214e54281f25942b8e74eca50bd4ce MD5 · raw file

  1. from BeautifulSoup import BeautifulSoup
  2. from ftw.pdfgenerator.html2latex import subconverter
  3. from ftw.pdfgenerator.utils import html2xmlentities
  4. from xml.dom import minidom
  5. from xml.parsers.expat import ExpatError
  # LaTeX permits lists to be nested at most 4 levels deep. Lists nested
  # deeper than this are flattened into the deepest permitted level, which
  # keeps the generated LaTeX code valid.
  LIST_NESTING_LIMIT = 4
  class ListConverter(subconverter.SubConverter):
      """Convert HTML ``<ul>``, ``<ol>`` and ``<dl>`` lists to LaTeX.

      The lists are mapped to the ``itemize``, ``enumerate`` and
      ``description`` environments (see ``listing_tag_mapping``). Once
      ``LIST_NESTING_LIMIT`` environments are open, deeper lists are emitted
      without an environment of their own, so their items become part of the
      enclosing list.
      """

      pattern = r'<(ul|ol|dl)(.*)</\1>'

      # HTML list tag -> name of the LaTeX environment used for it.
      listing_tag_mapping = {
          'ul': 'itemize',
          'ol': 'enumerate',
          'dl': 'description',
      }

      def __init__(self, *args, **kwargs):
          super(ListConverter, self).__init__(*args, **kwargs)
          # Number of list environments currently open during conversion.
          self.nesting_level = 0

      def __call__(self):
          """Convert the matched HTML and hand the LaTeX to
          ``replace_and_lock``.

          The HTML is wrapped in a ``<dummy>`` root element and parsed with
          minidom. Top-level list elements are converted by
          ``convert_listing_environment``; every other top-level node is
          passed to ``self.converter.convert`` as serialized XML.
          """
          html = self.get_html()

          # minidom hates htmlentities, but loves xmlentities -.-
          html = '<dummy>%s</dummy>' % html
          html = html2xmlentities(html)

          # parse DOM
          try:
              dom = minidom.parseString(html)
          except ExpatError:
              # Not well-formed: clean up the html with BeautifulSoup and
              # parse again. A second failure propagates to the caller.
              html = str(BeautifulSoup(html))
              dom = minidom.parseString(html)

          latex = []
          for node in dom.getElementsByTagName('dummy')[0].childNodes:
              if self._tag_name(node) in self.listing_tag_mapping:
                  latex.extend(self.convert_listing_environment(node))
              else:
                  latex.append(self.converter.convert(node.toxml()))

          # Trailing empty entry, so the joined LaTeX ends with a newline.
          latex.append('')
          self.replace_and_lock('\n'.join(latex))

      def convert_listing_environment(self, node):
          """Converts a <ul>, <ol> or <dl> node to latex.

          Returns a list of LaTeX fragments. A list that has no item children
          or that would exceed ``LIST_NESTING_LIMIT`` is converted without a
          surrounding environment.
          """
          has_items = self._listing_has_items(node)

          if has_items and self.nesting_level < LIST_NESTING_LIMIT:
              self.nesting_level += 1
              try:
                  return self._convert_reduced_listing_environment(node)
              finally:
                  # Restore the counter even when the conversion raises, so
                  # the nesting level does not stay too high afterwards.
                  self.nesting_level -= 1

          return self._convert_reduced_listing_environment(
              node, environment=False)

      def _tag_name(self, node):
          """Returns the lowercased tag name of `node`, or ``None`` when
          `node` is not an element node (text, comment, ...).
          """
          if node.nodeType != minidom.Node.ELEMENT_NODE:
              return None
          return node.tagName.lower()

      def _listing_has_items(self, node):
          """Returns ``True`` when `node` has at least one direct <li>, <dt>
          or <dd> child element.
          """
          return any(self._tag_name(elm) in ('li', 'dt', 'dd')
                     for elm in node.childNodes)

      def _convert_reduced_listing_environment(self, node, environment=True):
          """Internal method should only be called
          by ``convert_listing_environment``.

          Returns ``[]`` when the list produced no LaTeX, the items wrapped
          in a begin / end pair when `environment` is true, and otherwise
          only the items.
          """
          if node.tagName.lower() in ('ol', 'ul'):
              nodes_latex = self._convert_listing_items(node)
          else:
              nodes_latex = self._convert_description_items(node)

          if not nodes_latex:
              return []

          if environment:
              begin_env, end_env = self._create_environ(node)
              return ['', begin_env, nodes_latex, end_env]

          return [nodes_latex]

      def _convert_listing_items(self, list_node):
          """Converts the children of an <ol> / <ul> node to LaTeX.

          <li> children become ``\\item`` lines (empty ones are skipped),
          directly nested lists are converted recursively and any other
          child is converted as plain content. Returns one string.
          """
          latex = []

          for elm in list_node.childNodes:
              tag = self._tag_name(elm)

              if tag == 'li':
                  content = self._get_node_content(elm)
                  if content:
                      latex.append(r'\item %s' % content.strip())

              elif tag in self.listing_tag_mapping:
                  latex.extend(self.convert_listing_environment(elm))

              else:
                  content_latex = self._get_node_content(elm)
                  if content_latex is not None:
                      latex.append(content_latex)

          return '\n'.join(latex)

      def _convert_description_items(self, list_node):
          """Converts the children of a <dl> node to LaTeX.

          Each <dd> is paired with the most recent unused <dt> and emitted
          as ``\\item[term] description``. Directly nested lists are
          converted recursively. Returns one string.
          """
          latex = []
          dt_node = None

          for elm in list_node.childNodes:
              tag = self._tag_name(elm)

              if tag == 'dt':
                  # NOTE(review): a <dt> that is not followed by a <dd>
                  # (e.g. two <dt> in a row, or a trailing <dt>) is
                  # overwritten / never emitted - confirm this is intended.
                  dt_node = elm

              elif tag == 'dd' and dt_node is not None:
                  dt_content = self._get_node_content(dt_node) or ''
                  dd_content = self._get_node_content(elm) or ''
                  latex.append(r'\item[%s] %s' % (
                      dt_content.strip(),
                      dd_content.strip()))
                  dt_node = None

              elif tag in self.listing_tag_mapping:
                  latex.extend(self.convert_listing_environment(elm))

              else:
                  # Also reached by a <dd> without a preceding <dt>: its
                  # content is emitted without an ``\item``.
                  content_latex = self._get_node_content(elm)
                  if content_latex is not None:
                      latex.append(content_latex)

          return '\n'.join(latex)

      def _get_node_content(self, elm):
          """Returns the LaTeX for the node `elm`.

          For a text node the stripped text is converted, for any other node
          the serialized children are converted. Returns ``None`` when there
          is nothing to convert.
          """
          if elm.nodeType == minidom.Node.TEXT_NODE:
              content_html = elm.toxml().strip()
          else:  # tag node
              content_html = ''.join(
                  [child.toxml() for child in elm.childNodes])

          if not content_html:
              return None

          return self.converter.convert(content_html)

      def _create_environ(self, list_):
          """Creates an environment for the node `list_`.

          Returns a ``(begin, end)`` tuple of LaTeX commands for the
          environment that ``listing_tag_mapping`` assigns to the tag.
          """
          name = list_.tagName.lower()
          env = self.listing_tag_mapping[name]
          return (r'\begin{%s}' % env,
                  r'\end{%s}' % env)