
/pants-plugins/src/python/internal_backend/sitegen/tasks/sitegen.py

https://gitlab.com/Ivy001/pants
# coding=utf-8
# Copyright 2014 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

from __future__ import (absolute_import, division, generators, nested_scopes, print_function,
                        unicode_literals, with_statement)

import collections
import datetime
import json
import os
import re
import shutil

import pystache
from six.moves import range

from pants.base.exceptions import TaskError
from pants.task.task import Task
  16. """Static Site Generator for the Pants Build documentation site.
  17. Suggested use:
  18. cd pants
  19. ./build-support/bin/publish_docs.sh # invokes sitegen.py
  20. """


def beautiful_soup(*args, **kwargs):
  """Indirection function so we can lazy-import bs4.

  It's an expensive import that invokes re.compile a lot, so we don't want to incur that cost
  unless we must.
  """
  import bs4
  return bs4.BeautifulSoup(*args, **kwargs)


class SiteGen(Task):
  """Generate the Pants static web site."""

  @classmethod
  def register_options(cls, register):
    super(SiteGen, cls).register_options(register)
    register('--config-path', type=list, help='Path to .json file describing site structure.')

  def execute(self):
    if not self.get_options().config_path:
      raise TaskError('The config_path option must be specified, e.g., with the --config-path flag')
    for config_path in self.get_options().config_path:
      config = load_config(config_path)
      soups = load_soups(config)
      precomputed = precompute(config, soups)
      transform_soups(config, soups, precomputed)
      template = load_template(config)
      write_en_pages(config, soups, precomputed, template)
      copy_extras(config)


def load_config(json_path):
  """Load config info from a .json file and return it."""
  with open(json_path) as json_file:
    config = json.loads(json_file.read().decode('utf8'))
  # sanity-test the config:
  assert(config['tree'][0]['page'] == 'index')
  return config
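

# For reference, a minimal sketch of the kind of .json config this task reads.
# The keys ('sources', 'extras', 'template', 'outdir', 'tree', 'show_toc') are
# the ones the functions below consume; the paths and page names here are
# made up for illustration, not taken from the real site config:
#
#   {
#     "sources": {"index": "src/docs/index.html",
#                 "install": "src/docs/install.html"},
#     "extras": {"css/site.css": "src/docs/site.css"},
#     "template": "src/docs/page.mustache",
#     "outdir": "dist/docsite",
#     "tree": [{"page": "index",
#               "children": [{"heading": "Getting Started",
#                             "children": [{"page": "install"}]}]}],
#     "show_toc": {"index": false}
#   }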


def load_soups(config):
  """Generate BeautifulSoup AST for each page listed in config."""
  soups = {}
  for page, path in config['sources'].items():
    with open(path, 'rb') as orig_file:
      soups[page] = beautiful_soup(orig_file.read().decode('utf-8'))
  return soups


class Precomputed(object):
  """Info we compute (and preserve) before we mutate things."""

  def __init__(self, page, pantsref):
    """
    :param page: dictionary of per-page precomputed info
    :param pantsref: dictionary of pantsrefs {'foo': 'path/to/page.html#fooref', ...}
    """
    self.page = page
    self.pantsref = pantsref


class PrecomputedPageInfo(object):
  """Info we compute (and preserve) for each page before we mutate things."""

  def __init__(self, title, show_toc):
    """
    :param title: Page title
    :param show_toc: True iff we should show a toc for this page.
    """
    self.title = title
    self.show_toc = show_toc
    self.toc = []


def precompute_pantsrefs(soups):
  """Return links for <a pantsmark="foo"> tags. Mutates soups to give needed ids.

  If we see <a pantsref="foo">something</a>, that's a link whose destination is
  a <a pantsmark="foo"> </a> tag, perhaps on some other page. To stitch these
  together, we scan the docset to find all the pantsmarks. If a pantsmark does not
  yet have an id to anchor, we give it one.

  Return value dictionary maps pantsrefs to locations:
  { "foo": "path/to/foo.html#fooref", "bar": "other/page.html#barref", ...}
  """
  accumulator = {}
  for (page, soup) in soups.items():
    existing_anchors = find_existing_anchors(soup)
    count = 100
    for tag in soup.find_all('a'):
      if tag.has_attr('pantsmark'):
        pantsmark = tag['pantsmark']
        if pantsmark in accumulator:
          raise TaskError('pantsmarks are unique but "{0}" appears in {1} and {2}'
                          .format(pantsmark, page, accumulator[pantsmark]))
        # To link to a place "mid-page", we need an HTML anchor.
        # If this tag already has such an anchor, use it.
        # Else, make one up.
        anchor = tag.get('id') or tag.get('name')
        if not anchor:
          anchor = pantsmark
          while anchor in existing_anchors:
            count += 1
            anchor = '{0}_{1}'.format(pantsmark, count)
          tag['id'] = anchor
          existing_anchors = find_existing_anchors(soup)
        link = '{0}.html#{1}'.format(page, anchor)
        accumulator[pantsmark] = link
  return accumulator
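

# Illustration (hypothetical pages, not from the repo): if page 'ref' contains
#   <a pantsmark="tut_setup"></a>
# and some other page contains
#   <a pantsref="tut_setup">see the setup notes</a>
# then this function returns {'tut_setup': 'ref.html#tut_setup'} (assuming the
# mark had no id/name of its own), and link_pantsrefs below rewrites the
# pantsref into a relative href pointing at that anchor.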


def precompute(config, soups):
  """Return info we want to compute (and preserve) before we mutate things."""
  show_toc = config.get('show_toc', {})
  page = {}
  pantsrefs = precompute_pantsrefs(soups)
  for p, soup in soups.items():
    title = get_title(soup) or p
    page[p] = PrecomputedPageInfo(title=title, show_toc=show_toc.get(p, True))
  return Precomputed(page=page, pantsref=pantsrefs)


def fixup_internal_links(config, soups):
  """Find href="..." links that link to pages in our docset; fix them up.

  We don't preserve relative paths between files as we copy-transform them
  from source to dest. So adjust the paths to work with new locations.
  """
  # Pages can come from different dirs; they can go to different dirs.
  # Thus, there's some relative-path-computing here.
  reverse_directory = {}
  for d, s in config['sources'].items():
    reverse_directory[s] = d
  for name, soup in soups.items():
    old_src_dir = os.path.dirname(config['sources'][name])
    for tag in soup.find_all(True):
      if 'href' not in tag.attrs: continue
      old_rel_path = tag['href'].split('#')[0]
      old_dst = os.path.normpath(os.path.join(old_src_dir, old_rel_path))
      if old_dst not in reverse_directory: continue
      new_dst = reverse_directory[old_dst] + '.html'
      new_rel_path = rel_href(name, new_dst)
      # string-replace instead of assigning, so we don't lose the anchor in foo.html#anchor
      tag['href'] = tag['href'].replace(old_rel_path, new_rel_path, 1)
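

# Worked example (hypothetical paths): suppose page 'dev/setup' was generated
# from source 'src/docs/setup.html', which links href="install.html#x", and
# 'src/docs/install.html' is page 'install' in this docset. The old relative
# path resolves to 'src/docs/install.html'; its dest page is 'install.html',
# so rel_href rewrites the link to '../install.html#x' (relative to dev/),
# and the string-replace keeps the #x anchor intact.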


_heading_re = re.compile('^h[1-6]$')  # match heading tag names h1,h2,h3,...


def rel_href(src, dst):
  """For src='foo/bar.html', dst='garply.html#frotz' return relative link '../garply.html#frotz'."""
  src_dir = os.path.dirname(src)
  return os.path.relpath(dst, src_dir)


def find_existing_anchors(soup):
  """Return existing ids (and names) from a soup."""
  existing_anchors = set()
  for tag in soup.find_all(True):
    for attr in ['id', 'name']:
      if tag.has_attr(attr):
        existing_anchors.add(tag.get(attr))
  return existing_anchors


def ensure_headings_linkable(soups):
  """foreach soup, foreach h1,h2,etc, if no id=... or name=..., give it one.

  Enables tables of contents.
  """
  for soup in soups.values():
    # To avoid re-assigning an existing id, note 'em down.
    # Case-insensitive because distinguishing links #Foo and #foo would be weird.
    existing_anchors = find_existing_anchors(soup)
    count = 100
    for tag in soup.find_all(_heading_re):
      if not (tag.has_attr('id') or tag.has_attr('name')):
        snippet = ''.join([c for c in tag.text if c.isalpha()])[:20]
        while True:
          count += 1
          candidate_id = 'heading_{0}_{1}'.format(snippet, count).lower()
          if candidate_id not in existing_anchors:
            existing_anchors.add(candidate_id)
            tag['id'] = candidate_id
            break
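

# Example of the generated ids (illustrative): an unanchored
# <h2>Set up your repo</h2> gets an id like 'heading_setupyourrepo_101' --
# alphabetic characters only, truncated to 20 characters, lowercased, with
# the counter bumped past any collisions.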


def link_pantsrefs(soups, precomputed):
  """Transform soups: <a pantsref="foo"> becomes <a href="../foo_page.html#foo">"""
  for (page, soup) in soups.items():
    for a in soup.find_all('a'):
      if a.has_attr('pantsref'):
        pantsref = a['pantsref']
        if pantsref not in precomputed.pantsref:
          raise TaskError('Page {0} has pantsref "{1}" and I cannot find pantsmark for'
                          ' it'.format(page, pantsref))
        a['href'] = rel_href(page, precomputed.pantsref[pantsref])


def transform_soups(config, soups, precomputed):
  """Mutate our soups to be better when we write them out later."""
  fixup_internal_links(config, soups)
  ensure_headings_linkable(soups)
  # Do this after ensure_headings_linkable so that there will be links.
  generate_page_tocs(soups, precomputed)
  link_pantsrefs(soups, precomputed)


def get_title(soup):
  """Given a soup, pick out a title"""
  if soup.title: return soup.title.string
  if soup.h1: return soup.h1.string
  return ''


def generate_site_toc(config, precomputed, here):
  site_toc = []

  def recurse(tree, depth_so_far):
    for node in tree:
      if 'heading' in node:
        heading = node['heading']
        site_toc.append(dict(depth=depth_so_far,
                             link=None,
                             text=heading,
                             here=False))
      if 'page' in node and node['page'] != 'index':
        dst = node['page']
        if dst == here:
          link = here + '.html'
        else:
          link = os.path.relpath(dst + '.html', os.path.dirname(here))
        site_toc.append(dict(depth=depth_so_far,
                             link=link,
                             text=precomputed.page[dst].title,
                             here=(dst == here)))
      if 'children' in node:
        recurse(node['children'], depth_so_far + 1)

  if 'tree' in config:
    recurse(config['tree'], 0)
  return site_toc
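

# Each site_toc entry is a dict the template iterates over. Sketch with
# made-up page names: a 'tree' of
#   [{'page': 'index'}, {'heading': 'Docs', 'children': [{'page': 'install'}]}]
# rendered from page 'install' yields
#   [{'depth': 0, 'link': None, 'text': 'Docs', 'here': False},
#    {'depth': 1, 'link': 'install.html', 'text': <install's precomputed title>,
#     'here': True}]
# ('index' itself is skipped by the check above).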


def hdepth(tag):
  """Compute an h tag's "outline depth".

  E.g., h1 at top level is 1, h1 in a section is 2, h2 at top level is 2.
  """
  if not _heading_re.search(tag.name):
    raise TaskError('Can\'t compute heading depth of non-heading {0}'.format(tag))
  depth = int(tag.name[1], 10)  # get the 2 from 'h2'
  cursor = tag
  while cursor:
    if cursor.name == 'section':
      depth += 1
    cursor = cursor.parent
  return depth


def generate_page_tocs(soups, precomputed):
  for name, soup in soups.items():
    if precomputed.page[name].show_toc:
      precomputed.page[name].toc = generate_page_toc(soup)


def generate_page_toc(soup):
  """Return page-level (~list of headings) TOC template data for soup"""
  # Maybe we don't want to show all the headings. E.g., it's common for a page
  # to have just one H1, a title at the top. Our heuristic: if a page has just
  # one heading of some outline level, don't show it.
  found_depth_counts = collections.defaultdict(int)
  for tag in soup.find_all(_heading_re):
    if (tag.get('id') or tag.get('name')):
      found_depth_counts[hdepth(tag)] += 1
  depth_list = [i for i in range(100) if 1 < found_depth_counts[i]]
  depth_list = depth_list[:4]
  toc = []
  for tag in soup.find_all(_heading_re):
    depth = hdepth(tag)
    if depth in depth_list:
      toc.append(dict(depth=depth_list.index(depth) + 1,
                      link=tag.get('id') or tag.get('name'),
                      text=tag.text))
  return toc
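

# Worked example of the heuristic: a page with one h1 (the title) and three
# h2 sections has found_depth_counts {1: 1, 2: 3}, so depth_list == [2] --
# only the h2s make it into the toc, each at toc depth 1.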


def generate_generated(config, here):
  return('{0} {1}'.format(config['sources'][here],
                          datetime.datetime.now().isoformat()))


def render_html(dst, config, soups, precomputed, template):
  soup = soups[dst]
  renderer = pystache.Renderer()
  title = precomputed.page[dst].title
  topdots = ('../' * dst.count('/'))
  if soup.body:
    body_html = '{0}'.format(soup.body)
  else:
    body_html = '{0}'.format(soup)
  html = renderer.render(template,
                         body_html=body_html,
                         generated=generate_generated(config, dst),
                         site_toc=generate_site_toc(config, precomputed, dst),
                         has_page_toc=bool(precomputed.page[dst].toc),
                         page_path=dst,
                         page_toc=precomputed.page[dst].toc,
                         title=title,
                         topdots=topdots)
  return html
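

# A minimal mustache template sketch using the context keys passed above
# (illustrative only, not the real Pants template):
#
#   <html><head><title>{{title}}</title>
#     <link rel="stylesheet" href="{{topdots}}css/site.css">
#   </head><body>
#     {{#site_toc}}<a href="{{link}}">{{text}}</a>{{/site_toc}}
#     {{{body_html}}}
#     <footer>generated: {{generated}}</footer>
#   </body></html>
#
# Note the triple mustache around body_html, so pystache doesn't HTML-escape it.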


def write_en_pages(config, soups, precomputed, template):
  outdir = config['outdir']
  for dst in soups:
    html = render_html(dst, config, soups, precomputed, template)
    dst_path = os.path.join(outdir, dst + '.html')
    dst_dir = os.path.dirname(dst_path)
    if not os.path.isdir(dst_dir):
      os.makedirs(dst_dir)
    with open(dst_path, 'wb') as f:
      f.write(html.encode('utf-8'))


def copy_extras(config):
  """copy over "extra" files named in config json: stylesheets, logos, ..."""
  outdir = config['outdir']
  for dst, src in config['extras'].items():
    dst_path = os.path.join(outdir, dst)
    dst_dir = os.path.dirname(dst_path)
    if not os.path.isdir(dst_dir):
      os.makedirs(dst_dir)
    shutil.copy(src, dst_path)


def load_template(config):
  """Return text of template file specified in config"""
  with open(config['template'], 'rb') as template_file:
    template = template_file.read().decode('utf-8')
  return template