/pants-plugins/src/python/internal_backend/sitegen/tasks/sitegen.py
# coding=utf-8
# Copyright 2014 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

"""Static Site Generator for the Pants Build documentation site.

Suggested use:
  cd pants
  ./build-support/bin/publish_docs.sh  # invokes sitegen.py
"""

from __future__ import (absolute_import, division, generators, nested_scopes, print_function,
                        unicode_literals, with_statement)

import collections
import datetime
import json
import os
import re
import shutil

import pystache
from six.moves import range

from pants.base.exceptions import TaskError
from pants.task.task import Task


def beautiful_soup(*args, **kwargs):
  """Indirection function so we can lazy-import bs4.

  It's an expensive import that invokes re.compile a lot, so we don't want to incur that cost
  unless we must.
  """
  import bs4
  return bs4.BeautifulSoup(*args, **kwargs)


class SiteGen(Task):
  """Generate the Pants static web site."""

  @classmethod
  def register_options(cls, register):
    super(SiteGen, cls).register_options(register)
    register('--config-path', type=list, help='Path to .json file describing site structure.')

  def execute(self):
    if not self.get_options().config_path:
      raise TaskError('The config_path option must be specified, e.g., with the --config-path flag')
    for config_path in self.get_options().config_path:
      config = load_config(config_path)
      soups = load_soups(config)
      precomputed = precompute(config, soups)
      transform_soups(config, soups, precomputed)
      template = load_template(config)
      write_en_pages(config, soups, precomputed, template)
      copy_extras(config)


def load_config(json_path):
  """Load config info from a .json file and return it."""
  with open(json_path, 'rb') as json_file:
    config = json.loads(json_file.read().decode('utf8'))
  # Sanity-test the config: the first node of the tree must be the index page.
  assert config['tree'][0]['page'] == 'index'
  return config
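
# Illustrative sketch of the config shape this module expects. The keys below are the
# ones read elsewhere in this file; the example values are made up:
#
#   {
#     "sources":  {"index": "src/docs/index.html", "install": "src/docs/install.html"},
#     "tree":     [{"page": "index"}, {"heading": "Docs", "children": [{"page": "install"}]}],
#     "show_toc": {"index": false},
#     "template": "src/docs/page.mustache",
#     "outdir":   "dist/docsite",
#     "extras":   {"css/site.css": "src/docs/site.css"}
#   }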


def load_soups(config):
  """Generate BeautifulSoup AST for each page listed in config."""
  soups = {}
  for page, path in config['sources'].items():
    with open(path, 'rb') as orig_file:
      soups[page] = beautiful_soup(orig_file.read().decode('utf-8'))
  return soups


class Precomputed(object):
  """Info we compute (and preserve) before we mutate things."""

  def __init__(self, page, pantsref):
    """
    :param page: dictionary of per-page precomputed info
    :param pantsref: dictionary of pantsrefs {'foo': 'path/to/page.html#fooref', ...}
    """
    self.page = page
    self.pantsref = pantsref


class PrecomputedPageInfo(object):
  """Info we compute (and preserve) for each page before we mutate things."""

  def __init__(self, title, show_toc):
    """
    :param title: Page title
    :param show_toc: True iff we should show a toc for this page.
    """
    self.title = title
    self.show_toc = show_toc
    self.toc = []


def precompute_pantsrefs(soups):
  """Return links for <a pantsmark="foo"> tags. Mutates soups to give needed ids.

  If we see <a pantsref="foo">something</a>, that's a link whose destination is
  a <a pantsmark="foo"> </a> tag, perhaps on some other page. To stitch these
  together, we scan the docset to find all the pantsmarks. If a pantsmark does not
  yet have an id to anchor, we give it one.

  Return value dictionary maps pantsrefs to locations:
  { "foo": "path/to/foo.html#fooref", "bar": "other/page.html#barref", ...}
  """
  accumulator = {}
  for (page, soup) in soups.items():
    existing_anchors = find_existing_anchors(soup)
    count = 100
    for tag in soup.find_all('a'):
      if tag.has_attr('pantsmark'):
        pantsmark = tag['pantsmark']
        if pantsmark in accumulator:
          raise TaskError('pantsmarks are unique but "{0}" appears in {1} and {2}'
                          .format(pantsmark, page, accumulator[pantsmark]))
        # To link to a place "mid-page", we need an HTML anchor.
        # If this tag already has such an anchor, use it.
        # Else, make one up.
        anchor = tag.get('id') or tag.get('name')
        if not anchor:
          anchor = pantsmark
          while anchor in existing_anchors:
            count += 1
            anchor = '{0}_{1}'.format(pantsmark, count)
          tag['id'] = anchor
          existing_anchors = find_existing_anchors(soup)
        link = '{0}.html#{1}'.format(page, anchor)
        accumulator[pantsmark] = link
  return accumulator


def precompute(config, soups):
  """Return info we want to compute (and preserve) before we mutate things."""
  show_toc = config.get('show_toc', {})
  page = {}
  pantsrefs = precompute_pantsrefs(soups)
  for p, soup in soups.items():
    title = get_title(soup) or p
    page[p] = PrecomputedPageInfo(title=title, show_toc=show_toc.get(p, True))
  return Precomputed(page=page, pantsref=pantsrefs)


def fixup_internal_links(config, soups):
  """Find href="..." links that link to pages in our docset; fix them up.

  We don't preserve relative paths between files as we copy-transform them
  from source to dest. So adjust the paths to work with new locations.
  """
  # Pages can come from different dirs; they can go to different dirs.
  # Thus, there's some relative-path-computing here.
  reverse_directory = {}
  for d, s in config['sources'].items():
    reverse_directory[s] = d
  for name, soup in soups.items():
    old_src_dir = os.path.dirname(config['sources'][name])
    for tag in soup.find_all(True):
      if 'href' not in tag.attrs:
        continue
      old_rel_path = tag['href'].split('#')[0]
      old_dst = os.path.normpath(os.path.join(old_src_dir, old_rel_path))
      if old_dst not in reverse_directory:
        continue
      new_dst = reverse_directory[old_dst] + '.html'
      new_rel_path = rel_href(name, new_dst)
      # String-replace instead of assigning so we don't lose the anchor in foo.html#anchor.
      tag['href'] = tag['href'].replace(old_rel_path, new_rel_path, 1)


_heading_re = re.compile('^h[1-6]$')  # match heading tag names h1, h2, h3, ...


def rel_href(src, dst):
  """For src='foo/bar.html', dst='garply.html#frotz' return relative link '../garply.html#frotz'."""
  src_dir = os.path.dirname(src)
  return os.path.relpath(dst, src_dir)


def find_existing_anchors(soup):
  """Return existing ids (and names) from a soup."""
  existing_anchors = set()
  for tag in soup.find_all(True):
    for attr in ['id', 'name']:
      if tag.has_attr(attr):
        existing_anchors.add(tag.get(attr))
  return existing_anchors


def ensure_headings_linkable(soups):
  """For each soup and each h1, h2, etc., if it has no id=... or name=..., give it one.

  Enables tables of contents.
  """
  for soup in soups.values():
    # To avoid re-assigning an existing id, note 'em down.
    # Case-insensitive because distinguishing links #Foo and #foo would be weird.
    existing_anchors = find_existing_anchors(soup)
    count = 100
    for tag in soup.find_all(_heading_re):
      if not (tag.has_attr('id') or tag.has_attr('name')):
        snippet = ''.join([c for c in tag.text if c.isalpha()])[:20]
        while True:
          count += 1
          candidate_id = 'heading_{0}_{1}'.format(snippet, count).lower()
          if candidate_id not in existing_anchors:
            existing_anchors.add(candidate_id)
            tag['id'] = candidate_id
            break


def link_pantsrefs(soups, precomputed):
  """Transform soups: <a pantsref="foo"> becomes <a href="../foo_page.html#foo">."""
  for (page, soup) in soups.items():
    for a in soup.find_all('a'):
      if a.has_attr('pantsref'):
        pantsref = a['pantsref']
        if pantsref not in precomputed.pantsref:
          raise TaskError('Page {0} has pantsref "{1}" and I cannot find pantsmark for'
                          ' it'.format(page, pantsref))
        a['href'] = rel_href(page, precomputed.pantsref[pantsref])


def transform_soups(config, soups, precomputed):
  """Mutate our soups to be better when we write them out later."""
  fixup_internal_links(config, soups)
  ensure_headings_linkable(soups)
  # Do this after ensure_headings_linkable so that there will be links.
  generate_page_tocs(soups, precomputed)
  link_pantsrefs(soups, precomputed)


def get_title(soup):
  """Given a soup, pick out a title."""
  if soup.title:
    return soup.title.string
  if soup.h1:
    return soup.h1.string
  return ''


def generate_site_toc(config, precomputed, here):
  site_toc = []

  def recurse(tree, depth_so_far):
    for node in tree:
      if 'heading' in node:
        heading = node['heading']
        site_toc.append(dict(depth=depth_so_far,
                             link=None,
                             text=heading,
                             here=False))
      if 'page' in node and node['page'] != 'index':
        dst = node['page']
        if dst == here:
          link = here + '.html'
        else:
          link = os.path.relpath(dst + '.html', os.path.dirname(here))
        site_toc.append(dict(depth=depth_so_far,
                             link=link,
                             text=precomputed.page[dst].title,
                             here=(dst == here)))
      if 'children' in node:
        recurse(node['children'], depth_so_far + 1)

  if 'tree' in config:
    recurse(config['tree'], 0)
  return site_toc


def hdepth(tag):
  """Compute an h tag's "outline depth".

  E.g., h1 at top level is 1, h1 in a section is 2, h2 at top level is 2.
  """
  if not _heading_re.search(tag.name):
    raise TaskError("Can't compute heading depth of non-heading {0}".format(tag))
  depth = int(tag.name[1], 10)  # get the 2 from 'h2'
  cursor = tag
  while cursor:
    if cursor.name == 'section':
      depth += 1
    cursor = cursor.parent
  return depth


def generate_page_tocs(soups, precomputed):
  for name, soup in soups.items():
    if precomputed.page[name].show_toc:
      precomputed.page[name].toc = generate_page_toc(soup)


def generate_page_toc(soup):
  """Return page-level (~list of headings) TOC template data for soup."""
  # Maybe we don't want to show all the headings. E.g., it's common for a page
  # to have just one H1, a title at the top. Our heuristic: if a page has just
  # one heading of some outline level, don't show it.
  found_depth_counts = collections.defaultdict(int)
  for tag in soup.find_all(_heading_re):
    if (tag.get('id') or tag.get('name')):
      found_depth_counts[hdepth(tag)] += 1
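  # Keep only the depths that occur more than once (per the heuristic above), and cap
  # the page TOC at four levels of nesting.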
  depth_list = [i for i in range(100) if 1 < found_depth_counts[i]]
  depth_list = depth_list[:4]
  toc = []
  for tag in soup.find_all(_heading_re):
    depth = hdepth(tag)
    if depth in depth_list:
      toc.append(dict(depth=depth_list.index(depth) + 1,
                      link=tag.get('id') or tag.get('name'),
                      text=tag.text))
  return toc


def generate_generated(config, here):
  return '{0} {1}'.format(config['sources'][here],
                          datetime.datetime.now().isoformat())


def render_html(dst, config, soups, precomputed, template):
  soup = soups[dst]
  renderer = pystache.Renderer()
  title = precomputed.page[dst].title
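  # Relative prefix back up to the site root from this page's directory,
  # e.g. dst='a/b/page' gives '../../'.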
  topdots = ('../' * dst.count('/'))
  if soup.body:
    body_html = '{0}'.format(soup.body)
  else:
    body_html = '{0}'.format(soup)
  html = renderer.render(template,
                         body_html=body_html,
                         generated=generate_generated(config, dst),
                         site_toc=generate_site_toc(config, precomputed, dst),
                         has_page_toc=bool(precomputed.page[dst].toc),
                         page_path=dst,
                         page_toc=precomputed.page[dst].toc,
                         title=title,
                         topdots=topdots)
  return html


def write_en_pages(config, soups, precomputed, template):
  outdir = config['outdir']
  for dst in soups:
    html = render_html(dst, config, soups, precomputed, template)
    dst_path = os.path.join(outdir, dst + '.html')
    dst_dir = os.path.dirname(dst_path)
    if not os.path.isdir(dst_dir):
      os.makedirs(dst_dir)
    with open(dst_path, 'wb') as f:
      f.write(html.encode('utf-8'))


def copy_extras(config):
  """Copy over "extra" files named in config json: stylesheets, logos, ..."""
  outdir = config['outdir']
  for dst, src in config['extras'].items():
    dst_path = os.path.join(outdir, dst)
    dst_dir = os.path.dirname(dst_path)
    if not os.path.isdir(dst_dir):
      os.makedirs(dst_dir)
    shutil.copy(src, dst_path)


def load_template(config):
  """Return text of template file specified in config."""
  with open(config['template'], 'rb') as template_file:
    template = template_file.read().decode('utf-8')
  return template