/documentation/tools/build.py
Python | 537 lines | 530 code | 7 blank | 0 comment | 8 complexity | da87d6b66e092f4bed70aeb1514d9a76 MD5 | raw file
- import hashlib
- import json
- import os
- import re
- import shutil
- from datetime import datetime
- from functools import lru_cache as cache
- from pathlib import Path, PurePath
- from string import Template
- from sys import maxsize
- from typing import List, Union
- import emoji
- import markdown
- import sass
- import yaml
- from bs4 import BeautifulSoup
- from jinja2 import Environment, FileSystemLoader, select_autoescape
# Absolute directory that contains this build script.
HERE = Path(__file__).resolve().parent

# Optional URL prefix read from BLOG_PATH_PREFIX; empty string when unset.
path_prefix = os.getenv("BLOG_PATH_PREFIX", "")
class Environments:
    """String names of the build environments this script distinguishes."""

    # Plain strings so they compare directly against the lowercased
    # BLOG_ENV value returned by environment().
    dev, dist = "dev", "dist"
def default_jinja_variables():
    """Return a fresh dictionary of base template variables.

    Returns:
        A new dict with empty ``articles`` and ``articles_by_tag`` mappings
        and ``current_year`` set to the current UTC year. A new dict (with new
        inner dicts) is created on every call so builds do not share state.
    """
    # Local import: the module only imports ``datetime`` from ``datetime``.
    from datetime import timezone

    # datetime.utcnow() is deprecated; an aware "now" yields the same UTC year.
    now = datetime.now(timezone.utc)
    return {"articles": {}, "articles_by_tag": {}, "current_year": now.year}
# Module-wide template context; do_build() replaces it with a fresh copy.
jinja_variables = default_jinja_variables()

# Captures the path of every ``href="….scss"`` reference in rendered HTML.
scss_ptn = re.compile(r"href=\"(.+?\.scss)\s*?\"")
@cache()
def environment():
    """Return the lowercased BLOG_ENV value, defaulting to ``Environments.dev``.

    The result is memoised, so the environment variable is read only once.
    """
    configured = os.environ.get("BLOG_ENV", Environments.dev)
    return configured.lower()
def state_path():
    """Return the path of the ``.state.json`` file next to this script."""
    return HERE.joinpath(".state.json")
def load_state():
    """Load the persisted build state.

    Returns:
        The decoded JSON content of the state file, or an empty dict when the
        file does not exist or does not contain valid JSON.
    """
    location = state_path()
    if location.exists():
        try:
            with location.open("r") as handle:
                return json.load(handle)
        except json.JSONDecodeError:
            # A corrupt state file is treated the same as a missing one.
            pass
    return dict()
def save_state(state):
    """Write ``state`` to the state file as JSON indented by two spaces."""
    destination = state_path()
    with destination.open("w+") as handle:
        json.dump(state, handle, indent=2)
def hash_file(file: Path):
    """Return the lowercase hexadecimal BLAKE2s digest of a file's bytes."""
    digest = hashlib.blake2s(file.read_bytes())
    return digest.hexdigest().lower()
def sass_output_style():
    """Return the Sass output style name for the current environment.

    ``dev`` maps to "expanded", ``dist`` to "compressed"; any other
    environment falls back to "compact".
    """
    styles = {Environments.dev: "expanded", Environments.dist: "compressed"}
    return styles.get(environment(), "compact")
def here() -> Path:
    """Return the absolute directory that contains this script."""
    script = Path(__file__).resolve()
    return script.parent
def tools() -> Path:
    """Return the tools directory, which is the directory of this script."""
    tools_directory = here()
    return tools_directory
def root() -> Path:
    """Return the parent of the tools directory."""
    tools_directory = tools()
    return tools_directory.parent
def src() -> Path:
    """Return the ``src`` directory directly under root()."""
    return root().joinpath("src")
def build() -> Path:
    """Return the output directory for the current environment.

    ``dev`` builds go to a fixed directory under ``/tmp``, ``dist`` builds to
    ``dist`` under root(), and any other environment to ``build`` under root().
    """
    current = environment()
    if current == Environments.dev:
        return Path("/", "tmp", "grapejuice-docs", "build")
    if current == Environments.dist:
        return root() / "dist"
    return root() / "build"
def clean():
    """Delete the build directory tree, ignoring any removal errors."""
    output_directory = build()
    shutil.rmtree(output_directory, ignore_errors=True)
def get_build_files():
    """Map every regular file under build() to its content hash.

    Returns:
        A dict whose keys are the file paths as strings and whose values are
        the digests produced by hash_file().
    """
    regular_files = [entry for entry in build().rglob("*") if entry.is_file()]
    return {str(entry): hash_file(entry) for entry in regular_files}
def copy():
    """Copy the source tree into the build directory and record file hashes.

    After copying, the hashes of all build files are stored under the
    ``build_files`` key of the persisted state.
    """
    state = load_state()

    def ignored_names(*_):
        # NOTE(review): copytree expects names relative to the directory being
        # copied, while these keys look like full build paths (they come from
        # get_build_files()) — so this probably never excludes anything.
        # Confirm the intent before changing it.
        return list(state.get("static_files", dict()).keys())

    shutil.copytree(src(), build(), ignore=ignored_names)
    state["build_files"] = get_build_files()
    save_state(state)
def rewrite_extension(p: Path, extension: str) -> Path:
    """Return ``p`` with its final extension replaced by ``extension``.

    Only the part after the last dot is replaced, so ``a.b.md`` becomes
    ``a.b.html``. A name without any dot keeps its full name and gains the
    extension (``README`` -> ``README.html``); previously the name was lost.

    Args:
        p: Path whose file name should be rewritten.
        extension: New extension, without a leading dot.

    Returns:
        A path in the same directory carrying the new extension.
    """
    stem, dot, _old_extension = p.name.rpartition(".")
    # rpartition leaves ``dot`` empty when the name contains no dot at all.
    base = stem if dot else p.name
    return p.parent / f"{base}.{extension}"
class Summarizer:
    """Collect leading words of a text into a short, length-limited summary.

    Words are appended until adding the next one would reach the character
    limit (spaces between words are not counted) or until a break marker is
    seen. Once either happens the summary is closed and later add() calls
    are ignored, so the summary is always a contiguous prefix of the input.
    """

    _words: List[str]
    _character_counter: int = 0
    _limit: int
    _break_pads: List[str]
    _finished: bool = False

    def __init__(self, limit: int = 50, break_pads: Union[List[str], None] = None):
        """Create a summarizer.

        Args:
            limit: Character budget; the summed word lengths stay below it.
            break_pads: Marker words (compared case-insensitively) that end
                the summary when encountered.
        """
        self._words = []
        self._character_counter = 0
        self._limit = limit
        self._break_pads = list(set(map(str.strip, break_pads))) if break_pads else []
        self._finished = False

    @property
    def limit_reached(self) -> bool:
        """True once the summary is closed by the limit or a break marker."""
        return self._finished or self._character_counter >= self._limit

    @property
    def content(self) -> str:
        """The collected words joined by single spaces."""
        return " ".join(self._words)

    def add(self, words: str):
        """Append whitespace-separated words from ``words`` to the summary."""
        if self._finished:
            return

        markers = {bp.lower() for bp in self._break_pads}
        for word in re.split(r"\s+", words):
            if not word:
                # Leading/trailing whitespace yields empty tokens; skip them
                # so the summary does not start with a stray space.
                continue
            if word.lower() in markers:
                self._finished = True
                return
            if self._character_counter + len(word) >= self._limit:
                self._finished = True
                return
            self._words.append(word)
            self._character_counter += len(word)

    def __str__(self):
        return self.content
def process_markdown():
    """Convert every Markdown file in the build tree into an article template.

    For each ``*.md`` file under build():
      * an optional YAML front matter block (the lines before the first line
        starting with ``---``) is split off and parsed;
      * the remaining Markdown is rendered to HTML and wrapped in a Jinja
        template that extends ``layout/_article.html``;
      * article metadata (href, title, subtitle, summary, date, tags) is
        registered in ``jinja_variables``;
      * the Markdown source is removed from the build tree.
    """
    html_template = Template(
        """{% extends "layout/_article.html" %}
{% block article %}
$MD_HTML
{% endblock %}
"""
    )

    # Snapshot the file list: the loop writes and deletes files in this tree.
    for md_file in list(build().rglob("*.md")):
        html_file = rewrite_extension(md_file, "html")
        html_file = html_file.parent / re.sub(r"\s", "_", html_file.name)

        with md_file.open("r") as fp:
            md_content = fp.read()

        front_matter_lines = []
        md_content_lines = []
        found_front_matter = False

        all_md_lines = md_content.split("\n")
        all_md_lines.append("")

        # Heuristic: only look for front matter when the first line looks
        # like a ``key: value`` pair.
        if ":" in all_md_lines[0].strip():
            line_target = front_matter_lines
            for line in all_md_lines:
                line = line.replace("\r", "")
                if not found_front_matter and line.strip().startswith("---"):
                    # The delimiter line itself lands in the content lines.
                    line_target = md_content_lines
                    found_front_matter = True
                line_target.append(line)
        else:
            md_content_lines = all_md_lines

        front_matter_data = dict()
        if found_front_matter:
            try:
                front_matter_data = yaml.safe_load("\n".join(front_matter_lines))
            except Exception:
                # Detection is best-effort: if the candidate block does not
                # parse, treat it as ordinary content instead of failing.
                found_front_matter = False

        if not found_front_matter:
            md_content_lines = [*front_matter_lines, *md_content_lines]
            front_matter_lines = []
            front_matter_data = dict()

        if not isinstance(front_matter_data, dict):
            front_matter_data = dict()

        md_content = "\n".join(md_content_lines)
        md_content = emoji.emojize(md_content, variant="emoji_type", use_aliases=True)

        rendered_markdown = markdown.markdown(
            md_content,
            extensions=[
                "markdown.extensions.tables",
                "markdown.extensions.fenced_code",
                "markdown.extensions.codehilite",
                "markdown.extensions.smarty",
                "markdown.extensions.toc",
                "mdx_truly_sane_lists",
            ],
        )

        # The snip marker only delimits the summary; drop it from the page.
        html_content = html_template.safe_substitute(
            {"MD_HTML": re.sub(r"\s*\[summary\-snip\]\s*", "", rendered_markdown)}
        )

        md_soup = BeautifulSoup(rendered_markdown, "lxml")
        summarizer = Summarizer(break_pads=["[summary-snip]"])
        summarizer.add(md_soup.text)

        with html_file.open("w+") as fp:
            fp.write(html_content)

        href = "/" + str(html_file.relative_to(build()))

        article_date = front_matter_data.get("date")
        if isinstance(article_date, str):
            # fromisocalendar() takes (year, week, day) and always raised
            # here; string dates are parsed as ISO 8601 instead.
            try:
                article_date = datetime.fromisoformat(article_date)
            except ValueError:
                print(f"Ignoring unparseable date {article_date!r} in {md_file}")
                article_date = None

        if article_date is None:
            # Fall back to the change time of the Markdown file.
            article_date = datetime.fromtimestamp(os.stat(md_file).st_ctime)

        article_info = {
            "href": href,
            "front_matter": front_matter_data,
            # ``.stem`` drops only the final ".html"; rstrip(".html") removed
            # any trailing run of the characters ".", "h", "t", "m", "l".
            "title": front_matter_data.get("title", PurePath(href).stem),
            "subtitle": front_matter_data.get("subtitle", ""),
            "summary": summarizer.content,
            "date": article_date,
        }

        if tags := front_matter_data.get("tags", None):
            if isinstance(tags, str):
                # A scalar ``tags:`` value is one tag, not a list of characters.
                tags = [tags]
            for tag in tags:
                articles_list = jinja_variables["articles_by_tag"].setdefault(tag, [])
                articles_list.append(article_info)

        jinja_variables["articles"][href] = article_info
        os.remove(md_file)
def process_html_file(
    jenv: Environment, source_file: Path, target_file: Union[Path, None] = None
):
    """Render one HTML template and write the final page.

    Steps: render the file through Jinja with ``jinja_variables``, compile
    every referenced ``.scss`` stylesheet to ``.css``, rewrite ``href``/``src``
    attributes to prefixed site-absolute paths, decorate external links, and
    write the prettified document.

    Args:
        jenv: Jinja environment used to render the template.
        source_file: Template file inside the build tree.
        target_file: Where to write the result; defaults to ``source_file``.

    Raises:
        AssertionError: from find_href_target() when a local link target
            does not exist.
    """
    target_file = target_file or source_file

    with source_file.open("r") as fp:
        content = fp.read()

    href = "/" + str(source_file.relative_to(build()))
    # None when this page is not a registered article.
    jinja_variables["article"] = jinja_variables["articles"].get(href)

    # Template render
    template = jenv.from_string(content)
    content = template.render(jinja_variables)

    # SCSS
    def map_style_path(s: str):
        if s.startswith("/"):
            return build() / s.lstrip("/")
        return source_file.parent / s

    styles = list(set(scss_ptn.findall(content)))
    style_paths = list(map(map_style_path, styles))

    for style, style_file in zip(styles, style_paths):
        try:
            output_style_file = rewrite_extension(style_file, "css")

            with style_file.open("r") as fp:
                output_style = sass.compile(
                    string=fp.read(),
                    output_style=sass_output_style(),
                    source_map_embed=True,
                    include_paths=[str(style_file.parent)],
                )

            with output_style_file.open("w+") as fp:
                fp.write(output_style)

            content = content.replace(
                style, "/" + str(output_style_file.relative_to(build()))
            )

        except sass.CompileError as e:
            print(
                f"SASS compiler error:\n{str(e)}\n{str(style_file.relative_to(build()))}\n----------"
            )

        except FileNotFoundError as e:
            print(f"File not found: {e}\n----------")

    soup = BeautifulSoup(content, "html5lib")

    # URL forms that never name a file in the build tree. The non-HTTP
    # entries used to reach find_href_target() and abort the build there.
    passthrough_prefixes = (
        "http",
        "#",
        "//",
        "mailto:",
        "tel:",
        "data:",
        "javascript:",
    )

    def update_targeting_attr(attrs, attr):
        v = attrs.get(attr)

        if v == path_prefix or v.startswith(passthrough_prefixes):
            return

        if v == "/":
            attrs[attr] = path_prefix if path_prefix else v
            return

        target = find_href_target(target_file, v)
        if isinstance(target, Path):
            target = str(target.relative_to(build()))

        attrs[attr] = (
            path_prefix + "/" + target if path_prefix else "/" + target
        ).rstrip("/")

    def update_external_anchor_tag(tag):
        v = tag.attrs.get("href", "")

        if v.startswith("http://") or v.startswith("https://"):
            # Open external links in a new tab unless a target is already set.
            tag.attrs["target"] = tag.attrs.get("target", "_blank")

            el_image = soup.new_tag("img", src="/images/external_link.svg")
            el_image["class"] = "external-link-image"
            tag.append(el_image)

    for href_tag in soup.find_all(href=True):
        update_targeting_attr(href_tag, "href")

    # Runs before the ``src`` pass so the icons added here are rewritten too.
    for href_tag in soup.find_all("a", href=True):
        update_external_anchor_tag(href_tag)

    for src_tag in soup.find_all(src=True):
        update_targeting_attr(src_tag, "src")

    content = soup.prettify()

    # Emit
    with target_file.open("w+") as fp:
        fp.write(content)
def find_href_target(from_file: Path, href: str):
    """Resolve a link to the file it points at inside the build tree.

    Args:
        from_file: File containing the link; relative links resolve against
            its directory.
        href: Link value. A leading ``/`` means "relative to build()".

    Returns:
        ``href`` unchanged when it has no path segments or starts with
        ``http:``/``https:``; otherwise the resolved target path. When the
        target does not exist and its name does not end in ``.html``, the
        same name with ``.html`` appended is used instead.

    Raises:
        AssertionError: when no target file exists. Raised explicitly so the
            check also runs under ``python -O``, which strips ``assert``.
    """
    href_split = list(filter(None, href.split("/")))

    if not href_split:
        return href

    if href_split[0] in ("https:", "http:"):
        return href

    if href.startswith("/"):
        path = Path(build(), *href_split)
    else:
        path = Path(from_file.parent, *href_split)

    path = path.resolve()

    # Links may omit the ".html" extension.
    if not path.exists() and not path.name.endswith(".html"):
        path = path.parent / (path.name + ".html")

    if not path.exists():
        raise AssertionError(
            f"Could not find href target for {href} -> {path} in {from_file}"
        )

    return path
def process_html_file_multi_out(jenv: Environment, source_file: Path, variable: str):
    """Render one template once per value of a list-valued template variable.

    The template's file name is a ``string.Template``; ``$<variable>`` in it
    is replaced by each value to form the output file name. While a page is
    rendered, ``jinja_variables["multi_out"]`` holds the current value under
    the key ``"variable"``. Afterwards it is reset to None and the template
    file is deleted.
    """
    name_pattern = Template(source_file.name)
    output_directory = source_file.parent

    for value in jinja_variables[variable]:
        rendered_name = name_pattern.safe_substitute({variable: value})
        destination = output_directory / rendered_name

        jinja_variables["multi_out"] = {"variable": value}
        process_html_file(jenv, source_file, target_file=destination)

    jinja_variables["multi_out"] = None
    os.remove(source_file)
def process_html():
    """Render all HTML templates in the build tree and drop SCSS sources.

    Publishes ``articles_list`` (articles sorted by date) and ``tags`` in
    ``jinja_variables``, then renders every ``*.html`` file whose name does
    not start with ``_``. Files named like ``$variable.html`` are expanded
    into one page per value of that variable. Finally all ``*.scss`` files
    are removed from the build tree.
    """
    jinja_variables["articles_list"] = sorted(
        jinja_variables["articles"].values(),
        key=lambda a: a.get("date") or datetime.now(),
    )
    jinja_variables["tags"] = list(jinja_variables["articles_by_tag"].keys())

    jenv = Environment(
        loader=FileSystemLoader(build()), autoescape=select_autoescape(), cache_size=0
    )

    # Snapshot first: rendering creates new HTML files (multi-out pages) and
    # deletes templates, and a lazy rglob() over a changing tree could pick
    # up generated pages and render them a second time.
    html_files = [p for p in build().rglob("*.html") if not p.name.startswith("_")]

    for html_file in html_files:
        if match := re.search(r"\$(\w+)\.html", html_file.name):
            process_html_file_multi_out(jenv, html_file, match.group(1))
        else:
            process_html_file(jenv, html_file)

    # Same reasoning: collect the paths before deleting anything.
    for scss_file in list(build().rglob("*.scss")):
        os.remove(scss_file)
def strip_partials():
    """Delete every regular file in the build tree whose name starts with ``_``."""
    for candidate in build().rglob("*"):
        if candidate.name.startswith("_") and candidate.is_file():
            os.remove(candidate)
def remove_empty_directories(base: Union[Path, None] = None):
    """Remove all empty directories below ``base``, deepest first.

    Directories that become empty because their only children were empty
    directories are removed as well. ``base`` itself is never removed.

    The previous loop had an inverted condition: it never terminated when no
    empty directory existed, and otherwise stopped after one pass and left
    newly emptied parents behind.

    Args:
        base: Directory to clean up; defaults to the build directory.
    """
    base = build() if base is None else base

    directories = [p for p in base.rglob("*") if p.is_dir()]
    # Children before parents, so a parent is checked after its subtree.
    directories.sort(key=lambda p: len(p.parts), reverse=True)

    for directory in directories:
        if not any(directory.iterdir()):
            os.rmdir(directory)
def update_static_files_in_state():
    """Record which build files are unchanged since copy() ran.

    A file counts as static when its path and hash in the current build tree
    both match the ``build_files`` entry stored in the state. The result is
    saved under the ``static_files`` key.
    """
    state = load_state()
    previous = state.get("build_files", dict())

    # Dict lookups replace the former list-membership test, which compared
    # every current file against every previous one.
    state["static_files"] = {
        path: digest
        for path, digest in get_build_files().items()
        if path in previous and previous[path] == digest
    }

    save_state(state)
def do_build():
    """Run a complete build: clean, copy sources, render Markdown and HTML.

    In the ``dist`` environment, partial files and the directories they
    leave empty are removed afterwards. The module-level ``jinja_variables``
    is reset at the start so repeated builds do not accumulate articles.
    """
    global jinja_variables

    for banner_line in ("### Building a blog! ###", f"Environment = {environment()}", ""):
        print(banner_line)

    jinja_variables = default_jinja_variables()

    # Order matters: each stage works on the output of the previous one.
    for stage in (clean, copy, process_markdown, process_html):
        stage()

    if environment() == Environments.dist:
        strip_partials()
        remove_empty_directories()

    update_static_files_in_state()

    print("Done!")
def main():
    """Command-line entry point: run one full build."""
    do_build()
# Run a build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()