
/documentation/tools/build.py

https://gitlab.com/imbest91/grapejuice
import hashlib
import json
import os
import re
import shutil
from datetime import datetime
from functools import lru_cache as cache
from pathlib import Path, PurePath
from string import Template
from sys import maxsize
from typing import List, Union

import emoji
import markdown
import sass
import yaml
from bs4 import BeautifulSoup
from jinja2 import Environment, FileSystemLoader, select_autoescape

HERE = Path(__file__).resolve().parent
path_prefix = os.environ.get("BLOG_PATH_PREFIX", "")


class Environments:
    dev = "dev"
    dist = "dist"


def default_jinja_variables():
    now = datetime.utcnow()
    return {"articles": dict(), "articles_by_tag": dict(), "current_year": now.year}


jinja_variables = default_jinja_variables()
scss_ptn = re.compile(r"href=\"(.+?\.scss)\s*?\"")


@cache()
def environment():
    return os.environ.get("BLOG_ENV", Environments.dev).lower()


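# Build state: file hashes are persisted in .state.json next to this script so
# that unchanged static files can be recognised between builds.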
def state_path():
    return HERE / ".state.json"


def load_state():
    if not state_path().exists():
        return dict()

    try:
        with state_path().open("r") as fp:
            return json.load(fp)
    except json.JSONDecodeError:
        return dict()


def save_state(state):
    with state_path().open("w+") as fp:
        json.dump(state, fp, indent=2)


def hash_file(file: Path):
    h = hashlib.blake2s()

    with file.open("rb") as fp:
        h.update(fp.read())

    return h.hexdigest().lower()


def sass_output_style():
    if environment() == Environments.dev:
        return "expanded"
    elif environment() == Environments.dist:
        return "compressed"

    return "compact"


def here() -> Path:
    return Path(__file__).resolve().parent


def tools() -> Path:
    return here()


def root() -> Path:
    return tools().parent


def src() -> Path:
    return root() / "src"


def build() -> Path:
    if environment() == Environments.dev:
        return Path("/", "tmp", "grapejuice-docs", "build")
    elif environment() == Environments.dist:
        return root() / "dist"
    else:
        return root() / "build"


def clean():
    shutil.rmtree(build(), ignore_errors=True)


def get_build_files():
    build_files = list(filter(Path.is_file, build().rglob("*")))
    return dict(zip(map(str, build_files), map(hash_file, build_files)))


def copy():
    state = load_state()

    shutil.copytree(
        src(), build(), ignore=lambda *_: list(state.get("static_files", dict()).keys())
    )

    state["build_files"] = get_build_files()
    save_state(state)


def rewrite_extension(p: Path, extension: str) -> Path:
    s = p.name.split(".")[:-1]
    s.extend([extension])

    return p.parent / ".".join(s)


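# Summarizer collects whitespace-separated words up to a character limit and
# stops early at a configured break marker; the collected text becomes an
# article's summary.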
class Summarizer:
    _words: List[str]
    _character_counter: int = 0
    _limit: int
    _break_pads: List[str]

    def __init__(self, limit: int = 50, break_pads: Union[List[str], None] = None):
        self._words = []
        self._limit = limit
        self._break_pads = list(set(map(str.strip, break_pads))) if break_pads else []

    @property
    def limit_reached(self) -> bool:
        return self._character_counter >= self._limit

    @property
    def content(self) -> str:
        return " ".join(self._words)

    def add(self, words: str):
        for word in re.split(r"\s+", words):
            word = word.strip()
            length = len(word)

            # Stop collecting as soon as a break marker is encountered
            for bp in self._break_pads:
                if word.lower() == bp.lower():
                    return

            if self._character_counter + length < self._limit:
                self._words.append(word)
                self._character_counter += length
            else:
                return

    def __str__(self):
        return self.content


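# Turn every Markdown file in the build tree into an HTML Jinja template:
# split off optional YAML front matter, render the Markdown body, wrap it in
# the layout/_article.html layout, and register the article's metadata
# (title, date, summary, tags) in jinja_variables for later rendering.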
def process_markdown():
    def markdown_files():
        return build().rglob("*.md")

    for md_file in markdown_files():
        html_file = rewrite_extension(md_file, "html")
        html_file = html_file.parent / re.sub(r"\s", "_", html_file.name)

        with md_file.open("r") as fp:
            md_content = fp.read()

        front_matter_lines = []
        md_content_lines = []
        line_target = front_matter_lines
        found_front_matter = False

        all_md_lines = md_content.split("\n")
        all_md_lines.append("")

        # Only scan for YAML front matter if the first line looks like a key: value pair
        line_zero = all_md_lines[0].strip()
        scan_for_front_matter = ":" in line_zero

        if scan_for_front_matter:
            for line in all_md_lines:
                line = line.replace("\r", "")
                stripped_line = line.strip()

                if not found_front_matter and stripped_line.startswith("---"):
                    line_target = md_content_lines
                    found_front_matter = True

                line_target.append(line)
        else:
            md_content_lines = all_md_lines

        if found_front_matter:
            try:
                yaml.safe_load("\n".join(front_matter_lines))
            except yaml.YAMLError:
                found_front_matter = False

        if found_front_matter:
            front_matter_data = yaml.safe_load("\n".join(front_matter_lines))
        else:
            # Not valid front matter after all; treat it as regular article content
            md_content_lines = [*front_matter_lines, *md_content_lines]
            front_matter_lines = []
            front_matter_data = dict()

        if not isinstance(front_matter_data, dict):
            front_matter_data = dict()

        md_content = "\n".join(md_content_lines)
        md_content = emoji.emojize(md_content, variant="emoji_type", use_aliases=True)

        html_template = Template(
            """{% extends "layout/_article.html" %}
{% block article %}
$MD_HTML
{% endblock %}
"""
        )

        rendered_markdown = markdown.markdown(
            md_content,
            extensions=[
                "markdown.extensions.tables",
                "markdown.extensions.fenced_code",
                "markdown.extensions.codehilite",
                "markdown.extensions.smarty",
                "markdown.extensions.toc",
                "mdx_truly_sane_lists",
            ],
        )

        html_content = html_template.safe_substitute(
            {"MD_HTML": re.sub(r"\s*\[summary\-snip\]\s*", "", rendered_markdown)}
        )

        # Build the article summary from the rendered text, up to the [summary-snip] marker
        md_soup = BeautifulSoup(rendered_markdown, "lxml")
        summarizer = Summarizer(break_pads=["[summary-snip]"])
        summarizer.add(md_soup.text)

        with html_file.open("w+") as fp:
            fp.write(html_content)

        href = "/" + str(html_file.relative_to(build()))

        # Prefer the date from the front matter, falling back to the file's ctime
        article_date = front_matter_data.get("date")
        if isinstance(article_date, str):
            article_date = datetime.fromisoformat(article_date)

        if article_date is None:
            md_stat = os.stat(md_file)
            article_date = datetime.fromtimestamp(md_stat.st_ctime)

        article_info = {
            "href": href,
            "front_matter": front_matter_data,
            "title": front_matter_data.get("title", PurePath(href).stem),
            "subtitle": front_matter_data.get("subtitle", ""),
            "summary": summarizer.content,
            "date": article_date,
        }

        if tags := front_matter_data.get("tags", None):
            for tag in tags:
                articles_list = jinja_variables["articles_by_tag"].setdefault(tag, [])
                articles_list.append(article_info)

        jinja_variables["articles"][href] = article_info

        os.remove(md_file)


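# Render a single HTML file: evaluate it as a Jinja template, compile any SCSS
# stylesheets it references to CSS, and rewrite href/src attributes so they
# respect BLOG_PATH_PREFIX and point at files that exist in the build tree.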
def process_html_file(
    jenv: Environment, source_file: Path, target_file: Union[Path, None] = None
):
    target_file = target_file or source_file

    with source_file.open("r") as fp:
        content = fp.read()

    href = "/" + str(source_file.relative_to(build()))

    if href in jinja_variables["articles"]:
        jinja_variables["article"] = jinja_variables["articles"][href]
    else:
        jinja_variables["article"] = None

    # Template render
    template = jenv.from_string(content)
    content = template.render(jinja_variables)

    # SCSS
    def map_style_path(s: str):
        if s.startswith("/"):
            return build() / s.lstrip("/")
        else:
            return source_file.parent / s

    styles = list(set(scss_ptn.findall(content)))
    style_paths = list(map(map_style_path, styles))

    for style, style_file in zip(styles, style_paths):
        try:
            output_style_file = rewrite_extension(style_file, "css")
            output_style = ""

            with style_file.open("r") as fp:
                output_style = sass.compile(
                    string=fp.read(),
                    output_style=sass_output_style(),
                    source_map_embed=True,
                    include_paths=[str(style_file.parent)],
                )

            with output_style_file.open("w+") as fp:
                fp.write(output_style)

            content = content.replace(
                style, "/" + str(output_style_file.relative_to(build()))
            )

        except sass.CompileError as e:
            print(
                f"SASS compiler error:\n{str(e)}\n{str(style_file.relative_to(build()))}\n----------"
            )

        except FileNotFoundError as e:
            print(f"File not found: {e}\n----------")

    soup = BeautifulSoup(content, "html5lib")

    def update_targeting_attr(attrs, attr):
        v = attrs.get(attr)

        if v == path_prefix or v.startswith("http") or v.startswith("#"):
            return

        if v == "/":
            attrs[attr] = path_prefix if path_prefix else v
            return
        else:
            target = find_href_target(target_file, v)

            if isinstance(target, Path):
                target = str(target.relative_to(build()))

            attrs[attr] = (
                path_prefix + "/" + target if path_prefix else "/" + target
            ).rstrip("/")

    def update_external_anchor_tag(tag):
        v = tag.attrs.get("href", "")

        if v.startswith("http://") or v.startswith("https://"):
            tag.attrs["target"] = tag.attrs.get("target", "_blank")

            el_image = soup.new_tag("img", src="/images/external_link.svg")
            el_image["class"] = "external-link-image"
            tag.append(el_image)

    for href_tag in soup.find_all(href=True):
        update_targeting_attr(href_tag, "href")

    for href_tag in soup.find_all("a", href=True):
        update_external_anchor_tag(href_tag)

    for src_tag in soup.find_all(src=True):
        update_targeting_attr(src_tag, "src")

    content = soup.prettify()

    # Emit
    with target_file.open("w+") as fp:
        fp.write(content)


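# Resolve a local href to a file inside the build directory; links may omit
# the .html extension. External (http/https) links are returned unchanged,
# and a missing target aborts the build.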
def find_href_target(from_file: Path, href: str):
    href_split = list(filter(None, href.split("/")))

    if len(href_split) <= 0:
        return href

    if href_split[0] == "https:" or href_split[0] == "http:":
        return href

    href_starts_at_root = href[0] == "/"

    if href_starts_at_root:
        path = Path(build(), *href_split)
    else:
        path = Path(from_file.parent, *href_split)

    path = path.resolve()

    if not path.exists() and not path.name.endswith(".html"):
        path = path.parent / (path.name + ".html")

    assert (
        path.exists()
    ), f"Could not find href target for {href} -> {path} in {from_file}"

    return path


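# A source file named like $variable.html is rendered once per entry in
# jinja_variables[variable]; the current entry is exposed to the template as
# multi_out.variable and substituted into the output file name.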
def process_html_file_multi_out(jenv: Environment, source_file: Path, variable: str):
    filename_template = Template(source_file.name)

    for v in jinja_variables[variable]:
        target_file_name = filename_template.safe_substitute({variable: v})
        target_file = source_file.parent / target_file_name

        jinja_variables["multi_out"] = {"variable": v}
        process_html_file(jenv, source_file, target_file=target_file)

    jinja_variables["multi_out"] = None

    os.remove(source_file)


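# Render all non-partial HTML files: sort articles by date for index pages,
# fan out $variable.html files into one page per value, and delete the SCSS
# sources once their compiled CSS has been written.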
def process_html():
    jinja_variables["articles_list"] = list(
        sorted(
            jinja_variables["articles"].values(),
            key=lambda a: a.get("date") or datetime.now(),
        )
    )

    jinja_variables["tags"] = list(jinja_variables["articles_by_tag"].keys())

    jenv = Environment(
        loader=FileSystemLoader(build()), autoescape=select_autoescape(), cache_size=0
    )

    def html_files():
        return filter(lambda p: not p.name.startswith("_"), build().rglob("*.html"))

    for html_file in html_files():
        if match := re.search(r"\$(\w+)\.html", html_file.name):
            process_html_file_multi_out(jenv, html_file, match.group(1))
        else:
            process_html_file(jenv, html_file)

    for scss_file in build().rglob("*.scss"):
        os.remove(scss_file)


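# Partials (files whose names start with an underscore) exist only to be
# included or extended by other templates, so release builds remove them from
# the output.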
def strip_partials():
    def file_filter(p: Path):
        return p.name.startswith("_") and p.is_file()

    for file in filter(file_filter, build().rglob("*")):
        os.remove(file)


def remove_empty_directories():
    f = True

    def directory_filter(p: Path):
        return p.is_dir() and len(list(p.glob("*"))) <= 0

    # Keep sweeping until no empty directories remain, since removing a
    # directory can leave its parent empty
    while f:
        empty_directories = list(filter(directory_filter, build().rglob("*")))
        f = len(empty_directories) > 0

        for d in empty_directories:
            os.rmdir(d)


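# Record which build outputs still have the hash they were copied with, i.e.
# files the pipeline did not modify; copy() feeds this list to copytree's
# ignore callback on the next run.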
def update_static_files_in_state():
    state = load_state()

    old_build_files = list(map(tuple, state.get("build_files", dict()).items()))
    build_files = list(map(tuple, get_build_files().items()))

    state["static_files"] = dict(filter(lambda x: x in old_build_files, build_files))

    save_state(state)


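# Full pipeline: reset jinja_variables, clean and repopulate the build
# directory, then render Markdown and HTML; dist builds additionally strip
# partials, remove empty directories, and update the static-file state.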
def do_build():
    global jinja_variables

    print("### Building a blog! ###")
    print(f"Environment = {environment()}")
    print("")

    jinja_variables = default_jinja_variables()

    clean()
    copy()
    process_markdown()
    process_html()

    if environment() == Environments.dist:
        strip_partials()
        remove_empty_directories()
        update_static_files_in_state()

    print("Done!")


def main():
    do_build()


if __name__ == "__main__":
    main()