Merge branch 'preprocess-org'

author: Kévin Le Gouguec <kevin.legouguec@gmail.com> 2020-11-25 19:33:59 +0100
committer: Kévin Le Gouguec <kevin.legouguec@gmail.com> 2020-11-25 19:33:59 +0100
commit: e1a80a5596dddc9582969e0a95fa8c09882085a9 (patch)
tree: cd60ce5f2ea2192a89d205e0f5950edb8d6b6486
parent: e72881f68cc3f2ddfbbd3f51449e0251042473ef (diff)
parent: 5fbd9a9e13332a8867eef3d2f408df24b19a34ef (diff)
download: memory-leaks-e1a80a5596dddc9582969e0a95fa8c09882085a9.tar.xz
5 files changed, 191 insertions, 56 deletions
diff --git a/repo/www/TODO b/repo/www/TODO
index 7eac4fe..1799529 100644
--- a/repo/www/TODO
+++ b/repo/www/TODO
@@ -1,15 +1,13 @@
-- preprocess Org files
-  Org's HTML backend adds a lot of stuff I don't like (intermediate
-  divs, unstable section IDs); I'll use the markdown backend, then
-  feed that to pandoc
-    - change description of custom +LINKs
-    - convert properties
-    - convert tags
+- org preprocessing:
+    - dump properties
+    - fontify TODO keywords
 - compute "leak count" on toplevel index
 - get stylin'
     - pandoc template
     - tufte css? at least sidenotes rather than footnotes
-- use tags somehow (eg to fill in the "keywords" metadata in pandoc template)
+- use tags somehow, eg
+    - fill in the "keywords" metadata in pandoc template
+    - index files/sections by tags
 - add author
 - add creation & last update dates
 - link to history
diff --git a/repo/www/generate-index.py b/repo/www/generate-index.py
index 16d1874..ab5b2a4 100755
--- a/repo/www/generate-index.py
+++ b/repo/www/generate-index.py
@@ -10,7 +10,7 @@ from tempfile import NamedTemporaryFile
 
 from git import Repo
 
-from helpers import deserialize_directories, generate_crumbs, pandoc
+from helpers import deserialize_directories, generate_crumbs, PandocRunner
 
 
 def parse_arguments():
@@ -103,41 +103,40 @@ def main(arguments):
     if arguments.site_title is not None:
         metadata['sitetitle'] = arguments.site_title
 
-    if readme is not None:
-        repo_top = Repo(search_parent_directories=True).working_dir
-        readme_path = Path(repo_top, target, readme)
-
-        # If the README doesn't have a title, give a default to pandoc
-        # out-of-band.
-        if not has_title(readme_path):
-            metadata['pagetitle'] = target or 'README'
+    pandoc = PandocRunner(
+        arguments.output, arguments.template, arguments.filters,
+        stylesheets, variables
+    )
 
-        with NamedTemporaryFile(mode='w+') as toc:
-            toc.write(f'<h1>{toc_title}</h1>\n')
+    if readme is None:
+        with NamedTemporaryFile(suffix='.md') as dummy_readme, \
+             NamedTemporaryFile(mode='w+') as toc:
             toc.write(html_toc)
             toc.flush()
 
-            pandoc(
-                readme_path, arguments.output,
-                arguments.template, arguments.filters, stylesheets,
-                include_after=(toc.name,),
-                variables=variables, metadata=metadata
+            metadata['pagetitle'] = toc_title
+            metadata['title'] = 'Index'
+
+            pandoc.run(
+                dummy_readme.name, include_after=(toc.name,), metadata=metadata
             )
-        return
+            return
 
-    with NamedTemporaryFile(suffix='.md') as dummy_readme, \
-         NamedTemporaryFile(mode='w+') as toc:
+    repo_top = Repo(search_parent_directories=True).working_dir
+    readme_path = Path(repo_top, target, readme)
+
+    # If the README doesn't have a title, give a default to pandoc
+    # out-of-band.
+    if not has_title(readme_path):
+        metadata['pagetitle'] = target or 'README'
+
+    with NamedTemporaryFile(mode='w+') as toc:
+        toc.write(f'<h1>{toc_title}</h1>\n')
         toc.write(html_toc)
         toc.flush()
 
-        metadata['pagetitle'] = toc_title
-        metadata['title'] = 'Index'
-
-        pandoc(
-            dummy_readme.name, arguments.output,
-            arguments.template, arguments.filters, stylesheets,
-            include_after=(toc.name,),
-            variables=variables, metadata=metadata
+        pandoc.run(
+            readme_path, include_after=(toc.name,), metadata=metadata
         )
 
 
diff --git a/repo/www/generate-page.py b/repo/www/generate-page.py
index cb2317b..bbe1288 100755
--- a/repo/www/generate-page.py
+++ b/repo/www/generate-page.py
@@ -6,7 +6,7 @@ from pathlib import Path
 
 from git import Repo
 
-from helpers import generate_crumbs, pandoc
+from helpers import generate_crumbs, PandocRunner
 
 
 def parse_arguments():
@@ -44,13 +44,16 @@ def main(arguments):
 
     page_path = Path(arguments.page).resolve().relative_to(repo_top)
 
-    pandoc(
-        arguments.page,
+    pandoc = PandocRunner(
         arguments.output,
         arguments.template,
         arguments.filters,
         stylesheets,
         variables={'crumbs': generate_crumbs(page_path)},
+    )
+
+    pandoc.run(
+        arguments.page,
         metadata={'pagetitle': arguments.title,
                   'sitetitle': arguments.site_title}
     )
diff --git a/repo/www/helpers.py b/repo/www/helpers.py
index 48ebccf..12d9a41 100644
--- a/repo/www/helpers.py
+++ b/repo/www/helpers.py
@@ -2,8 +2,10 @@ from collections import defaultdict
 from dataclasses import dataclass, field
 from itertools import chain
 from os import environ, path
-from subprocess import run
-from typing import Iterator
+from pathlib import Path
+from subprocess import CalledProcessError, run
+from tempfile import NamedTemporaryFile
+from typing import Dict, Iterator, Union
 
 
 @dataclass
@@ -56,26 +58,76 @@ def deserialize_directories(directories):
     }
 
 
-def pandoc(page, output, template, filters, stylesheets, include_after=(),
-           variables=None, metadata=None):
-    cmd = (
-        'pandoc', '-s', page, '-o', output, '--template', template,
-        *chain(*(('--lua-filter', f) for f in filters)),
-        *chain(*(('--css', s) for s in stylesheets)),
-        *chain(*(('--include-after-body', f) for f in include_after))
-    )
+class _NullPreprocessor:
+    """Context manager that hands the source file over untouched.
+
+    On entry, ``output`` is set to the path given at construction time.
+    Nothing is created along the way, so exit has nothing to clean up.
+    """
+
+    def __init__(self, source_path):
+        self._source_path = source_path
+
+    def __enter__(self):
+        # No preprocessing: the "output" is the source file itself.
+        self.output = self._source_path
+        return self
+
+    def __exit__(self, *args):
+        pass
+
+class _OrgPreprocessor:
+    def __init__(self, source_path):
+        self._source_path = source_path
+
+    def __enter__(self):
+        self._output = NamedTemporaryFile(mode='w+', suffix='.org')
+        try:
+            run((
+                'emacs', '-Q', '--batch', '--load', 'preprocess-org.el',
+                '--eval', f'(preprocess-org "{self._source_path}")'
+            ), check=True, stdout=self._output)
+        except CalledProcessError:
+            self._output.close()
+            raise
+
+        self.output = self._output.name
+        return self
+
+    def __exit__(self, *args):
+        self._output.close()
+
+# Maps a file extension (without its leading dot) to the preprocessor
+# class to use; extensions not listed here fall back to _NullPreprocessor.
+_PREPROCESSORS = defaultdict(lambda: _NullPreprocessor,
+                             (('org', _OrgPreprocessor),))
+
+
+# Types accepted wherever PandocRunner expects a filesystem path.
+_PathArg = Union[Path, str, bytes]
+
+@dataclass
+class PandocRunner:
+    """Runs pandoc with a set of options shared across invocations.
+
+    The fields hold the options that are fixed at construction time;
+    the page to convert and its per-page options are given to run().
+    """
+
+    # Given to pandoc's -o option.
+    output: _PathArg
+    # Given to pandoc's --template option.
+    template: _PathArg
+    # One --lua-filter option is emitted per entry.
+    # NOTE(review): annotated as Iterator, yet run() walks it on every
+    # call; a one-shot iterator would be exhausted after the first run.
+    # Presumably callers pass lists -- confirm.  Same for stylesheets.
+    filters: Iterator[_PathArg]
+    # One --css option is emitted per entry.
+    stylesheets: Iterator[_PathArg]
+    # One "-V key=value" option is emitted per item.
+    variables: Dict[str, str] = field(default_factory=dict)
+
+    def run(self, page, include_after=(), metadata=None):
+        """Convert page with pandoc, preprocessing it first if needed.
+
+        page: path of the source file; its extension selects the
+            preprocessor looked up in _PREPROCESSORS.
+        include_after: files given to pandoc via --include-after-body.
+        metadata: optional mapping emitted as "-M key=value" options.
+
+        Raises CalledProcessError if pandoc exits with a non-zero status;
+        exceptions raised by the preprocessor propagate as well.
+        """
+        cmd = (
+            'pandoc', '-s', '-o', self.output, '--template', self.template,
+            *chain(*(('--lua-filter', f) for f in self.filters)),
+            *chain(*(('--css', s) for s in self.stylesheets)),
+            *chain(*(('--include-after-body', f) for f in include_after))
+        )
 
-    if variables is not None:
-        cmd += tuple(chain(
-            *(('-V', f'{k}={v}') for k, v in variables.items())
-        ))
-    if metadata is not None:
         cmd += tuple(chain(
-            *(('-M', f'{k}={v}') for k, v in metadata.items())
+            *(('-V', f'{k}={v}') for k, v in self.variables.items())
         ))
+        if metadata is not None:
+            cmd += tuple(chain(
+                *(('-M', f'{k}={v}') for k, v in metadata.items())
+            ))
+
+        # This changes the environment of the current process; pandoc
+        # inherits it.  Presumably this lets the Lua filters load modules
+        # from .cache/ -- confirm against the filters themselves.
+        environ['LUA_PATH'] = '.cache/?.lua;;'
+
+        # Drop the leading dot from the extension before the lookup.
+        _, ext = path.splitext(page)
+        preprocessor = _PREPROCESSORS[ext[1:]]
 
-    environ['LUA_PATH'] = '.cache/?.lua;;'
-    run(cmd, check=True)
+        # The input file goes last on the command line; pandoc must run
+        # inside the "with" block, while the preprocessed output exists.
+        with preprocessor(page) as preproc:
+            cmd = cmd + (preproc.output,)
+            run(cmd, check=True)
 
 
 def generate_crumbs(target):
diff --git a/repo/www/preprocess-org.el b/repo/www/preprocess-org.el
new file mode 100644
index 0000000..fe63962
--- /dev/null
+++ b/repo/www/preprocess-org.el
@@ -0,0 +1,83 @@
+;; -*- lexical-binding: t -*-
+
+;; How I Convert Org Files To HTML.
+;; ================================
+;;
+;; Or: Why We Can't Have Nice Things: Exhibit #42.
+;;     -------------------------------------------
+;;
+;; Or: I Got Way Too Much Time On My Hands, Apparently.
+;;     ------------------------------------------------
+;;
+;; I see two straightforward ways to export Org files to HTML:
+;;
+;; 1. ox-html.el, Org's HTML backend: even with all the settings and
+;;    filters available, there are still a few things that annoy me:
+;;    lots of extra <div>s, unstable section IDs…
+;;
+;;    Also, I want to squeeze pandoc somewhere in the pipeline, to run
+;;    my Lua filters.
+;;
+;; 2. pandoc: does not cover all of Org's features.  Org is so crammed
+;;    with constructs that don't exist in other markup formats
+;;    (agendas, logbooks, spreadsheets, properties…) and so many knobs
+;;    can be tweaked on a per-file basis (link abbreviations, tags,
+;;    TODO cycles) that Elisp remains the least painful way to process
+;;    these files, IMO.
+;;
+;; A less-straightforward, but still reasonably simple way to go would
+;; be to use Org's markdown backend, then run pandoc on the result.
+;; Unfortunately, AFAICT ox-md.el does not implement definition lists,
+;; nor syntax-highlighting in fenced code blocks.
+;;
+;; So here's where I'm at: using Elisp, I'll preprocess Org files to
+;; add a bunch of #+OPTIONS pandoc recognizes, "dumb down" the stuff
+;; pandoc does not recognize, format some other stuff arbitrarily,
+;; *then* I'll run pandoc on the result.
+
+(defun pp-org/list-tags ()
+  ;; After every heading for which `org-get-tags' returns tags, insert
+  ;; a "tags" block (#+begin_tags ... #+end_tags) listing them, one
+  ;; list item per tag.
+  (goto-char (point-min))
+  (while (re-search-forward org-heading-regexp nil t)
+    ;; Insert without disturbing the search: point and match data are
+    ;; put back to what they were when the heading was found.
+    (save-excursion
+      (save-match-data
+        (when-let ((tags (org-get-tags (point))))
+          (insert "\n#+begin_tags\n")
+          (dolist (tag tags)
+            (insert "- " tag "\n"))
+          (insert "#+end_tags\n"))))))
+
+(defun pp-org/expand-links ()
+  ;; Expand #+LINK abbreviations, since pandoc does not grok them.
+  ;; Also, use the abbreviation as default description for links that
+  ;; lack one.
+  ;;
+  ;; The buffer is scanned from the top once per entry of
+  ;; `org-link-abbrev-alist-local'.
+  (pcase-dolist (`(,key . ,expansion) org-link-abbrev-alist-local)
+    (goto-char (point-min))
+    ;; Groups matched by LINK-RE, for a link [[KEY:TAG][DESCRIPTION]]:
+    ;;   1: "KEY:TAG"   2: "TAG"
+    ;;   3: "[DESCRIPTION]" (optional)   4: "DESCRIPTION"
+    (let ((link-re (rx "[[" (group (literal key) ":"
+                                   (group (+ (not "]"))))
+                       "]" (? (group "["
+                                     (group (+ (not "]")))
+                                     "]"))
+                       "]"))
+          ;; An expansion containing "%s" is used as a format string;
+          ;; otherwise the tag is appended to it.
+          (expand-link (if (string-match-p "%s" expansion)
+                           (lambda (tag) (format expansion tag))
+                         (lambda (tag) (concat expansion tag)))))
+      (while (re-search-forward link-re nil t)
+        ;; Read everything needed from the match data before
+        ;; `replace-match' modifies the buffer.
+        (let ((link-beg (match-beginning 0))
+              (link-abbrev (match-string 1))
+              (link-tag (match-string 2))
+              (description (match-string 4)))
+          ;; Only group 1 ("KEY:TAG") is replaced, literally and with
+          ;; case preserved.
+          (replace-match (funcall expand-link link-tag) t t nil 1)
+          (unless description
+            (save-excursion
+              ;; Start from the inner "[", skip over the expanded
+              ;; target, then add "[KEY:TAG]" as the description.
+              ;; NOTE(review): relies on `forward-sexp' treating
+              ;; "[...]" as a balanced expression here -- confirm.
+              (goto-char (1+ link-beg))
+              (forward-sexp)
+              (insert (format "[%s]" link-abbrev)))))))))
+
+(defun preprocess-org (input)
+  ;; Print a preprocessed copy of the Org file INPUT: an #+OPTIONS
+  ;; line is prepended, tags are listed under their headings, and
+  ;; link abbreviations are expanded.  INPUT itself is not modified;
+  ;; all the work happens in a temporary buffer.
+  (with-temp-buffer
+    (insert "#+OPTIONS: ^:{} tags:nil H:6\n")
+    (insert-file-contents input)
+    ;; Turn on Org mode before calling the helpers below, which use
+    ;; Org variables and functions (`org-heading-regexp',
+    ;; `org-get-tags', `org-link-abbrev-alist-local').
+    (org-mode)
+    (pp-org/list-tags)
+    (pp-org/expand-links)
+    (princ (buffer-string))))
author	Kévin Le Gouguec <kevin.legouguec@gmail.com>	2020-11-25 19:33:59 +0100
committer	Kévin Le Gouguec <kevin.legouguec@gmail.com>	2020-11-25 19:33:59 +0100
commit	e1a80a5596dddc9582969e0a95fa8c09882085a9 (patch)
tree	cd60ce5f2ea2192a89d205e0f5950edb8d6b6486
parent	e72881f68cc3f2ddfbbd3f51449e0251042473ef (diff)
parent	5fbd9a9e13332a8867eef3d2f408df24b19a34ef (diff)
download	memory-leaks-e1a80a5596dddc9582969e0a95fa8c09882085a9.tar.xz