ghwikipp: offline: Make link generation more correct...and much, much more robust.

From 4453187cae4eaf713cf3f1e4412367c1fcda8843 Mon Sep 17 00:00:00 2001
From: "Ryan C. Gordon" <[EMAIL REDACTED]>
Date: Wed, 7 May 2025 01:37:55 -0400
Subject: [PATCH] offline: Make link generation more correct...and much, much
 more robust.

Fixes https://github.com/libsdl-org/sdlwiki/issues/760
---
 make_offline_archive.php  |  19 ++++--
 pandoc-filter-offline.lua | 133 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 140 insertions(+), 12 deletions(-)

diff --git a/make_offline_archive.php b/make_offline_archive.php
index 9188d72..4deebff 100755
--- a/make_offline_archive.php
+++ b/make_offline_archive.php
@@ -17,11 +17,11 @@
 $max_children = 4;
 $num_children = 0;
 
-function cook_tree_for_offline_html($srcdir, $dstdir)
+function cook_tree_for_offline_html($srcdir, $dstdir, $input_dir)
 {
-    global $supported_formats, $max_children, $num_children;
+    global $supported_formats, $max_children, $num_children, $base_url, $raw_data;
 
-    //print("cooktree: '$srcdir' -> '$dstdir'\n");
+    //print("cooktree: '$srcdir' ('$input_dir') -> '$dstdir'\n");
     $dirp = opendir($srcdir);
     if ($dirp === false) {
         return;
@@ -32,9 +32,10 @@ function cook_tree_for_offline_html($srcdir, $dstdir)
         //print("cookdent: '$dent'\n");
         $src = "$srcdir/$dent";
         $dst = "$dstdir/$dent";
+        $input_path = ($input_dir == NULL) ? $dent : "$input_dir/$dent";
         if (is_dir($src)) {
             mkdir($dst);
-            cook_tree_for_offline_html($src, $dst);
+            cook_tree_for_offline_html($src, $dst, $input_path);
         } else {
             $ext = strrchr($dent, '.');
             if ($ext !== false) {
@@ -46,6 +47,12 @@ function cook_tree_for_offline_html($srcdir, $dstdir)
                     $page = preg_replace('/^.*\//', '', $page);
                     $dst = preg_replace('/\..*?$/', '.html', $dst);
 
+                    $env = array(
+                        'GHWIKIPP_INPUT_PATH' => $input_path,
+                        'GHWIKIPP_BASE_URL' => $base_url,
+                        'GHWIKIPP_RAW_DIR' => $raw_data
+                    );
+
                     // split this across CPU cores.
                     while ($num_children >= $max_children) {
                         pcntl_waitpid(-1, $status);  // wait for any child to end.
@@ -56,7 +63,7 @@ function cook_tree_for_offline_html($srcdir, $dstdir)
                     if ($pid == -1) {
                         print("FAILED TO FORK!\n");
                     } else if ($pid == 0) {  // child process.
-                        pcntl_exec('/usr/bin/pandoc', [ '--metadata', "pagetitle=$page", '--embed-resources', '--standalone', '-f', $from_format, '-t', 'html', '--css=static_files/ghwikipp.css', '--css=static_files/pandoc.css', '--lua-filter=./pandoc-filter-offline.lua', '-o', $dst, $src ]);
+                        pcntl_exec('/usr/bin/pandoc', [ '--metadata', "pagetitle=$page", '--embed-resources', '--standalone', '-f', $from_format, '-t', 'html', '--css=static_files/ghwikipp.css', '--css=static_files/pandoc.css', '--lua-filter=./pandoc-filter-offline.lua', '-o', $dst, $src ], $env);
                         exit(1);
                     } else {  // parent process.
                         $num_children++;
@@ -118,7 +125,7 @@ function cook_tree_for_offline_html($srcdir, $dstdir)
 unset($git_repo_lock_fp);
 
 // okay, now we can operate on this data.
-cook_tree_for_offline_html($tmprawdir, $tmpoutdir);
+cook_tree_for_offline_html($tmprawdir, $tmpoutdir, NULL);
 while ($num_children > 0) {
     pcntl_waitpid(-1, $status);  // wait for any child to end.
     $num_children--;
diff --git a/pandoc-filter-offline.lua b/pandoc-filter-offline.lua
index ca4e29a..a96623d 100644
--- a/pandoc-filter-offline.lua
+++ b/pandoc-filter-offline.lua
@@ -1,10 +1,131 @@
+local base_url = os.getenv("GHWIKIPP_BASE_URL")
+local input_path = os.getenv("GHWIKIPP_INPUT_PATH")
+local rawdir = os.getenv("GHWIKIPP_RAW_DIR")
+
+-- !!! FIXME: this is super-gross and unix-specific.
+function is_dir(path)
+    local p = io.popen("[ -d '" .. path .. "' ] && echo 'yes' || echo 'no'")
+    local result = p:read('*l')
+    p:close()
+    --print("is_dir('" .. path .. "') => " .. result);
+    return result == 'yes'
+end
+
+local function split_path(str, sep)
+    local result = {}
+    if str ~= nil then
+        for part in string.gmatch(str, '([^' .. sep .. ']+)') do
+          table.insert(result, part)
+        end
+    end
+    local fname = result[#result]
+    if fname == nil then
+        fname = ''
+    end
+    table.remove(result)  -- drop the filename from the array
+    local nofname = ''
+    local sep = ''
+    for i, v in ipairs(result) do
+        nofname = nofname .. sep .. v
+        sep = '/'
+    end
+    return result, fname, nofname
+end
+
+local input_path_array, input_path_filename, input_path_nofilename = split_path(input_path, '/')
+
 function Link (link)
-  -- drop any Markdown or MediaWiki file extensions that might have snuck in.
-  link.target = string.gsub(link.target, '%.md$', '')
-  link.target = string.gsub(link.target, '%.mediawiki$', '')
+    local url = link.target
+    local pos, endpos
+
+    -- if there's an anchor (the '#blahblahblah' at the end of a URL),
+    -- split it out to a separate var and remove it from the url.
+    local anchor = ''
+    pos, endpos = string.find(url, '#.*$')
+    if pos ~= nil then
+        anchor = string.sub(url, pos)
+        url = string.sub(url, 1, pos - 1)
+    end
+
+    -- http into https on base urls.
+    local https_url = string.gsub(url, '^http:', 'https:', 1);
+
+    -- if this a full URL to somewhere else in the wiki, chop off the base so we can use it as a path on the filesystem.
+    if https_url == base_url then
+        url = '/FrontPage'
+    else
+        pos, endpos = string.find(https_url, base_url, 1, false)
+        if pos == 1 then
+            url = string.sub(https_url, endpos + 1)
+        end
+    end
+
+    if url == '/' then
+        url = '/FrontPage'
+    end
+
+    -- Most links are relative in the wiki, but occasionally we'll do "/TopLevelDirectory/Whatever" when we have to
+    --  access a parent directory, because we can't do ".." paths on the wiki, but these don't fly
+    --  when looking at static HTML files on the user's local disk.
+    -- If this is an absolute path into the wiki instead of relative, convert it to a relative page.
+    -- Note that full URLs (https://wiki.example.com/) will land in here, too, as we intentionally chop off the base_url to leave the initial '/'.
+    if string.sub(url, 1, 1) == '/' then  -- if first char is '/', it's an absolute path.
+        local split_path_array, split_path_filename = split_path(url, '/')
+        local dotdot = false
+        local sep = ''
+        local final = ''
+        for i, v in ipairs(input_path_array) do
+            if not dotdot then
+                if split_path_array[1] == v then
+                    table.remove(split_path_array, 1)
+                else
+                    dotdot = true
+                end
+            end
+            if dotdot then
+                final = final .. sep .. '..'
+                sep = '/'
+            end
+        end
+
+        for i, v in ipairs(split_path_array) do
+            final = final .. sep .. v
+            sep = '/'
+        end
+
+        url = final .. sep .. split_path_filename
+    end
+
+    if string.find(url, '^.*://') ~= 1 then  -- if this is an external URL, don't mangle it.
+        -- Is this a link to a directory or a file?
+        local rawpath = rawdir .. '/' .. input_path_nofilename .. '/' .. url
+        if (url ~= '') and is_dir(rawpath) then
+            --print("rawpath: '" .. rawpath .. "'")
+            -- it's a directory! Either link directly to a FrontPage.html, if one will exist, or just dump the user in the directory itself if not.
+            local f = io.open(rawpath .. '/FrontPage.md', 'r')
+            if not f then f = io.open(rawpath .. '/FrontPage.mediawiki', 'r') end
+            if f then
+                f:close()
+                url = url .. '/FrontPage.html';
+            end
+        else
+            -- drop any Markdown or MediaWiki file extensions that might have snuck in.
+            url = string.gsub(url, '%.md$', '')
+            url = string.gsub(url, '%.mediawiki$', '')
+
+            -- Add HTML file extension.
+            if url ~= '' then  -- could be '' if there's nothing but an anchor on the current page.
+                url = url .. '.html'
+            end
+        end
+    end
+
+    -- Add anchor back in
+    url = url .. anchor
+
+    --print(link.target .. ' -> ' .. url)
 
-  -- Add HTML file extension.
-  link.target = link.target .. '.html'
-  return link
+    link.target = url
+    return link
 end