ghwikipp: build_categories: Check for bogus category names.

From 7e429a13d385e448dd2efaf014e5aaddb6eff66b Mon Sep 17 00:00:00 2001
From: "Ryan C. Gordon" <[EMAIL REDACTED]>
Date: Tue, 28 Feb 2023 13:36:34 -0500
Subject: [PATCH] build_categories: Check for bogus category names.

---
 build_categories.php | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/build_categories.php b/build_categories.php
index 034bc51..41abcdb 100755
--- a/build_categories.php
+++ b/build_categories.php
@@ -64,20 +64,34 @@ function build_category_lists($srcdir)
 
             while (($line = fgets($fp)) !== false) {
                 // The categories come right after a '----' line.
+
+                // Strictly speaking, it's legal in Markdown to do:
+                //
+                // My sub-header
+                // -------------
+                //
+                // (which is equivalent to "## My sub-header" on a single line.)
+                // ...so this can catch nonsense if the sub-header is four chars long,
+                // but we attempt to mitigate.
                 if (trim($line) == '----') {
                     if (($line = fgets($fp)) !== false) {
                         $cats = explode(',', trim($line));
                         foreach ($cats as $c) {
+                            $c = trim($c);
+                            $count = 0;
                             if ($from_format == "mediawiki") {
-                                $c = preg_replace('/^\[\[(.*?)\]\]$/', '$1', trim($c));
+                                $c = preg_replace('/^\[\[(.*?)\]\]$/', '$1', $c, 1, $count);
                             } else if ($from_format == "markdown_github") {
-                                $c = preg_replace('/^\[(.*?)\]\(.*?\)$/', '$1', trim($c));
+                                $c = preg_replace('/^\[(.*?)\]\(.*?\)$/', '$1', $c, 1, $count);
                             }
-                            if (!isset($categories[$c])) {
-                                $categories[$c] = array();
+                            // currently we have pages that don't have these wikilinked, so don't check $count==1 here for now.
+                            if (/*($count == 1) &&*/ ($c != "")) {
+                                if (!isset($categories[$c])) {
+                                    $categories[$c] = array();
+                                }
+                                //print("Adding '$page' to '$c'\n");
+                                $categories[$c][$page] = true;
                             }
-                            //print("Adding '$page' to '$c'\n");
-                            $categories[$c][$page] = true;
                         }
                     }
                 }
@@ -145,7 +159,7 @@ function handle_subdir($dname)
     build_category_lists($dname);
 
     foreach ($categories as $cat => $pages) {
-        //print("CATEGORY '$cat':\n");
+        //print("DIR '$dname' CATEGORY '$cat':\n");
         //print_r($pages);
 
         // keep in MediaWiki format if it exists, start new pages in Markdown.