GmCapsule [gsorg-style]

Added "rewrite" module; updated documentation

=> 375360214ce32bb545c0fd8beef676e90d36980a

diff --git a/README.md b/README.md
index 823db45..9f8f074 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ Replace `<YOUR-INSTALL-PATH>` with the actual path of `gmcapsuled`. `pip` will i
 Then you can do the usual:
 
     systemctl --user daemon-reload
-    systemctl --user enable gmcapsule.service
+    systemctl --user enable gmcapsule
     systemctl --user start gmcapsule
 
 The log can be viewed via journalctl (or syslog):
@@ -46,6 +46,12 @@ The log can be viewed via journalctl (or syslog):
 
 ## Change log
 
+### v0.4
+
+* Added built-in module "rewrite" that matches regular expressions against the request path and can rewrite the path or return a custom status for redirection, "Gone" messages, or other exceptional situations.
+* Extension module load order is determined after locating all modules from all directories. Previously, the order was local to each directory.
+* Added a new configuration section `[priority]` for overriding default module priorities determined from file names. This is useful for changing the priority of the built-in modules.
+
 ### v0.3
 
 * Added a shutdown event for custom background workers.
diff --git a/gmcapsule/__init__.py b/gmcapsule/__init__.py
index 3afb139..f93f021 100644
--- a/gmcapsule/__init__.py
+++ b/gmcapsule/__init__.py
@@ -58,8 +58,9 @@ The GmCapsule configuration file is in `INI format
 defined:
 
 - :ref:`server` — server settings
-- :ref:`static` — serving static files
 - :ref:`titan` — Titan upload settings
+- :ref:`static` — serving static files
+- :ref:`rewrite.*` — URL rewriting rules
 - :ref:`cgi` — General CGI settings
 - :ref:`cgi.*` — CGI programs
 - :ref:`gitview` — Git repository viewer settings
@@ -132,6 +133,50 @@ root : path [path...]
     files will be served from `/home/user/gemini/example.com/`.
 
 
+rewrite.*
+---------
+
+Settings for the `rewrite` module that checks regular expressions against
+the request path and can rewrite the path or return a custom status. You can
+use this for internal remapping of directories and files, redirections,
+"Gone" statuses, or other exceptional situations.
+
+Each rewriting rule is a section that begins with ``rewrite.``.
+
+.. code-block:: ini
+
+    [rewrite.rename]
+    path    = ^/old-path/
+    repl    = /new-location/
+
+    [rewrite.elsewhere]
+    path    = ^/(.*)\\.gmi$
+    status  = 31 gemini://mygemlog.space/\\1.gmi
+
+protocol : string
+    Protocol for the rewrite rule. If omitted, the rule applies to both
+    ``gemini`` and ``titan``.
+
+host : string
+    Hostname for the rewrite rule. If omitted, defaults to the first
+    hostname defined in the :ref:`server` section.
+
+path : string
+    Regular expression that is matched against the request path. You may use
+    capture groups and refer to them in the replacement text. Note that the
+    request path always begins with a slash.
+
+repl : string
+    Replacement path. The part of the request path that matches the "path"
+    pattern is replaced with this. You can use backslashes to refer to
+    capture groups (``\\1``).
+
+status : string
+    Custom status to respond with. Must begin with the status code followed
+    by the meta line. You can use backslashes to refer to capture groups
+    (``\\1``).
+
+
 cgi
 ---
 
@@ -440,7 +485,7 @@ from .gemini import Server, Cache
 from .markdown import to_gemtext as markdown_to_gemtext
 
 
-__version__ = '0.3.2'
+__version__ = '0.4.0'
 __all__ = [
     'Config', 'Capsule', 'Cache',
     'get_mime_type', 'markdown_to_gemtext'
@@ -629,24 +674,40 @@ class Capsule:
         self.sv.add_cache(cache)
 
     def load_modules(self):
-        name_pattern = re.compile(r'[0-9][0-9]_(.*)\.py')
+        # The configuration can override default priorities.
+        mod_priority = {}
+        if 'priority' in self.cfg.ini:
+            for name, priority in self.cfg.section('priority').items():
+                mod_priority[name] = int(priority)
+
+        # We will load all recognized modules.
+        name_pattern = re.compile(r'([0-9][0-9])_(.*)\.py')
         dirs = []
         for user_dir in self.cfg.mod_dirs():
             if user_dir not in dirs:
                 dirs.append(user_dir)
         dirs += [Path(__file__).parent.resolve() / 'modules']
+        mods = []
         for mdir in dirs:
             for mod_file in sorted(os.listdir(mdir)):
                 m = name_pattern.match(mod_file)
                 if m:
                     path = (mdir / mod_file).resolve()
-                    name = m.group(1)
+                    name = m.group(2)
                     loader = importlib.machinery.SourceFileLoader(name, str(path))
                     spec = importlib.util.spec_from_loader(name, loader)
                     mod = importlib.util.module_from_spec(spec)
                     loader.exec_module(mod)
-                    print('MODULE:', mod.__doc__)
-                    mod.init(self)
+                    if name in mod_priority:
+                        priority = mod_priority[name]
+                    else:
+                        priority = int(m.group(1))
+                    mods.append((priority, name, mod))
+
+        # Initialize in priority order.
+        for _, _, mod in sorted(mods):
+            print(f'Init:', mod.__doc__)
+            mod.init(self)
 
     def shutdown_event(self):
         """
@@ -657,6 +718,20 @@ class Capsule:
         """
         return self.sv.shutdown_event
 
+    def call_entrypoint(self, request):
+        """
+        Calls the registered entry point for a request.
+
+        Args:
+            request (Request): Request object.
+
+        Returns:
+            Tuple with (response, cache). The response can be binary data, text,
+            tuple with status and meta string, or tuple with status, meta, and body.
+            The cache is None if the data was not read from a cache.
+        """
+        return self.sv.call_entrypoint(request)
+
     def run(self):
         """
         Start worker threads and begin accepting incoming connections. The
diff --git a/gmcapsule/gemini.py b/gmcapsule/gemini.py
index 05794f7..26ded98 100644
--- a/gmcapsule/gemini.py
+++ b/gmcapsule/gemini.py
@@ -16,6 +16,12 @@ import OpenSSL.crypto
 from OpenSSL import SSL, crypto
 
 
+class GeminiError(Exception):
+    def __init__(self, status, msg):
+        Exception.__init__(self, msg)
+        self.status = status
+
+
 class AbortedIOError(Exception):
     def __init__(self, msg):
         Exception.__init__(self, msg)
@@ -271,6 +277,9 @@ class Request:
         self.content_mime = content_mime
         self.content = content
 
+    def url(self):
+        return f'{self.scheme}://{self.hostname}{self.path}{"?" + self.query if self.query else ""}'
+
 
 def verify_callback(connection, cert, err_num, err_depth, ret_code):
     #print("verify_callback:", connection, cert, ret_code)
@@ -445,72 +454,42 @@ class Worker(threading.Thread):
 
         url = urlparse(request)
         path = url.path
-        if url.port != None and url.port != self.server.port:
-            report_error(stream, 59, "Invalid port number")
-            return
         if path == '':
             path = '/'
         hostname = url.hostname
-        entrypoint = self.server.find_entrypoint(url.scheme, hostname, path)
 
-        # Server name indication is required.
+        if url.port != None and url.port != self.server.port:
+            report_error(stream, 59, "Invalid port number")
+            return
         if not stream.get_servername():
+            # Server name indication is required.
             report_error(stream, 59, "Missing TLS server name indication")
             return
-        if stream.get_servername().decode() != url.hostname:
+        if stream.get_servername().decode() != hostname:
             report_error(stream, 53, "Proxy request refused")
             return
 
-        caches = [] if (url.scheme != 'gemini' or identity or len(url.query) > 0) \
-            else self.server.caches
-        from_cache = None
-
-        # print(f'Request : {request}')
-        # print(f'Cert    : {cl_cert}')
-
-        if entrypoint:
-            # Check the caches first.
-            for cache in caches:
-                media, content = cache.try_load(hostname + path)
-                if not media is None:
-                    response = 20, media, content
-                    from_cache = cache
-                    if hasattr(content, '__len__'):
-                        print('%d bytes from cache, %s' % (len(content), media))
-                    else:
-                        print('stream from cache,', media)
-                    break
-
-            # Process the request normally if there is nothing cached.
-            if not from_cache:
-                try:
-                    response = entrypoint(Request(
-                        identity,
-                        remote_address=from_addr,
-                        scheme=url.scheme,
-                        hostname=hostname,
-                        path=path,
-                        query=url.query if '?' in request else None,
-                        content_token=req_token,
-                        content_mime=req_mime,
-                        content=data if len(data) else None
-                    ))
-                except Exception as x:
-                    import traceback
-                    traceback.print_exception(x)
-                    report_error(stream, 40, 'Temporary failure')
-                    return
+        try:
+            request = Request(
+                identity,
+                remote_address=from_addr,
+                scheme=url.scheme,
+                hostname=hostname,
+                path=path,
+                query=url.query if '?' in request else None,
+                content_token=req_token,
+                content_mime=req_mime,
+                content=data if len(data) else None
+            )
+            response, from_cache = self.server.call_entrypoint(request)
 
             # Determine status code, meta line, and body content.
             if type(response) == tuple:
                 if len(response) == 2:
-                    status = response[0]
-                    meta = response[1]
+                    status, meta = response
                     response = ''
                 else:
-                    status = response[0]
-                    meta = response[1]
-                    response = response[2]
+                    status, meta, response = response
             else:
                 status = 20
                 meta = 'text/gemini; charset=utf-8'
@@ -528,7 +507,7 @@ class Worker(threading.Thread):
             # Save to cache.
             if not from_cache and status == 20 and \
                     (type(response_data) == bytes or type(response_data) == bytearray):
-                for cache in caches:
+                for cache in self.server.caches:
                     if cache.save(hostname + path, meta, response_data):
                         break
 
@@ -536,8 +515,9 @@ class Worker(threading.Thread):
             if hasattr(response_data, 'close'):
                 response_data.close()
 
-        else:
-            report_error(stream, 50, 'Permanent failure')
+        except GeminiError as error:
+            report_error(stream, error.status, str(error))
+            return
 
 
 class Server:
@@ -693,7 +673,39 @@ class Server:
                     handler = path_pattern(path)
                     if handler:
                         return handler
-        except:
+        except Exception as x:
+            print(x)
             return None
 
         return None
+
+    def call_entrypoint(self, request):
+        entrypoint = self.find_entrypoint(request.scheme, request.hostname, request.path)
+
+        caches = self.caches if (request.scheme == 'gemini' and
+                                 not request.identity and
+                                 not request.query) else []
+        from_cache = None
+
+        if entrypoint:
+            # Check the caches first.
+            for cache in caches:
+                media, content = cache.try_load(request.hostname + request.path)
+                if not media is None:
+                    response = 20, media, content
+                    if hasattr(content, '__len__'):
+                        print('%d bytes from cache, %s' % (len(content), media))
+                    else:
+                        print('stream from cache,', media)
+                    return response, cache
+
+            # Process the request normally if there is nothing cached.
+            if not from_cache:
+                try:
+                    return entrypoint(request), None
+                except Exception as x:
+                    import traceback
+                    traceback.print_exception(x)
+                    raise GeminiError(40, 'Temporary failure')
+
+        raise GeminiError(50, 'Permanent failure')
diff --git a/gmcapsule/modules/10_rewrite.py b/gmcapsule/modules/10_rewrite.py
new file mode 100644
index 0000000..584e7a9
--- /dev/null
+++ b/gmcapsule/modules/10_rewrite.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2023 Jaakko Keränen 
+# License: BSD-2-Clause
+
+"""Rewriter"""
+
+import re
+
+
+class PathRewriteHandler:
+    def __init__(self, capsule, rewritten_path):
+        self.capsule = capsule
+        self.rewritten_path = rewritten_path
+
+    def __call__(self, req):
+        old_path = req.path
+        req.path = self.rewritten_path
+
+        # Don't allow rewriting the same request too many times.
+        if hasattr(req, 'num_rewrites'):
+            req.num_rewrites += 1
+        else:
+            req.num_rewrites = 1
+        if req.num_rewrites == 100:
+            return 40, "Stuck in rewrite loop: " + req.url()
+
+        print("[rewrite]", old_path, "->", req.path)
+        return self.capsule.call_entrypoint(req)[0]
+
+
+class Responder:
+    def __init__(self, response):
+        self.response = response
+
+    def __call__(self, req):
+        return self.response
+
+
+class Rewriter:
+    def __init__(self, capsule, protocol, host, src_path, dst_path, status):
+        self.capsule = capsule
+        self.protocol = protocol
+        self.host = host
+        self.src_path = src_path
+        self.dst_path = dst_path
+        self.status = status
+
+    def __call__(self, path):
+        # If path matches a rewritten URL, return the handler object that calls the
+        # correct handler for the updated URL.
+        if self.dst_path:
+            new_path = self.src_path.sub(self.dst_path, path)
+            if new_path != path:
+                return PathRewriteHandler(self.capsule, new_path)
+
+        elif self.status:
+            m = self.src_path.match(path)
+            if m:
+                status = self.status
+                for i in range(self.src_path.groups + 1):
+                    cap = m[i]
+                    if cap:
+                        status = status.replace(f'\\{i}', cap)
+                code, meta = status.split()
+                print("[rewrite]", code, meta)
+                return Responder((int(code), meta))
+
+        return None
+
+
+def init(capsule):
+    cfg = capsule.config()
+    for section in cfg.prefixed_sections('rewrite.').values():
+        protocol = section.get('protocol', None)
+        host = section.get('host', cfg.hostnames()[0])
+        src_path = re.compile(section.get('path'))
+        dst_path = section.get('repl', None)
+        status = section.get('status', None)
+        for proto in [protocol] if protocol else ['gemini', 'titan']:
+            capsule.add(Rewriter(capsule, proto, host, src_path, dst_path, status),
+                        None, # `Rewriter` will return a suitable handler callback.
+                        host,
+                        proto)
Proxy Information
Original URL
gemini://git.skyjake.fi/gmcapsule/gsorg-style/cdiff/375360214ce32bb545c0fd8beef676e90d36980a
Status Code
Success (20)
Meta
text/gemini; charset=utf-8
Capsule Response Time
31.83386 milliseconds
Gemini-to-HTML Time
0.776548 milliseconds

This content has been proxied by September (ba2dc).