{- | The shared serve-action vocabulary of the front door, and the agnostic
default router.

A 'Route' is one classified request -- everything the proxy is willing to serve,
named independently of any ecosystem's URL grammar. The /actions/ are common
across registries (fetch a packument, stream a tarball, publish a first-party
package, answer a liveness probe, deny a search); only the
__(method, URL)→action__ mapping is ecosystem-specific. That mapping is a
'Classifier', injected at the composition root, so this module stays free of any
one ecosystem's path conventions while the dispatcher routes through whatever
classifier its mount carries.

The classifier is __method-aware__ because the same path can name different
actions by HTTP method: @GET \/{pkg}@ reads a packument, @PUT \/{pkg}@ publishes
one. A read and a write are genuinely distinct serve actions (not a rendering
variation the way a @HEAD@ is a bodiless @GET@), so the method is part of what the
classifier maps, and a write earns its own 'Route' rather than being inferred at
dispatch.

The model is __deny by default__, mirroring the rules engine ("Ecluse.Core.Rules"):
the agnostic default 'denyAll' classifies every path as 'Unsupported' (a @404@ at
the edge), so a deployment that wires no ecosystem router serves nothing rather
than guessing. An ecosystem adapter supplies a 'Classifier' that recognises its
own paths and falls back to 'Unsupported' for the rest.

'Route' is a small sum so the whole routing table is unit-testable with __no
server__: feed a 'Classifier' some segments, assert the 'Route'.
-}
module Ecluse.Core.Server.Route (
    -- * Routes
    Route (..),
    Filename (..),

    -- * Classification
    Classifier,
    denyAll,

    -- * Component safety
    isSafeComponent,
    encodeComponent,
) where

import Data.ByteString qualified as BS
import Data.ByteString.Internal (w2c)
import Data.Char (intToDigit, isControl, toUpper)
import Data.Text qualified as T
import Network.HTTP.Types.Method (Method)

import Ecluse.Core.Package (PackageName)
import Ecluse.Core.Version (Version)

{- | A classified request. Everything the front door is willing to serve is one
of these; an unrecognised path is 'Unsupported' (deny by default).

The constructors are the proxy's /actions/, shared across ecosystems -- the
artifact a 'Tarball' streams and the metadata a 'Packument' merges are the same
serve behaviour whether the upstream is npm, PyPI, or another registry. Only the
mapping from a request path to one of these (a 'Classifier') is
ecosystem-specific.
-}
data Route
    = -- | A package-metadata request -- the /packument/.
      Packument PackageName
    | {- | An artifact request, as a __parsed coordinate__: the package, the
      'Version' the classifier read out of the artifact name, and the 'Filename'
      itself. The 'Version' is the coordinate the rules gate on; the 'Filename' is
      the artifact's on-the-wire name, __preserved verbatim__ -- it, not a name
      rebuilt from @(package, version)@, is authoritative for fetching the bytes.
      -}
      Tarball PackageName Version Filename
    | {- | A first-party __publish__ request -- @PUT \/{pkg}@. The one client-driven
      /write/ action: the publisher's own publish document (the version manifest plus
      the base64 tarball) is relayed to the configured /publication target/ after the
      anti-shadowing scope guard, with the publisher's own forwarded credential (see
      @docs\/architecture\/registry-model.md@ → "Publishing first-party packages"). The
      'PackageName' is the route's authoritative identity -- the scope guard and the
      upstream write path both key on it, never on the document's self-reported name.
      The version lives inside the relayed document, so the route carries none.
      -}
      Publish PackageName
    | -- | A registry liveness probe, answered locally.
      Ping
    | -- | Package search (unsupported).
      Search
    | {- | Anything unrecognised. Renders as a @404@ -- deny by default at the
      routing layer.
      -}
      Unsupported
    deriving stock (Route -> Route -> Bool
(Route -> Route -> Bool) -> (Route -> Route -> Bool) -> Eq Route
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Route -> Route -> Bool
== :: Route -> Route -> Bool
$c/= :: Route -> Route -> Bool
/= :: Route -> Route -> Bool
Eq, Int -> Route -> ShowS
[Route] -> ShowS
Route -> String
(Int -> Route -> ShowS)
-> (Route -> String) -> ([Route] -> ShowS) -> Show Route
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Route -> ShowS
showsPrec :: Int -> Route -> ShowS
$cshow :: Route -> String
show :: Route -> String
$cshowList :: [Route] -> ShowS
showList :: [Route] -> ShowS
Show)

{- | An artifact's on-the-wire file name, the agnostic artifact-name type a
'Tarball' route carries.

It is held as a distinct type, not a bare 'Text', because it is __authoritative
for fetching the bytes__: the proxy fetches an artifact at the upstream path built
from this exact name, never one reconstructed from @(package, version)@, so that a
registry whose artifact naming differs from the proxy's own convention still
resolves. The name is preserved verbatim as received; the classifier that produces
it has already applied the component-safety gate ('isSafeComponent'), so the value
is safe to interpolate into a downstream URL.
-}
newtype Filename = Filename Text
    deriving stock (Filename -> Filename -> Bool
(Filename -> Filename -> Bool)
-> (Filename -> Filename -> Bool) -> Eq Filename
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Filename -> Filename -> Bool
== :: Filename -> Filename -> Bool
$c/= :: Filename -> Filename -> Bool
/= :: Filename -> Filename -> Bool
Eq, Int -> Filename -> ShowS
[Filename] -> ShowS
Filename -> String
(Int -> Filename -> ShowS)
-> (Filename -> String) -> ([Filename] -> ShowS) -> Show Filename
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Filename -> ShowS
showsPrec :: Int -> Filename -> ShowS
$cshow :: Filename -> String
show :: Filename -> String
$cshowList :: [Filename] -> ShowS
showList :: [Filename] -> ShowS
Show)

{- | The mapping from an ecosystem-native request to a 'Route'.

A classifier sees the request's HTTP 'Method' and the already-mount-stripped,
percent-decoded path segments and returns the serve action. The method is part of
the mapping because the same path names different actions by method (@GET \/{pkg}@
reads, @PUT \/{pkg}@ publishes); a @HEAD@, by contrast, classifies like its @GET@
(it is a bodiless variation the dispatcher handles, not a distinct action). Each
ecosystem adapter contributes its own classifier -- recognising its
(method, path) grammar and denying everything else -- so the agnostic dispatcher
stays closed while every mount routes through its ecosystem's template. Dispatch
chooses the classifier per matched mount (see "Ecluse.Server"), so the same shape
carries either a single ecosystem or a mount-keyed selection.
-}
type Classifier = Method -> [Text] -> Route

{- | The agnostic default classifier: every request is 'Unsupported'.

This is the deny-by-default base a deployment runs with until a composition root
wires an ecosystem's classifier in, so an unwired server serves nothing rather
than guessing a grammar. It deliberately knows no path conventions of its own.
-}
denyAll :: Classifier
denyAll :: Classifier
denyAll ByteString
_method [Text]
_segments = Route
Unsupported

{- | Whether a single decoded path component is __safe to interpolate__ into a
downstream upstream URL -- the deny-by-default gate a classifier applies to every
component it accepts (a scope, base name, or tarball filename).

The path is percent-decoded before it reaches us, so a single segment can carry a
@\'\/\'@, a @\'\\\\\'@, a control character, or be @"."@\/@".."@; any of these
enables path traversal or request smuggling once the name reaches the upstream
URL. A component is UNSAFE iff it is empty, is exactly @"."@ or @".."@, or
contains a @\'\/\'@, a @\'\\\\\'@, or any 'isControl' character. Everything else
is accepted: this is a security boundary, __not__ an ecosystem-policy validator,
so ordinary names with interior dots (@lodash.merge@, @is.odd@), hyphens,
underscores, digits, or uppercase all pass.

It lives in the agnostic layer because the threat -- interpolating a hostile
segment into an upstream URL -- is ecosystem-independent; both an ecosystem's path
classifier and the defence-in-depth check in "Ecluse.Core.Security" share this one
rule.

This gate is __structural__: it stops a component that would change the upstream
URL's /shape/ (a traversal, an embedded separator, a control character). It does
__not__ stop a component that carries other URL-reserved bytes -- a @\'%\'@,
@\'?\'@, @\'#\'@, @\'\;\'@, or a space -- which an accepted name can still hold
(notably a once-decoded segment carrying a literal @%2e%2e%2f@). Those are
neutralised not by widening this denylist but by percent-encoding every accepted
component with 'encodeComponent' when the upstream URL is built, so the safety of
an interpolated component rests on encode-on-build, not on this gate alone.
-}
isSafeComponent :: Text -> Bool
isSafeComponent :: Text -> Bool
isSafeComponent Text
c =
    Bool -> Bool
not (Text -> Bool
T.null Text
c)
        Bool -> Bool -> Bool
&& Text
c Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
/= Text
"."
        Bool -> Bool -> Bool
&& Text
c Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
/= Text
".."
        Bool -> Bool -> Bool
&& (Char -> Bool) -> Text -> Bool
T.all Char -> Bool
safeChar Text
c
  where
    safeChar :: Char -> Bool
safeChar Char
ch = Char
ch Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'/' Bool -> Bool -> Bool
&& Char
ch Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\\' Bool -> Bool -> Bool
&& Bool -> Bool
not (Char -> Bool
isControl Char
ch)

{- | Percent-encode a single decoded path component for __safe interpolation__
into an upstream URL -- the encode-on-build partner of 'isSafeComponent'.

A component is the content between a URL's structural delimiters (a scope, base
name, or filename), never the delimiters themselves, so this encodes
conservatively: it keeps only the RFC 3986 __unreserved__ set
(@A-Z@, @a-z@, @0-9@, and @\'-\'@, @\'.\'@, @\'_\'@, @\'~\'@) verbatim and
percent-encodes __every other byte__ of the component's UTF-8 encoding as
@%XX@ (upper-case hex). A caller composing a path therefore writes the structural
@\'\/\'@, scope @%2F@, @\'\@\'@ sigil, and the like itself, around encoded
components -- so a @\'%\'@, @\'\/\'@, @\'?\'@, @\'#\'@, @\'\;\'@, space, or control
byte inside a component cannot alter the URL's shape, inject a query or fragment,
or -- the once-decoded @%2e%2e%2f@ case -- survive as a live escape a
decode-and-normalise upstream could resolve to traversal.

Encoding is per-byte over the UTF-8 form, so a multi-byte character is encoded one
@%XX@ per byte (@\'é\'@ → @%C3%A9@). It does __not__ encode an already-percent-encoded
escape idempotently -- a literal @\'%\'@ is always re-encoded to @%25@ -- which is the
point: the component is decoded content, so any @\'%\'@ in it is a literal to be
escaped, not a structural escape to preserve.
-}
encodeComponent :: Text -> Text
encodeComponent :: Text -> Text
encodeComponent = [Text] -> Text
T.concat ([Text] -> Text) -> (Text -> [Text]) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8 -> Text) -> [Word8] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map Word8 -> Text
encodeByte ([Word8] -> [Text]) -> (Text -> [Word8]) -> Text -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [Word8]
BS.unpack (ByteString -> [Word8]) -> (Text -> ByteString) -> Text -> [Word8]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> ByteString
forall a b. ConvertUtf8 a b => a -> b
encodeUtf8
  where
    encodeByte :: Word8 -> Text
    encodeByte :: Word8 -> Text
encodeByte Word8
b
        | Word8 -> Bool
isUnreserved Word8
b = Char -> Text
T.singleton (Word8 -> Char
chr8 Word8
b)
        | Bool
otherwise = String -> Text
T.pack [Char
'%', Word8 -> Char
hexDigit (Word8
b Word8 -> Word8 -> Word8
forall a. Integral a => a -> a -> a
`div` Word8
16), Word8 -> Char
hexDigit (Word8
b Word8 -> Word8 -> Word8
forall a. Integral a => a -> a -> a
`mod` Word8
16)]

    -- RFC 3986 §2.3 unreserved: ALPHA / DIGIT / "-" / "." / "_" / "~".
    isUnreserved :: Word8 -> Bool
    isUnreserved :: Word8 -> Bool
isUnreserved Word8
b =
        (Word8
b Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word8
0x41 Bool -> Bool -> Bool
&& Word8
b Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0x5A) -- A-Z
            Bool -> Bool -> Bool
|| (Word8
b Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word8
0x61 Bool -> Bool -> Bool
&& Word8
b Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0x7A) -- a-z
            Bool -> Bool -> Bool
|| (Word8
b Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word8
0x30 Bool -> Bool -> Bool
&& Word8
b Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0x39) -- 0-9
            Bool -> Bool -> Bool
|| Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0x2D -- '-'
            Bool -> Bool -> Bool
|| Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0x2E -- '.'
            Bool -> Bool -> Bool
|| Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0x5F -- '_'
            Bool -> Bool -> Bool
|| Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0x7E -- '~'

    -- An unreserved byte is ASCII, so its 'Char' is its code point.
    chr8 :: Word8 -> Char
    chr8 :: Word8 -> Char
chr8 = Word8 -> Char
w2c

    hexDigit :: Word8 -> Char
    hexDigit :: Word8 -> Char
hexDigit = Char -> Char
toUpper (Char -> Char) -> (Word8 -> Char) -> Word8 -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Char
intToDigit (Int -> Char) -> (Word8 -> Int) -> Word8 -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral