{- | Outbound-request guards for the proxy's data plane: defending how an upstream URL is derived.

Écluse builds outbound HTTP requests from two untrusted sources -- __client-supplied
package identifiers__ (the request path) and __upstream-supplied artifact
locations__ (a packument's @dist.tarball@). This module provides the pure guard layer
that enforces safe URL construction.

__How an upstream URL is derived:__ 'upstreamUrlFor' builds an artifact\/metadata
URL from a configured base URL and an __already-parsed__ 'PackageName', never
from raw client path segments, re-checking each name component with the router's
own safety rule so traversal, encoded slashes, or an absolute URL cannot change
the target.
-}
module Ecluse.Core.Security.Url (
    -- * Identifier → URL safety
    upstreamUrlFor,
    UrlError (..),
) where

import Data.Text qualified as T

import Ecluse.Core.Package (PackageName, renderPackageName)
import Ecluse.Core.Server.Route (encodeComponent, isSafeComponent)
import Ecluse.Core.Text (joinUrlPath)

-- | Why building an upstream URL from an identifier was refused.
data UrlError
    = {- | A name component (scope or base name) is unsafe to interpolate -- see
      'Ecluse.Core.Server.Route.isSafeComponent'. Carries the offending component.
      -}
      UnsafeComponent Text
    | -- | The configured base URL is empty, so no URL can be formed.
      EmptyBaseUrl
    deriving stock (UrlError -> UrlError -> Bool
(UrlError -> UrlError -> Bool)
-> (UrlError -> UrlError -> Bool) -> Eq UrlError
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: UrlError -> UrlError -> Bool
== :: UrlError -> UrlError -> Bool
$c/= :: UrlError -> UrlError -> Bool
/= :: UrlError -> UrlError -> Bool
Eq, Int -> UrlError -> ShowS
[UrlError] -> ShowS
UrlError -> String
(Int -> UrlError -> ShowS)
-> (UrlError -> String) -> ([UrlError] -> ShowS) -> Show UrlError
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> UrlError -> ShowS
showsPrec :: Int -> UrlError -> ShowS
$cshow :: UrlError -> String
show :: UrlError -> String
$cshowList :: [UrlError] -> ShowS
showList :: [UrlError] -> ShowS
Show)

{- | Build an upstream URL for a package from a configured base URL and an
__already-parsed__ 'PackageName'.

This is the only sanctioned way to derive an upstream URL for a package: the
target is @{baseUrl}\/{path}@, where @path@ is built from the package's structural
components and @baseUrl@ is __configuration__, never a client-supplied path. The
client never chooses the host or the path prefix -- only which (validated) package
-- so @..\/@ traversal, an encoded slash, an absolute URL, or a CRLF in the original
request cannot steer the fetch elsewhere (see the module header).

The path is built with two complementary defences. First, although a 'PackageName'
is normally produced by the router's already-safe parse, its smart constructor does
no validation, so this __re-checks every structural component__ (scope and base
name) with the router's own 'Ecluse.Core.Server.Route.isSafeComponent' -- a name carrying
a @\'\/\'@, @\'\\\\\'@, control character, or a @"."@\/@".."@ component is refused
with 'UnsafeComponent' rather than interpolated. Second, each accepted component is
then __percent-encoded__ ('Ecluse.Core.Server.Route.encodeComponent') around the
structural @\'\@\'@ sigil and @%2F@ scope separator this builder writes -- so a
@\'%\'@, @\'?\'@, @\'#\'@, or other reserved byte the denylist accepts (notably a
once-decoded @%2e%2e%2f@) cannot reach the upstream URL raw. A scoped
@\@scope\/name@ therefore yields exactly one @%2F@ (the separator written here, not
an encoding of a component), with no double-encoding. An empty @baseUrl@ is refused
with 'EmptyBaseUrl'. A single trailing slash on @baseUrl@ is tolerated so the join
never doubles it.
-}
upstreamUrlFor :: Text -> PackageName -> Either UrlError Text
upstreamUrlFor :: Text -> PackageName -> Either UrlError Text
upstreamUrlFor Text
baseUrl PackageName
name
    | Text -> Bool
T.null Text
baseUrl = UrlError -> Either UrlError Text
forall a b. a -> Either a b
Left UrlError
EmptyBaseUrl
    | Bool
otherwise = case [Text] -> Maybe Text
firstUnsafe (NameParts -> [Text]
componentsOf NameParts
parts) of
        Just Text
bad -> UrlError -> Either UrlError Text
forall a b. a -> Either a b
Left (Text -> UrlError
UnsafeComponent Text
bad)
        Maybe Text
Nothing -> Text -> Either UrlError Text
forall a b. b -> Either a b
Right (Text -> Text -> Text
joinUrlPath Text
baseUrl (NameParts -> Text
encodePath NameParts
parts))
  where
    parts :: NameParts
parts = PackageName -> NameParts
nameParts PackageName
name

    firstUnsafe :: [Text] -> Maybe Text
    firstUnsafe :: [Text] -> Maybe Text
firstUnsafe = (Text -> Bool) -> [Text] -> Maybe Text
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
find (Bool -> Bool
not (Bool -> Bool) -> (Text -> Bool) -> Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Bool
isSafeComponent)

{- The structural decomposition of a package name for URL building: a scope with a
base name, or a single component, recovered by splitting the rendered name on the
@\'\/\'@ scope separator so a legitimate scoped name's own separator is not judged
as unsafe content. One source of truth for both the safety re-check ('componentsOf')
and the encoded path ('encodePath'), so the two cannot disagree about where the
component boundaries are. A leading @\'\@\'@ with no @\'\/\'@ (or an empty base) is
a single component (the @\@foo@ fallback).
-}
data NameParts
    = -- A scoped name: scope and base, each a component to check and encode.
      Scoped Text Text
    | -- An unscoped (or @\@@-leading, separator-free) name: one component.
      Single Text

nameParts :: PackageName -> NameParts
nameParts :: PackageName -> NameParts
nameParts PackageName
name =
    let rendered :: Text
rendered = PackageName -> Text
renderPackageName PackageName
name
     in case Text -> Text -> Maybe Text
T.stripPrefix Text
"@" Text
rendered of
            Just Text
scopeAndBase ->
                let (Text
scope, Text
base) = HasCallStack => Text -> Text -> (Text, Text)
Text -> Text -> (Text, Text)
T.breakOn Text
"/" Text
scopeAndBase
                 in if Text -> Bool
T.null Text
base
                        then Text -> NameParts
Single Text
rendered
                        else Text -> Text -> NameParts
Scoped Text
scope (Int -> Text -> Text
T.drop Int
1 Text
base)
            Maybe Text
Nothing -> Text -> NameParts
Single Text
rendered

-- The components each of which must independently pass 'isSafeComponent'.
componentsOf :: NameParts -> [Text]
componentsOf :: NameParts -> [Text]
componentsOf = \case
    Scoped Text
scope Text
base -> [Text
scope, Text
base]
    Single Text
c -> [Text
c]

{- The encoded on-the-wire path for the components: the @\'\@\'@ sigil and the
@%2F@ scope separator are written here, around each percent-encoded component, so a
legitimate scoped name carries exactly one @%2F@ and no component byte can alter the
URL's shape. -}
encodePath :: NameParts -> Text
encodePath :: NameParts -> Text
encodePath = \case
    Scoped Text
scope Text
base -> Text
"@" Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text -> Text
encodeComponent Text
scope Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"%2F" Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text -> Text
encodeComponent Text
base
    Single Text
c -> Text -> Text
encodeComponent Text
c