{- | Request shaping and URL building for the npm data plane. Three details of the wire protocol are load-bearing and handled here: * __Content negotiation.__ Metadata comes in two forms selected by @Accept@: the __abbreviated__ install view (@application/vnd.npm.install-v1+json@), which the proxy treats as primary, and the __full__ packument (@application/json@), needed when a rule reasons over publish age (the abbreviated form drops the @time@ map). 'MetadataForm' selects between them; both request @Accept-Encoding: gzip@, since popular packuments are megabytes. * __Scoped-name path encoding.__ A scoped name @\@scope/name@ is encoded on the wire as @\@scope%2Fname@: the scope separator is percent-encoded but the leading @\@@ is not. 'metadataRequest' builds this from an __already-parsed__ 'PackageName', never from raw client path segments. * __Streaming and buffering.__ 'artifactRequest' marks its request __non-decompressing__ ('decompress' returns 'False'): a tarball is opaque binary that must reach the client byte-for-byte, so the @.tgz@ is never gunzipped in flight (and its @dist.integrity@ stays valid). -} module Ecluse.Core.Registry.Npm.Request ( -- * Content negotiation MetadataForm (..), metadataAccept, -- * Conditional-GET validators Validators (..), noValidators, -- * Request building metadataRequest, artifactRequest, artifactRequestByFile, artifactRequestByUrl, artifactFileUrl, packageUrl, joinPath, -- * Shared internals encodePackagePath, withToken, addValidators, parseRequestEither, ) where import Data.Text qualified as T import Network.HTTP.Client (Request (decompress, redirectCount, requestHeaders), applyBearerAuth, parseRequest) import Network.HTTP.Types.Header (hAccept, hAcceptEncoding, hIfModifiedSince, hIfNoneMatch) import Ecluse.Core.Credential (Secret, unSecret) import Ecluse.Core.Package (PackageName, pkgNamespace, renderPackageName, unScope, unscopedName) import Ecluse.Core.Registry (UrlFormationError (EmptyBaseUrl, UnparseableUrl)) import Ecluse.Core.Server.Route (encodeComponent) import Ecluse.Core.Text (joinUrlPath) import Ecluse.Core.Version (Version, renderVersion) {- | Which of npm's two metadata documents to request, selected by the @Accept@ header (see 'metadataAccept'). -} data MetadataForm = {- | The install-optimised __abbreviated__ packument (@application/vnd.npm.install-v1+json@). Smaller and the proxy's primary view, but it drops the @time@ map. -} Abbreviated | {- | The __full__ packument (@application/json@). Larger, but the only form carrying the @time@ map a publish-age rule needs. -} Full deriving stock (MetadataForm -> MetadataForm -> Bool (MetadataForm -> MetadataForm -> Bool) -> (MetadataForm -> MetadataForm -> Bool) -> Eq MetadataForm forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a $c== :: MetadataForm -> MetadataForm -> Bool == :: MetadataForm -> MetadataForm -> Bool $c/= :: MetadataForm -> MetadataForm -> Bool /= :: MetadataForm -> MetadataForm -> Bool Eq, Int -> MetadataForm -> ShowS [MetadataForm] -> ShowS MetadataForm -> String (Int -> MetadataForm -> ShowS) -> (MetadataForm -> String) -> ([MetadataForm] -> ShowS) -> Show MetadataForm forall a. (Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a $cshowsPrec :: Int -> MetadataForm -> ShowS showsPrec :: Int -> MetadataForm -> ShowS $cshow :: MetadataForm -> String show :: MetadataForm -> String $cshowList :: [MetadataForm] -> ShowS showList :: [MetadataForm] -> ShowS Show) {- | The @Accept@ header value selecting a 'MetadataForm'. >>> metadataAccept Abbreviated "application/vnd.npm.install-v1+json" >>> metadataAccept Full "application/json" -} metadataAccept :: MetadataForm -> ByteString metadataAccept :: MetadataForm -> ByteString metadataAccept = \case MetadataForm Abbreviated -> ByteString "application/vnd.npm.install-v1+json" MetadataForm Full -> ByteString "application/json" {- | The conditional-GET validators to relay on a metadata fetch. Replaying an upstream's @ETag@ as @If-None-Match@ (or its @Last-Modified@ as @If-Modified-Since@) lets the upstream answer @304 Not Modified@ with no body: the cheap freshness check the proxy uses on a cache revalidation. Both are forwarded only when present. -} data Validators = Validators { Validators -> Maybe ByteString validatorIfNoneMatch :: Maybe ByteString -- ^ An entity tag to send as @If-None-Match@ (an upstream @ETag@). , Validators -> Maybe ByteString validatorIfModifiedSince :: Maybe ByteString {- ^ An RFC-1123 date to send as @If-Modified-Since@ (an upstream @Last-Modified@). -} } deriving stock (Validators -> Validators -> Bool (Validators -> Validators -> Bool) -> (Validators -> Validators -> Bool) -> Eq Validators forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a $c== :: Validators -> Validators -> Bool == :: Validators -> Validators -> Bool $c/= :: Validators -> Validators -> Bool /= :: Validators -> Validators -> Bool Eq, Int -> Validators -> ShowS [Validators] -> ShowS Validators -> String (Int -> Validators -> ShowS) -> (Validators -> String) -> ([Validators] -> ShowS) -> Show Validators forall a. (Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a $cshowsPrec :: Int -> Validators -> ShowS showsPrec :: Int -> Validators -> ShowS $cshow :: Validators -> String show :: Validators -> String $cshowList :: [Validators] -> ShowS showList :: [Validators] -> ShowS Show) -- | No conditional-GET validators: an unconditional fetch. noValidators :: Validators noValidators :: Validators noValidators = Validators{validatorIfNoneMatch :: Maybe ByteString validatorIfNoneMatch = Maybe ByteString forall a. Maybe a Nothing, validatorIfModifiedSince :: Maybe ByteString validatorIfModifiedSince = Maybe ByteString forall a. Maybe a Nothing} {- | Build the metadata @GET@ request for a package: the URL is @{baseUrl}/{encoded-name}@ with the @Accept@ header for the chosen 'MetadataForm', @Accept-Encoding: gzip@, an optional bearer token, and any relayed conditional-GET 'Validators'. The package path is derived from an __already-parsed__ 'PackageName', then the scope separator is percent-encoded (@\@scope/name@ -> @\@scope%2Fname@). Fails with a 'UrlFormationError' only when the URL cannot be formed (an empty base URL). -} metadataRequest :: Text -> Maybe Secret -> MetadataForm -> Validators -> PackageName -> Either UrlFormationError Request metadataRequest :: Text -> Maybe Secret -> MetadataForm -> Validators -> PackageName -> Either UrlFormationError Request metadataRequest Text baseUrl Maybe Secret token MetadataForm form Validators validators PackageName name = do url <- Text -> PackageName -> Either UrlFormationError Text packageUrl Text baseUrl PackageName name base <- parseRequestEither url pure . withToken token . addValidators validators $ base { requestHeaders = (hAccept, metadataAccept form) : (hAcceptEncoding, "gzip") : requestHeaders base } {- | Build the artifact @GET@ request for one version's tarball. The request is marked __non-decompressing__ ('decompress' returns 'False') so the @.tgz@ bytes are streamed through verbatim: a tarball is opaque binary and must reach the client byte-for-byte for its @dist.integrity@ to verify. The artifact URL is the registry-served tarball location, derived like 'metadataRequest' but addressing the version's artifact path. Exposed so the web layer can bracket it for bounded-memory streaming (see the module header). Fails with a 'UrlFormationError' only when the URL cannot be formed. -} artifactRequest :: Text -> Maybe Secret -> PackageName -> Version -> Either UrlFormationError Request artifactRequest :: Text -> Maybe Secret -> PackageName -> Version -> Either UrlFormationError Request artifactRequest Text baseUrl Maybe Secret token PackageName name Version version = do url <- Text -> PackageName -> Version -> Either UrlFormationError Text artifactUrl Text baseUrl PackageName name Version version base <- parseRequestEither url pure . withToken token $ base { -- A tarball must never be gunzipped in flight: it is opaque binary -- whose integrity the client verifies, so stream the raw bytes. We -- deliberately advertise no @Accept-Encoding@ here: a @.tgz@ is -- already-compressed application data, and requesting a transport -- encoding we then refuse to decode ('decompress' is 'False') would -- risk a doubly-gzipped body that fails its @dist.integrity@. decompress = const False } {- | Build the artifact @GET@ request addressing a tarball by its __preserved on-the-wire filename__, at @{baseUrl}/{encoded-pkg}/-/{filename}@. The serve path fetches an artifact by the exact filename the client requested: the authoritative name for the bytes: rather than reconstructing it from @(package, version)@ as 'artifactRequest' does, so a registry whose tarball naming differs from the proxy's own convention still resolves. The @filename@ is taken verbatim (the classifier has already passed it through the component-safety gate), and the package segment is the same scope-percent-encoded path 'artifactRequest' uses. The request is marked __non-decompressing__ for the same reason: a @.tgz@ is opaque binary streamed byte-for-byte so its @dist.integrity@ verifies. Exposed so the web layer can bracket it for bounded-memory streaming. Fails with a 'UrlFormationError' only when the URL cannot be formed. -} artifactRequestByFile :: Text -> Maybe Secret -> PackageName -> Text -> Either UrlFormationError Request artifactRequestByFile :: Text -> Maybe Secret -> PackageName -> Text -> Either UrlFormationError Request artifactRequestByFile Text baseUrl Maybe Secret token PackageName name Text filename = do url <- Text -> PackageName -> Text -> Either UrlFormationError Text artifactFileUrl Text baseUrl PackageName name Text filename base <- parseRequestEither url pure . withToken token $ base { -- A tarball must never be gunzipped in flight (see 'artifactRequest'). decompress = const False } {- | Build the artifact @GET@ request addressing a tarball at its __authoritative upstream location__: the absolute @url@ the projection preserved from the upstream's @dist.tarball@: rather than reconstructing it from @(base, package, file)@. The artifact location is server-chosen data, not a derivable fact: a registry may serve a version's tarball from a different host or a path the npm @/-/@ convention cannot rebuild. Honouring the preserved location is what lets Écluse front those registries; the URL it fetches is the same one the served packument's @dist.integrity@ is paired with, so the bytes still verify. The request is marked __non-decompressing__ for the same reason as 'artifactRequest': a @.tgz@ is opaque binary streamed byte-for-byte. Fails with a 'UrlFormationError' only when the @url@ cannot be parsed into a request. -} artifactRequestByUrl :: Text -> Maybe Secret -> Text -> Either UrlFormationError Request artifactRequestByUrl :: Text -> Maybe Secret -> Text -> Either UrlFormationError Request artifactRequestByUrl Text _baseUrl Maybe Secret token Text url = do base <- Text -> Either UrlFormationError Request parseRequestEither Text url pure . withToken token $ base { -- A tarball must never be gunzipped in flight (see 'artifactRequest'). decompress = const False } {- The metadata/publish URL for a package: @{baseUrl}/{encoded-name}@, with the scoped-name separator percent-encoded (@\@scope/name@ -> @\@scope%2Fname@). -} packageUrl :: Text -> PackageName -> Either UrlFormationError Text packageUrl :: Text -> PackageName -> Either UrlFormationError Text packageUrl Text baseUrl PackageName name = Text -> Text -> Either UrlFormationError Text joinPath Text baseUrl (PackageName -> Text encodePackagePath PackageName name) {- The artifact (tarball) URL for one version: @{baseUrl}/{encoded-name}/-/{tarball-file}@. npm serves a version's tarball under the package's @/-/@ path; the filename is @{base}-{version}.tgz@ (scope dropped from the file segment, as npm names it). -} artifactUrl :: Text -> PackageName -> Version -> Either UrlFormationError Text artifactUrl :: Text -> PackageName -> Version -> Either UrlFormationError Text artifactUrl Text baseUrl PackageName name Version version = Text -> Text -> Either UrlFormationError Text joinPath Text baseUrl (PackageName -> Text encodePackagePath PackageName name Text -> Text -> Text forall a. Semigroup a => a -> a -> a <> Text "/-/" Text -> Text -> Text forall a. Semigroup a => a -> a -> a <> PackageName -> Version -> Text tarballFile PackageName name Version version) {- | The artifact (tarball) URL addressing a __preserved filename__: @{baseUrl}/{encoded-name}/-/{encoded-filename}@. The filename is the exact on-the-wire name (not @{base}-{version}.tgz@ rebuilt from the coordinate), so the bytes are fetched by the name the client requested; it is percent-encoded as a single component ('Ecluse.Core.Server.Route.encodeComponent') so a once-decoded escape in it cannot reach the upstream raw. Exposed so the serve path can record the public artifact location on a mirror job (the same URL its public fetch targets). Fails with a 'UrlFormationError' only when the URL cannot be formed. -} artifactFileUrl :: Text -> PackageName -> Text -> Either UrlFormationError Text artifactFileUrl :: Text -> PackageName -> Text -> Either UrlFormationError Text artifactFileUrl Text baseUrl PackageName name Text filename = Text -> Text -> Either UrlFormationError Text joinPath Text baseUrl (PackageName -> Text encodePackagePath PackageName name Text -> Text -> Text forall a. Semigroup a => a -> a -> a <> Text "/-/" Text -> Text -> Text forall a. Semigroup a => a -> a -> a <> Text -> Text encodeComponent Text filename) {- Join a base URL and an already-encoded path, tolerating one trailing slash on the base so the join never doubles it. An empty base URL is refused with a 'UrlFormationError': the read- and write-path builders share this report, so an unformable URL is never mislabelled as a publish failure. -} joinPath :: Text -> Text -> Either UrlFormationError Text joinPath :: Text -> Text -> Either UrlFormationError Text joinPath Text baseUrl Text path | Text -> Bool T.null Text baseUrl = UrlFormationError -> Either UrlFormationError Text forall a b. a -> Either a b Left UrlFormationError EmptyBaseUrl | Bool otherwise = Text -> Either UrlFormationError Text forall a b. b -> Either a b Right (Text -> Text -> Text joinUrlPath Text baseUrl Text path) {- Encode a package name as its on-the-wire path segment. Each name component (scope, base name) is percent-encoded ('Ecluse.Core.Server.Route.encodeComponent') around the structural delimiters this builder writes: a scoped @\@scope/name@ becomes @\@{enc-scope}%2F{enc-base}@: the leading @\@@ and the @%2F@ separator are written here, never derived from a component, so a legitimate scoped name yields exactly one @%2F@: and an unscoped name is its single encoded component. Encoding each component is the defence in depth that keeps a @'%'@, @'/'@, or other reserved byte inside a decoded name from reaching the upstream URL raw (a once-decoded @%2e%2e%2f@ is re-encoded to @%252e%252e%252f@), without double-encoding the structural separator. -} encodePackagePath :: PackageName -> Text encodePackagePath :: PackageName -> Text encodePackagePath PackageName name = case PackageName -> Maybe Scope pkgNamespace PackageName name of Just Scope scope -> Text "@" Text -> Text -> Text forall a. Semigroup a => a -> a -> a <> Text -> Text encodeComponent (Scope -> Text unScope Scope scope) Text -> Text -> Text forall a. Semigroup a => a -> a -> a <> Text "%2F" Text -> Text -> Text forall a. Semigroup a => a -> a -> a <> Text -> Text encodeComponent (PackageName -> Text unscopedName PackageName name) Maybe Scope Nothing -> Text -> Text encodeComponent (PackageName -> Text renderPackageName PackageName name) {- The conventional npm tarball filename for a version: @{base}-{version}.tgz@. The base name and version are percent-encoded as components around the structural @'-'@ and @.tgz@ this builder writes, so a reserved byte in either cannot reach the upstream URL raw. -} tarballFile :: PackageName -> Version -> Text tarballFile :: PackageName -> Version -> Text tarballFile PackageName name Version version = Text -> Text encodeComponent (PackageName -> Text unscopedName PackageName name) Text -> Text -> Text forall a. Semigroup a => a -> a -> a <> Text "-" Text -> Text -> Text forall a. Semigroup a => a -> a -> a <> Text -> Text encodeComponent (Version -> Text renderVersion Version version) Text -> Text -> Text forall a. Semigroup a => a -> a -> a <> Text ".tgz" {- Finalize an npm data-plane request: __disable redirect following__ ('redirectCount' = 0) on __every__ request, and attach a bearer token when one is injected. This is the single request-finalization point for the whole npm data plane: every builder and call site funnels through it (it is also the only 'applyBearerAuth'): so pinning @redirectCount = 0@ here makes one invariant universal: __Écluse never follows an upstream redirect__, on the credentialed and the anonymous plane alike. Two dangers it forecloses, one per plane: \* __Credential leakage__ (credentialed plane). http-client's default ('redirectCount' = 10) re-sends the @Authorization@ header to the redirect's @Location@, and its @shouldStripHeaderOnRedirect@ does not strip it cross-host: so a hostile or misconfigured upstream could @302@ a forwarded/minted credential to an attacker-chosen host. That is especially dangerous on the __trusted private manager__, where a redirect could exfiltrate the credential to an attacker-chosen target; pinning @redirectCount = 0@ removes the hop entirely rather than relying on the per-hop egress controls. \* __SSRF via redirect__ (anonymous plane). The host allowlist is enforced when the URL is built, not per redirect hop, so following a @302@ would let an allowlisted upstream steer an anonymous fetch to __any__ host: an internal/cloud-metadata address or any off-allowlist host: re-gated by nothing. Not following the redirect removes the hop there is to gate. The accepted consequence, symmetric across both planes: a read no longer follows an upstream's CDN @302@: it returns the @3xx@ to the serve path rather than chasing it. That is the safer posture, and the proxy already honours the __packument's__ @dist.tarball@ location explicitly, gated by the egress policy, rather than relying on redirects. Redirect-following for a nonstandard upstream (a presigned/redirecting object store) is an explicit, per-upstream opt-in, never the default. -} withToken :: Maybe Secret -> Request -> Request withToken :: Maybe Secret -> Request -> Request withToken Maybe Secret Nothing Request request = Request request{redirectCount = 0} withToken (Just Secret secret) Request request = ByteString -> Request -> Request applyBearerAuth (Text -> ByteString forall a b. ConvertUtf8 a b => a -> b encodeUtf8 (Secret -> Text unSecret Secret secret)) Request request{redirectCount = 0} -- Add the present conditional-GET validators as request headers. addValidators :: Validators -> Request -> Request addValidators :: Validators -> Request -> Request addValidators Validators validators Request request = Request request{requestHeaders = newHeaders <> requestHeaders request} where newHeaders :: RequestHeaders newHeaders = [Maybe Header] -> RequestHeaders forall a. [Maybe a] -> [a] catMaybes [ (,) HeaderName hIfNoneMatch (ByteString -> Header) -> Maybe ByteString -> Maybe Header forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b <$> Validators -> Maybe ByteString validatorIfNoneMatch Validators validators , (,) HeaderName hIfModifiedSince (ByteString -> Header) -> Maybe ByteString -> Maybe Header forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b <$> Validators -> Maybe ByteString validatorIfModifiedSince Validators validators ] {- Parse a built URL into a 'Request', mapping a parse failure into a 'UrlFormationError'. The URL is derived from configuration and an already-safe name, so a failure here is a configuration fault, reported uniformly with the other URL-formation errors. -} parseRequestEither :: Text -> Either UrlFormationError Request parseRequestEither :: Text -> Either UrlFormationError Request parseRequestEither Text url = case String -> Maybe Request forall (m :: * -> *). MonadThrow m => String -> m Request parseRequest (Text -> String forall a. ToString a => a -> String toString Text url) of Just Request request -> Request -> Either UrlFormationError Request forall a b. b -> Either a b Right Request request Maybe Request Nothing -> UrlFormationError -> Either UrlFormationError Request forall a b. a -> Either a b Left (Text -> UrlFormationError UnparseableUrl Text url)