-- TupleSections: local convenience for pairing a parsed name with its trailing -- segments in 'takeScoped' ((,rest) / (,more)); see STYLE.md §2. {-# LANGUAGE TupleSections #-} {- | The npm path grammar: the request router that maps an npm-native request path to a shared "Ecluse.Core.Server.Route". 'classify' turns an npm request -- its HTTP method and the already-mount-stripped, percent-decoded path segments -- into a 'Route', so the whole npm routing table is unit-testable with __no server__: feed it a method and segments, assert the 'Route'. The agnostic dispatcher carries a route classifier per mount; this module is npm's, wired in at the composition root. A @PUT \/{pkg}@ is the npm __publish__ request, so the method is part of the match: a @PUT@ over a bare-package path is a 'Publish', while every read method (@GET@, @HEAD@, …) over the same path is a 'Packument'. The read grammar below is otherwise method-independent -- a @HEAD@ classifies like its @GET@, the dispatcher answering it bodiless. The model is __deny by default__: anything not explicitly recognised is 'Unsupported' (a @404@ at the edge). Three npm-specific facts shape the matching, all from the protocol research (see @docs\/research\/reverse-engineering\/npm.md@ §2 and §7): * __Reserved meta-routes (@\/-\/…@) are matched first.__ A real package name can never begin with @\'-\'@, so a leading @"-"@ segment is unambiguously a meta-route; an /unknown/ one is 'Unsupported' rather than a package. * __Scoped names arrive in two encodings.__ The path is percent-decoded before it reaches us, so a scoped name arrives either as one decoded segment (@\@scope\/pkg@) or as two (@\@scope@, @pkg@). Both are normalised to the same 'PackageName' here, so nothing downstream re-checks the encoding. * __A tarball path is @\/{pkg}\/-\/{file}.tgz@.__ The interior @"-"@ segment and the @.tgz@ suffix distinguish it from a packument request (@\/{pkg}@); for a scoped package the basename drops the scope (@\@babel\/code-frame@ → @code-frame-7.0.0.tgz@). 'classify' is the npm-side parse of the artifact coordinate: it checks the @file@'s basename is exactly @{unscoped-name}-{rest}@ for the requested package and reads @rest@ as the version (@mkVersion@, total), yielding @'Tarball' name version ('Filename' file)@ with the file __preserved verbatim__. A basename that does not match the package is a path-confusion attempt and denies (deny by default), never a fabricated coordinate. Mount dispatch / prefix-stripping and the liveness\/readiness routes are handled in the agnostic web layer (see @docs\/architecture\/web-layer.md@); 'classify' only ever sees the npm-native request, so it models exactly the 'Route's the proxy serves. -} module Ecluse.Core.Registry.Npm.Route ( -- * Classification classify, ) where import Data.Text qualified as T import Network.HTTP.Types.Method (methodPut) import Ecluse.Core.Ecosystem (Ecosystem (Npm)) import Ecluse.Core.Package (PackageName, mkPackageName, mkScope, unscopedName) import Ecluse.Core.Server.Route (Classifier, Filename (Filename), Route (..), isSafeComponent) import Ecluse.Core.Version (mkVersion) {- | Classify an npm-native request (its method and path) into a shared 'Route'. A @PUT@ is the publish method, so it is dispatched first: a @PUT@ over a bare-package path is a 'Publish', everything else under @PUT@ denies. Every other method reads, taking the path through the read grammar where matching order is significant -- reserved meta-routes (a leading @"-"@ segment) are tried first, since a real package name can never begin with @\'-\'@; only then is the path read as a package request. See the module header for the npm conventions this encodes. -} classify :: Classifier classify :: Classifier classify Method method [Text] segments | Method method Method -> Method -> Bool forall a. Eq a => a -> a -> Bool == Method methodPut = [Text] -> Route classifyPublish [Text] segments | Bool otherwise = [Text] -> Route classifyRead [Text] segments {- Classify a read request's path (any non-@PUT@ method): reserved meta-routes first, then a package request. A @HEAD@ takes this same path as its @GET@ -- the dispatcher answers it bodiless -- so the read grammar is method-independent. -} classifyRead :: [Text] -> Route classifyRead :: [Text] -> Route classifyRead (Text "-" : [Text] meta) = [Text] -> Route classifyMeta [Text] meta classifyRead [Text] segments = [Text] -> Route classifyPackage [Text] segments {- Classify a @PUT@ as an npm publish. npm publishes a package with @PUT \/{pkg}@, the version manifest and tarball carried in the body, so a publish is exactly a __bare-package__ path (no trailing segments) -- both scoped encodings handled by 'takePackage'. A @PUT@ to anything else (a tarball slot, a meta-route, trailing junk) is 'Unsupported' (deny by default); the version is /not/ read from the path here -- it lives in the relayed document. -} classifyPublish :: [Text] -> Route classifyPublish :: [Text] -> Route classifyPublish [Text] segments = case [Text] -> Maybe (PackageName, [Text]) takePackage [Text] segments of Just (PackageName name, []) -> PackageName -> Route Publish PackageName name Maybe (PackageName, [Text]) _ -> Route Unsupported {- Classify a reserved meta-route -- the segments __after__ the leading @"-"@. Only the routes the proxy actually serves are recognised; every other meta-route is 'Unsupported' (never re-interpreted as a package). -} classifyMeta :: [Text] -> Route classifyMeta :: [Text] -> Route classifyMeta = \case [Text "ping"] -> Route Ping [Text "v1", Text "search"] -> Route Search [Text] _ -> Route Unsupported {- Classify a non-meta path as a package request. Splits off the leading package unit (handling both scoped encodings) and dispatches on what trails it: a bare package is a 'Packument', @\/-\/{file}.tgz@ a 'Tarball' when its basename parses for the package, anything else 'Unsupported'. -} classifyPackage :: [Text] -> Route classifyPackage :: [Text] -> Route classifyPackage [Text] segments = case [Text] -> Maybe (PackageName, [Text]) takePackage [Text] segments of Maybe (PackageName, [Text]) Nothing -> Route Unsupported Just (PackageName name, [Text] rest) -> PackageName -> [Text] -> Route dispatch PackageName name [Text] rest where dispatch :: PackageName -> [Text] -> Route dispatch PackageName name = \case [] -> PackageName -> Route Packument PackageName name [Text "-", Text file] | Text -> Bool isSafeComponent Text file -> PackageName -> Text -> Route tarballRoute PackageName name Text file [Text] _ -> Route Unsupported {- Peel the leading package unit off a path, returning its 'PackageName' and the remaining segments. A leading segment beginning with @\'\@\'@ is a scoped name, peeled by 'takeScoped' (which handles both wire encodings). Returns 'Nothing' (so the caller denies it) for anything without a usable package: an empty path, or a name with an __unsafe component__ -- a scope or base name that 'isSafeComponent' rejects (empty, @"."@\/@".."@, or carrying a @\'\/\'@, @\'\\\\\'@, or control character). This covers the degenerate scoped names (@\@\/pkg@, @\@scope\/@ reachable from @\/\@scope%2F@, @\@scope\/a\/b@) and the hostile unscoped names (@\/foo%2Fbar@ → @"foo\/bar"@, @".."@, @"."@) alike. 'mkScope'\/'mkPackageName' do no validation, so this boundary is where such names are rejected rather than passed downstream into an interpolated upstream URL. -} takePackage :: [Text] -> Maybe (PackageName, [Text]) takePackage :: [Text] -> Maybe (PackageName, [Text]) takePackage [] = Maybe (PackageName, [Text]) forall a. Maybe a Nothing takePackage (Text seg : [Text] rest) | Text "@" <- Int -> Text -> Text T.take Int 1 Text seg = Text -> [Text] -> Maybe (PackageName, [Text]) takeScoped Text seg [Text] rest | Text -> Bool isSafeComponent Text seg = (PackageName, [Text]) -> Maybe (PackageName, [Text]) forall a. a -> Maybe a Just (Ecosystem -> Maybe Scope -> Text -> PackageName mkPackageName Ecosystem Npm Maybe Scope forall a. Maybe a Nothing Text seg, [Text] rest) | Bool otherwise = Maybe (PackageName, [Text]) forall a. Maybe a Nothing {- Peel a scoped package unit -- the leading @\@…@ segment -- handling both wire encodings of a scoped name: \* one decoded segment, @\@scope\/pkg@ -- split on the first @\'\/\'@; \* two segments, @\@scope@ then @pkg@ -- consume both. -} takeScoped :: Text -> [Text] -> Maybe (PackageName, [Text]) takeScoped :: Text -> [Text] -> Maybe (PackageName, [Text]) takeScoped Text seg [Text] rest = case HasCallStack => Text -> Text -> (Text, Text) Text -> Text -> (Text, Text) T.breakOn Text "/" (Int -> Text -> Text T.drop Int 1 Text seg) of -- One decoded segment "@scope/pkg": scope before the '/', base after. -- 'scopedName' may reject it, propagating 'Nothing' through (,rest). (Text scope, Text base) | Bool -> Bool not (Text -> Bool T.null Text base) -> (,[Text] rest) (PackageName -> (PackageName, [Text])) -> Maybe PackageName -> Maybe (PackageName, [Text]) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b <$> Text -> Text -> Maybe PackageName scopedName Text scope (Int -> Text -> Text T.drop Int 1 Text base) -- Bare scope "@scope": the package name is the next segment. (Text, Text) _ -> case [Text] rest of (Text base : [Text] more) -> (,[Text] more) (PackageName -> (PackageName, [Text])) -> Maybe PackageName -> Maybe (PackageName, [Text]) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b <$> Text -> Text -> Maybe PackageName scopedName (Int -> Text -> Text T.drop Int 1 Text seg) Text base [Text] _ -> Maybe (PackageName, [Text]) forall a. Maybe a Nothing -- A scoped name is usable only when both halves are safe components. The -- leading '@' is already stripped from both arguments, so a degenerate or -- hostile name ('@/pkg', '@scope/', '@scope/a/b', '@../pkg') is rejected here -- rather than passed to the no-op 'mkScope'/'mkPackageName'. scopedName :: Text -> Text -> Maybe PackageName scopedName :: Text -> Text -> Maybe PackageName scopedName Text scope Text base | Text -> Bool isSafeComponent Text scope Bool -> Bool -> Bool && Text -> Bool isSafeComponent Text base = PackageName -> Maybe PackageName forall a. a -> Maybe a Just (Ecosystem -> Maybe Scope -> Text -> PackageName mkPackageName Ecosystem Npm (Scope -> Maybe Scope forall a. a -> Maybe a Just (Text -> Scope mkScope Text scope)) Text base) | Bool otherwise = Maybe PackageName forall a. Maybe a Nothing {- Parse an npm tarball-slot @file@ into a 'Tarball' coordinate for @name@, or deny it. The npm convention is @{unscoped-name}-{version}.tgz@, so the file must: \* end in @.tgz@ over a non-empty name (a bare @.tgz@ is not an artifact), and \* have a basename of exactly @{unscoped-name}-{version}@ -- the unscoped name (the scope dropped, as npm names the file), a @\'-\'@, then a non-empty @version@. A basename that does not begin with @{unscoped-name}-@ is addressing some other package's artifact under this package's path -- a path-confusion attempt -- so it denies rather than fabricating a coordinate. On a match the @version@ run is read by the total 'mkVersion' (an unparseable version still yields a coordinate, so a parser gap never drops a real artifact), and the @file@ is preserved verbatim in the 'Filename'. The caller has already passed @file@ through 'isSafeComponent'. -} tarballRoute :: PackageName -> Text -> Route tarballRoute :: PackageName -> Text -> Route tarballRoute PackageName name Text file = case Text -> Text -> Maybe Text T.stripSuffix Text ".tgz" Text file Maybe Text -> (Text -> Maybe Text) -> Maybe Text forall a b. Maybe a -> (a -> Maybe b) -> Maybe b forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b >>= Text -> Text -> Maybe Text T.stripPrefix (PackageName -> Text unscopedName PackageName name Text -> Text -> Text forall a. Semigroup a => a -> a -> a <> Text "-") of Just Text version | Bool -> Bool not (Text -> Bool T.null Text version) -> PackageName -> Version -> Filename -> Route Tarball PackageName name (Ecosystem -> Text -> Version mkVersion Ecosystem Npm Text version) (Text -> Filename Filename Text file) Maybe Text _ -> Route Unsupported