{- | Telemetry configuration resolution and export-failure routing -- the boot-time
substrate that sits between the operator's environment and the OpenTelemetry SDK.

Écluse's maintainer runs Datadog, but the project is vendor-neutral, so an operator
may describe the same telemetry identity in either dialect: a Datadog shop sets the
@DD_*@ variables, a plain OpenTelemetry shop sets the @OTEL_*@ ones. This module is
the __self-aligning resolver__ that collapses both into one answer, so logs and
traces share a single identity whichever dialect was provided.

== The resolver

'resolveTelemetry' is a bounded precedence table over exactly four fields --
@service.name@, @deployment.environment@, @service.version@, and the OTLP export
endpoint -- each resolved __Datadog-value-wins → vanilla OpenTelemetry → default__.
It is deliberately /not/ a general per-variable merge: only these four cross between
the dialects, and only their fixed precedence is encoded. The @DD_API_KEY@ \/
@DD_SITE@ agentless-SaaS credentials are __never read__ -- Écluse exports to an
__operator-declared__, node-local collector\/Agent, never directly to a vendor's
cloud, so there is no path by which a key in the environment turns into off-cluster
egress. The endpoint itself is a declared destination (like the mirror queue), not an
attack surface, so it is normalised and used as given, not classified or gated.

The resolved 'ResolvedTelemetry' is the __single source of truth__ for both halves
of the telemetry stack: 'otelEnvironmentOverrides' projects it back to the canonical
@OTEL_*@ variables the env-driven SDK reads (so a @DD_*@-only deployment still
configures the exporter), and the same record feeds the @dd@ log object that stitches
a log line to its trace.

== Export-failure routing

Telemetry failures must stay off the request path and out of raw stderr. The SDK's
batch exporter runs asynchronously, so an unreachable collector never touches a served
request. This module owns the __shared throttle__ those failures coalesce through: an
'ExportFailureSink' carries one throttle plus a @katip@ target, and 'routeExportFailure'
surfaces the first failure plainly, then a periodic heartbeat carrying the suppressed
count, so a persistently unreachable endpoint is one visible warning and a heartbeat,
not a per-flush flood. The exporter wrappers ("Ecluse.Telemetry") feed the sink through
'observeExportResult'; 'installExportErrorHandler' routes the SDK's own diagnostic
stream through the same sink.

The configuration model and the export-failure mechanism are described in
@docs\/architecture\/observability.md@.
-}
module Ecluse.Telemetry.Resolve (
    -- * The resolved telemetry identity
    ResolvedTelemetry (..),
    TelemetryEndpoint (..),
    EndpointSource (..),
    resolveTelemetry,

    -- * Canonical @OTEL_*@ projection
    otelEnvironmentOverrides,

    -- * Export-failure throttle (pure core)
    ThrottleState (..),
    ThrottleEmit (..),
    initialThrottle,
    throttleInterval,
    throttleStep,

    -- * Export-failure routing
    ExportFailureSink,
    newExportFailureSink,
    exportFailureSink,
    routeExportFailure,
    observeExportResult,
    installExportErrorHandler,

    -- * Boot wiring
    prepareTelemetry,
) where

import Data.List (lookup)
import Data.Map.Strict qualified as Map
import Data.Text qualified as T
import Data.Time (NominalDiffTime, UTCTime, diffUTCTime, getCurrentTime)
import System.Environment (setEnv)

import Katip (LogEnv, Severity (WarningS), logFM, ls)
import Katip.Monadic (runKatipContextT)
import OpenTelemetry.Exporter.Span (ExportResult (..))
import OpenTelemetry.Internal.Logging (setGlobalErrorHandler)

import Ecluse.Core.Text (nonBlank)
import Ecluse.Log (moduleField)

{- | Where a resolved OTLP endpoint came from, so the boot path can distinguish a
deliberately-configured target from the silent default and warn on the latter.
-}
data EndpointSource
    = -- | Derived from @DD_AGENT_HOST@ (as @http:\/\/{host}:4318@).
      FromDdAgentHost
    | -- | Taken verbatim from @OTEL_EXPORTER_OTLP_ENDPOINT@.
      FromOtelEndpoint
    | -- | No endpoint was configured; the @http:\/\/localhost:4318@ default applies.
      DefaultedEndpoint
    deriving stock (EndpointSource -> EndpointSource -> Bool
(EndpointSource -> EndpointSource -> Bool)
-> (EndpointSource -> EndpointSource -> Bool) -> Eq EndpointSource
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: EndpointSource -> EndpointSource -> Bool
== :: EndpointSource -> EndpointSource -> Bool
$c/= :: EndpointSource -> EndpointSource -> Bool
/= :: EndpointSource -> EndpointSource -> Bool
Eq, Int -> EndpointSource -> ShowS
[EndpointSource] -> ShowS
EndpointSource -> String
(Int -> EndpointSource -> ShowS)
-> (EndpointSource -> String)
-> ([EndpointSource] -> ShowS)
-> Show EndpointSource
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> EndpointSource -> ShowS
showsPrec :: Int -> EndpointSource -> ShowS
$cshow :: EndpointSource -> String
show :: EndpointSource -> String
$cshowList :: [EndpointSource] -> ShowS
showList :: [EndpointSource] -> ShowS
Show)

-- | A resolved OTLP export endpoint and the source it was resolved from.
data TelemetryEndpoint = TelemetryEndpoint
    { TelemetryEndpoint -> Text
teUrl :: Text
    -- ^ The endpoint URL the exporter targets (always @http\/protobuf@).
    , TelemetryEndpoint -> EndpointSource
teSource :: EndpointSource
    -- ^ How the URL was resolved.
    }
    deriving stock (TelemetryEndpoint -> TelemetryEndpoint -> Bool
(TelemetryEndpoint -> TelemetryEndpoint -> Bool)
-> (TelemetryEndpoint -> TelemetryEndpoint -> Bool)
-> Eq TelemetryEndpoint
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: TelemetryEndpoint -> TelemetryEndpoint -> Bool
== :: TelemetryEndpoint -> TelemetryEndpoint -> Bool
$c/= :: TelemetryEndpoint -> TelemetryEndpoint -> Bool
/= :: TelemetryEndpoint -> TelemetryEndpoint -> Bool
Eq, Int -> TelemetryEndpoint -> ShowS
[TelemetryEndpoint] -> ShowS
TelemetryEndpoint -> String
(Int -> TelemetryEndpoint -> ShowS)
-> (TelemetryEndpoint -> String)
-> ([TelemetryEndpoint] -> ShowS)
-> Show TelemetryEndpoint
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> TelemetryEndpoint -> ShowS
showsPrec :: Int -> TelemetryEndpoint -> ShowS
$cshow :: TelemetryEndpoint -> String
show :: TelemetryEndpoint -> String
$cshowList :: [TelemetryEndpoint] -> ShowS
showList :: [TelemetryEndpoint] -> ShowS
Show)

{- | The telemetry identity resolved from the environment: the single source of
truth for both the SDK configuration and the @dd@ log object. 'rtEnvironment' and
'rtVersion' are 'Nothing' when the operator named neither dialect's form -- they are
genuinely optional resource attributes, not defaulted to a placeholder.
-}
data ResolvedTelemetry = ResolvedTelemetry
    { ResolvedTelemetry -> Text
rtServiceName :: Text
    -- ^ @service.name@ \/ @dd.service@ (defaults to @ecluse@).
    , ResolvedTelemetry -> Maybe Text
rtEnvironment :: Maybe Text
    -- ^ @deployment.environment@ \/ @dd.env@, when configured.
    , ResolvedTelemetry -> Maybe Text
rtVersion :: Maybe Text
    -- ^ @service.version@ \/ @dd.version@, when configured.
    , ResolvedTelemetry -> TelemetryEndpoint
rtEndpoint :: TelemetryEndpoint
    -- ^ The resolved OTLP export endpoint.
    }
    deriving stock (ResolvedTelemetry -> ResolvedTelemetry -> Bool
(ResolvedTelemetry -> ResolvedTelemetry -> Bool)
-> (ResolvedTelemetry -> ResolvedTelemetry -> Bool)
-> Eq ResolvedTelemetry
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: ResolvedTelemetry -> ResolvedTelemetry -> Bool
== :: ResolvedTelemetry -> ResolvedTelemetry -> Bool
$c/= :: ResolvedTelemetry -> ResolvedTelemetry -> Bool
/= :: ResolvedTelemetry -> ResolvedTelemetry -> Bool
Eq, Int -> ResolvedTelemetry -> ShowS
[ResolvedTelemetry] -> ShowS
ResolvedTelemetry -> String
(Int -> ResolvedTelemetry -> ShowS)
-> (ResolvedTelemetry -> String)
-> ([ResolvedTelemetry] -> ShowS)
-> Show ResolvedTelemetry
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> ResolvedTelemetry -> ShowS
showsPrec :: Int -> ResolvedTelemetry -> ShowS
$cshow :: ResolvedTelemetry -> String
show :: ResolvedTelemetry -> String
$cshowList :: [ResolvedTelemetry] -> ShowS
showList :: [ResolvedTelemetry] -> ShowS
Show)

{- | Resolve the telemetry identity from an environment list, each field
__Datadog-value-wins → vanilla OpenTelemetry → default__. @service.name@ falls
@DD_SERVICE@ → @OTEL_SERVICE_NAME@ → @service.name@ in @OTEL_RESOURCE_ATTRIBUTES@ →
@ecluse@; @deployment.environment@ and @service.version@ fall @DD_ENV@\/@DD_VERSION@
→ the matching @OTEL_RESOURCE_ATTRIBUTES@ key → unset; the endpoint is @DD_AGENT_HOST@
(as @http:\/\/{host}:4318@) → @OTEL_EXPORTER_OTLP_ENDPOINT@ → @http:\/\/localhost:4318@.

A value present but blank is treated as unset, so an empty @DD_ENV=@ does not stamp an
empty environment onto every signal. @DD_API_KEY@ and @DD_SITE@ are never consulted.

>>> rtServiceName (resolveTelemetry [("DD_SERVICE", "api"), ("OTEL_SERVICE_NAME", "ignored")])
"api"

>>> teUrl (rtEndpoint (resolveTelemetry []))
"http://localhost:4318"
-}
resolveTelemetry :: [(String, String)] -> ResolvedTelemetry
resolveTelemetry :: [(String, String)] -> ResolvedTelemetry
resolveTelemetry [(String, String)]
environment =
    ResolvedTelemetry
        { rtServiceName :: Text
rtServiceName = Text -> Maybe Text -> Text
forall a. a -> Maybe a -> a
fromMaybe Text
defaultServiceName Maybe Text
serviceName
        , rtEnvironment :: Maybe Text
rtEnvironment = String -> Maybe Text
lk String
"DD_ENV" Maybe Text -> Maybe Text -> Maybe Text
forall a. Maybe a -> Maybe a -> Maybe a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Text -> Maybe Text
attr Text
"deployment.environment"
        , rtVersion :: Maybe Text
rtVersion = String -> Maybe Text
lk String
"DD_VERSION" Maybe Text -> Maybe Text -> Maybe Text
forall a. Maybe a -> Maybe a -> Maybe a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Text -> Maybe Text
attr Text
"service.version"
        , rtEndpoint :: TelemetryEndpoint
rtEndpoint = TelemetryEndpoint
endpoint
        }
  where
    lk :: String -> Maybe Text
    lk :: String -> Maybe Text
lk String
name = Text -> Maybe Text
nonBlank (Text -> Maybe Text) -> (String -> Text) -> String -> Maybe Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Text
forall a. ToText a => a -> Text
toText (String -> Maybe Text) -> Maybe String -> Maybe Text
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< String -> [(String, String)] -> Maybe String
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup String
name [(String, String)]
environment

    attrs :: Map Text Text
    attrs :: Map Text Text
attrs = Map Text Text
-> (Text -> Map Text Text) -> Maybe Text -> Map Text Text
forall b a. b -> (a -> b) -> Maybe a -> b
maybe Map Text Text
forall k a. Map k a
Map.empty Text -> Map Text Text
parseResourceAttributes (String -> Maybe Text
lk String
"OTEL_RESOURCE_ATTRIBUTES")

    attr :: Text -> Maybe Text
    attr :: Text -> Maybe Text
attr Text
key = Text -> Maybe Text
nonBlank (Text -> Maybe Text) -> Maybe Text -> Maybe Text
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< Text -> Map Text Text -> Maybe Text
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Text
key Map Text Text
attrs

    serviceName :: Maybe Text
    serviceName :: Maybe Text
serviceName = String -> Maybe Text
lk String
"DD_SERVICE" Maybe Text -> Maybe Text -> Maybe Text
forall a. Maybe a -> Maybe a -> Maybe a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> String -> Maybe Text
lk String
"OTEL_SERVICE_NAME" Maybe Text -> Maybe Text -> Maybe Text
forall a. Maybe a -> Maybe a -> Maybe a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Text -> Maybe Text
attr Text
"service.name"

    endpoint :: TelemetryEndpoint
    endpoint :: TelemetryEndpoint
endpoint = case String -> Maybe Text
lk String
"DD_AGENT_HOST" of
        Just Text
host -> Text -> EndpointSource -> TelemetryEndpoint
TelemetryEndpoint (Text -> Text
agentHostUrl Text
host) EndpointSource
FromDdAgentHost
        Maybe Text
Nothing -> case String -> Maybe Text
lk String
"OTEL_EXPORTER_OTLP_ENDPOINT" of
            Just Text
url -> Text -> EndpointSource -> TelemetryEndpoint
TelemetryEndpoint Text
url EndpointSource
FromOtelEndpoint
            Maybe Text
Nothing -> Text -> EndpointSource -> TelemetryEndpoint
TelemetryEndpoint Text
defaultEndpointUrl EndpointSource
DefaultedEndpoint

defaultServiceName :: Text
defaultServiceName :: Text
defaultServiceName = Text
"ecluse"

defaultEndpointUrl :: Text
defaultEndpointUrl :: Text
defaultEndpointUrl = Text
"http://localhost:4318"

{- Build the OTLP HTTP\/protobuf endpoint URL for a Datadog Agent host: the Agent's
OTLP receiver listens on 4318 for HTTP\/protobuf, the only transport we build. A
literal IPv6 host is bracketed so the authority is well-formed -- @http:\/\/[fd00::1]:4318@,
not the invalid @http:\/\/fd00::1:4318@ the SDK exporter would fail to parse. A host
that already carries a scheme is used verbatim, and one already carrying a port is not
given a second, so a deliberately-qualified @DD_AGENT_HOST@ is never mangled. Colon
count disambiguates: a bare IPv6 literal has two or more colons, a @host:port@ exactly
one, and a bare host or IPv4 none. -}
agentHostUrl :: Text -> Text
agentHostUrl :: Text -> Text
agentHostUrl Text
raw
    | Text
"://" Text -> Text -> Bool
`T.isInfixOf` Text
host = Text
host
    | Bool
otherwise = Text
"http://" Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
authority
  where
    host :: Text
host = Text -> Text
T.strip Text
raw
    authority :: Text
authority
        | Text
"[" Text -> Text -> Bool
`T.isPrefixOf` Text
host = if Text
"]:" Text -> Text -> Bool
`T.isInfixOf` Text
host then Text
host else Text
host Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
":4318"
        | HasCallStack => Text -> Text -> Int
Text -> Text -> Int
T.count Text
":" Text
host Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
2 = Text
"[" Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
host Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"]:4318"
        | HasCallStack => Text -> Text -> Int
Text -> Text -> Int
T.count Text
":" Text
host Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
1 = Text
host
        | Bool
otherwise = Text
host Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
":4318"

{- | Project the resolved identity back to the canonical @OTEL_*@ variables the
env-driven SDK reads, so a @DD_*@-only deployment still configures the exporter. The
overrides set @OTEL_SERVICE_NAME@, the OTLP endpoint, the @http\/protobuf@ protocol
(the only transport built -- gRPC is behind a disabled cabal flag), and an
@OTEL_RESOURCE_ATTRIBUTES@ whose @service.name@\/@deployment.environment@\/
@service.version@ keys are overlaid by the resolution while any other operator-set
attributes are preserved.

Applied with 'System.Environment.setEnv' before the SDK initialises (see
'prepareTelemetry'); idempotent for a vanilla deployment that already set the same
@OTEL_*@ values.
-}
otelEnvironmentOverrides :: [(String, String)] -> [(String, String)]
otelEnvironmentOverrides :: [(String, String)] -> [(String, String)]
otelEnvironmentOverrides [(String, String)]
environment =
    [ (String
"OTEL_SERVICE_NAME", Text -> String
forall a. ToString a => a -> String
toString (ResolvedTelemetry -> Text
rtServiceName ResolvedTelemetry
resolved))
    , (String
"OTEL_EXPORTER_OTLP_ENDPOINT", Text -> String
forall a. ToString a => a -> String
toString (TelemetryEndpoint -> Text
teUrl (ResolvedTelemetry -> TelemetryEndpoint
rtEndpoint ResolvedTelemetry
resolved)))
    , (String
"OTEL_EXPORTER_OTLP_PROTOCOL", String
"http/protobuf")
    , (String
"OTEL_RESOURCE_ATTRIBUTES", Text -> String
forall a. ToString a => a -> String
toString (Map Text Text -> Text
renderResourceAttributes (ResolvedTelemetry -> [(String, String)] -> Map Text Text
mergedResourceAttributes ResolvedTelemetry
resolved [(String, String)]
environment)))
    ]
  where
    resolved :: ResolvedTelemetry
    resolved :: ResolvedTelemetry
resolved = [(String, String)] -> ResolvedTelemetry
resolveTelemetry [(String, String)]
environment

mergedResourceAttributes :: ResolvedTelemetry -> [(String, String)] -> Map Text Text
mergedResourceAttributes :: ResolvedTelemetry -> [(String, String)] -> Map Text Text
mergedResourceAttributes ResolvedTelemetry
resolved [(String, String)]
environment =
    -- Left-biased union: a resolved attribute must win over an inherited
    -- OTEL_RESOURCE_ATTRIBUTES value of the same key, so the resolved map sits on the
    -- LEFT of (<>) ('Map.union' is left-biased). Reversing the operands would let a
    -- stale operator-set value silently override the resolution.
    Map Text Text
resolvedAttrs Map Text Text -> Map Text Text -> Map Text Text
forall a. Semigroup a => a -> a -> a
<> Map Text Text
existing
  where
    existing :: Map Text Text
    existing :: Map Text Text
existing =
        Map Text Text
-> (Text -> Map Text Text) -> Maybe Text -> Map Text Text
forall b a. b -> (a -> b) -> Maybe a -> b
maybe
            Map Text Text
forall k a. Map k a
Map.empty
            Text -> Map Text Text
parseResourceAttributes
            (Text -> Maybe Text
nonBlank (Text -> Maybe Text) -> (String -> Text) -> String -> Maybe Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Text
forall a. ToText a => a -> Text
toText (String -> Maybe Text) -> Maybe String -> Maybe Text
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< String -> [(String, String)] -> Maybe String
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup String
"OTEL_RESOURCE_ATTRIBUTES" [(String, String)]
environment)

    resolvedAttrs :: Map Text Text
    resolvedAttrs :: Map Text Text
resolvedAttrs =
        [(Text, Text)] -> Map Text Text
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList
            [ (Text
key, Text
value)
            | (Text
key, Just Text
value) <-
                [ (Text
"service.name", Text -> Maybe Text
forall a. a -> Maybe a
Just (ResolvedTelemetry -> Text
rtServiceName ResolvedTelemetry
resolved))
                , (Text
"deployment.environment", ResolvedTelemetry -> Maybe Text
rtEnvironment ResolvedTelemetry
resolved)
                , (Text
"service.version", ResolvedTelemetry -> Maybe Text
rtVersion ResolvedTelemetry
resolved)
                ]
            ]

-- Parse the @key1=value1,key2=value2@ resource-attribute string into a map,
-- trimming surrounding whitespace and dropping any entry that carries no @=@ or an
-- empty key. Lenient by design -- this is operator-authored configuration, not a
-- wire format -- so a stray trailing comma or spacing is tolerated.
parseResourceAttributes :: Text -> Map Text Text
parseResourceAttributes :: Text -> Map Text Text
parseResourceAttributes Text
raw =
    [(Text, Text)] -> Map Text Text
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList
        [ (Text
key, Text -> Text
T.strip (Int -> Text -> Text
T.drop Int
1 Text
value))
        | Text
pair <- HasCallStack => Text -> Text -> [Text]
Text -> Text -> [Text]
T.splitOn Text
"," Text
raw
        , let (Text
before, Text
value) = HasCallStack => Text -> Text -> (Text, Text)
Text -> Text -> (Text, Text)
T.breakOn Text
"=" Text
pair
        , let key :: Text
key = Text -> Text
T.strip Text
before
        , Bool -> Bool
not (Text -> Bool
T.null Text
key)
        , Bool -> Bool
not (Text -> Bool
T.null Text
value)
        ]

-- Render a resource-attribute map back to the @key1=value1,key2=value2@ form, in
-- key order so the projection is deterministic.
renderResourceAttributes :: Map Text Text -> Text
renderResourceAttributes :: Map Text Text -> Text
renderResourceAttributes =
    Text -> [Text] -> Text
T.intercalate Text
"," ([Text] -> Text)
-> (Map Text Text -> [Text]) -> Map Text Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Text, Text) -> Text) -> [(Text, Text)] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map (\(Text
key, Text
value) -> Text
key Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"=" Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
value) ([(Text, Text)] -> [Text])
-> (Map Text Text -> [(Text, Text)]) -> Map Text Text -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Map Text Text -> [(Text, Text)]
forall k a. Map k a -> [(k, a)]
Map.toList

defaultedEndpointMessage :: Text -> Text
defaultedEndpointMessage :: Text -> Text
defaultedEndpointMessage Text
url =
    Text
"no telemetry export endpoint configured (DD_AGENT_HOST / OTEL_EXPORTER_OTLP_ENDPOINT unset); defaulting to "
        Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
url
        Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
"."

{- | The throttle state for SDK export-error routing: when an error was last
logged, and how many have been suppressed since. Exposed so the throttle decision
is unit-tested without wall-clock timing.
-}
data ThrottleState = ThrottleState
    { ThrottleState -> Maybe UTCTime
tsLastLogged :: Maybe UTCTime
    -- ^ When an error was last surfaced ('Nothing' before the first).
    , ThrottleState -> Int
tsSuppressed :: Int
    -- ^ Errors suppressed since the last surfaced one.
    }
    deriving stock (ThrottleState -> ThrottleState -> Bool
(ThrottleState -> ThrottleState -> Bool)
-> (ThrottleState -> ThrottleState -> Bool) -> Eq ThrottleState
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: ThrottleState -> ThrottleState -> Bool
== :: ThrottleState -> ThrottleState -> Bool
$c/= :: ThrottleState -> ThrottleState -> Bool
/= :: ThrottleState -> ThrottleState -> Bool
Eq, Int -> ThrottleState -> ShowS
[ThrottleState] -> ShowS
ThrottleState -> String
(Int -> ThrottleState -> ShowS)
-> (ThrottleState -> String)
-> ([ThrottleState] -> ShowS)
-> Show ThrottleState
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> ThrottleState -> ShowS
showsPrec :: Int -> ThrottleState -> ShowS
$cshow :: ThrottleState -> String
show :: ThrottleState -> String
$cshowList :: [ThrottleState] -> ShowS
showList :: [ThrottleState] -> ShowS
Show)

-- | What 'throttleStep' decided to do with an export error.
data ThrottleEmit
    = -- | The first error: surface it plainly.
      EmitFirst
    | {- | The throttle window elapsed: surface a heartbeat carrying the count of
      errors since the last surfaced one (this one included).
      -}
      EmitHeartbeat Int
    | -- | Within the window: suppress and count.
      EmitSuppress
    deriving stock (ThrottleEmit -> ThrottleEmit -> Bool
(ThrottleEmit -> ThrottleEmit -> Bool)
-> (ThrottleEmit -> ThrottleEmit -> Bool) -> Eq ThrottleEmit
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: ThrottleEmit -> ThrottleEmit -> Bool
== :: ThrottleEmit -> ThrottleEmit -> Bool
$c/= :: ThrottleEmit -> ThrottleEmit -> Bool
/= :: ThrottleEmit -> ThrottleEmit -> Bool
Eq, Int -> ThrottleEmit -> ShowS
[ThrottleEmit] -> ShowS
ThrottleEmit -> String
(Int -> ThrottleEmit -> ShowS)
-> (ThrottleEmit -> String)
-> ([ThrottleEmit] -> ShowS)
-> Show ThrottleEmit
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> ThrottleEmit -> ShowS
showsPrec :: Int -> ThrottleEmit -> ShowS
$cshow :: ThrottleEmit -> String
show :: ThrottleEmit -> String
$cshowList :: [ThrottleEmit] -> ShowS
showList :: [ThrottleEmit] -> ShowS
Show)

-- | The initial throttle state: nothing logged, nothing suppressed.
initialThrottle :: ThrottleState
initialThrottle :: ThrottleState
initialThrottle = Maybe UTCTime -> Int -> ThrottleState
ThrottleState Maybe UTCTime
forall a. Maybe a
Nothing Int
0

-- | How long export errors are coalesced between surfaced heartbeats.
throttleInterval :: NominalDiffTime
throttleInterval :: NominalDiffTime
throttleInterval = NominalDiffTime
60

{- | Advance the throttle for one export error at @now@: surface the first error,
surface a heartbeat once the 'throttleInterval' has elapsed since the last surfaced
one (resetting the suppressed count), and otherwise suppress while counting. Pure,
so a sequence of @(time, decision)@ steps is asserted directly.
-}
throttleStep :: NominalDiffTime -> UTCTime -> ThrottleState -> (ThrottleState, ThrottleEmit)
throttleStep :: NominalDiffTime
-> UTCTime -> ThrottleState -> (ThrottleState, ThrottleEmit)
throttleStep NominalDiffTime
interval UTCTime
now ThrottleState
st = case ThrottleState -> Maybe UTCTime
tsLastLogged ThrottleState
st of
    Maybe UTCTime
Nothing -> (Maybe UTCTime -> Int -> ThrottleState
ThrottleState (UTCTime -> Maybe UTCTime
forall a. a -> Maybe a
Just UTCTime
now) Int
0, ThrottleEmit
EmitFirst)
    Just UTCTime
lastLogged
        | UTCTime -> UTCTime -> NominalDiffTime
diffUTCTime UTCTime
now UTCTime
lastLogged NominalDiffTime -> NominalDiffTime -> Bool
forall a. Ord a => a -> a -> Bool
>= NominalDiffTime
interval ->
            (Maybe UTCTime -> Int -> ThrottleState
ThrottleState (UTCTime -> Maybe UTCTime
forall a. a -> Maybe a
Just UTCTime
now) Int
0, Int -> ThrottleEmit
EmitHeartbeat (ThrottleState -> Int
tsSuppressed ThrottleState
st Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1))
        | Bool
otherwise ->
            (ThrottleState
st{tsSuppressed = tsSuppressed st + 1}, ThrottleEmit
EmitSuppress)

{- | Prepare the telemetry substrate at boot, before the SDK initialises: resolve the
identity and normalise the canonical @OTEL_*@ environment the env-driven SDK reads (so a
@DD_*@-only deployment still configures the exporter). The export-failure observation
itself is wired when the substrate stands up ("Ecluse.Telemetry.withTelemetry"), which
builds the shared sink and installs the exporter wrappers and the SDK error handler.

A defaulted endpoint -- neither @DD_AGENT_HOST@ nor @OTEL_EXPORTER_OTLP_ENDPOINT@ set --
is surfaced through @katip@ as one boot warning and falls back to
@http:\/\/localhost:4318@; it is never a failure. The OTLP endpoint is an
__operator-declared destination__ (like the mirror queue), so it is normalised and used
as given, not classified or gated.
-}
prepareTelemetry :: LogEnv -> [(String, String)] -> IO ()
prepareTelemetry :: LogEnv -> [(String, String)] -> IO ()
prepareTelemetry LogEnv
logEnv [(String, String)]
environment = do
    let resolved :: ResolvedTelemetry
resolved = [(String, String)] -> ResolvedTelemetry
resolveTelemetry [(String, String)]
environment
    Bool -> IO () -> IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (TelemetryEndpoint -> EndpointSource
teSource (ResolvedTelemetry -> TelemetryEndpoint
rtEndpoint ResolvedTelemetry
resolved) EndpointSource -> EndpointSource -> Bool
forall a. Eq a => a -> a -> Bool
== EndpointSource
DefaultedEndpoint) (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$
        LogEnv -> Severity -> Text -> IO ()
logResolve LogEnv
logEnv Severity
WarningS (Text -> Text
defaultedEndpointMessage (TelemetryEndpoint -> Text
teUrl (ResolvedTelemetry -> TelemetryEndpoint
rtEndpoint ResolvedTelemetry
resolved)))
    ((String, String) -> IO ()) -> [(String, String)] -> IO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ ((String -> String -> IO ()) -> (String, String) -> IO ()
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry String -> String -> IO ()
setEnv) ([(String, String)] -> [(String, String)]
otelEnvironmentOverrides [(String, String)]
environment)

{- | The shared export-failure sink: a single throttle plus the @katip@ target that
every export failure feeds -- the span exporter, the metric exporter, and the SDK's own
diagnostic stream -- so a persistently unreachable collector is one coalesced stream (the
first failure plainly, then a periodic heartbeat) rather than several independent floods.

The clock and the surfacing action are injected so the throttle decision is unit-tested
without wall-clock timing or a live @katip@ scribe (mirroring the pure 'throttleStep'
tests); 'exportFailureSink' wires the production clock and @katip@ target.
-}
data ExportFailureSink = ExportFailureSink
    { ExportFailureSink -> IO UTCTime
sinkNow :: IO UTCTime
    , ExportFailureSink -> IORef ThrottleState
sinkState :: IORef ThrottleState
    , ExportFailureSink -> Severity -> Text -> IO ()
sinkSurface :: Severity -> Text -> IO ()
    }

-- | Build an export-failure sink over an injected clock and surfacing action.
newExportFailureSink :: IO UTCTime -> (Severity -> Text -> IO ()) -> IO ExportFailureSink
newExportFailureSink :: IO UTCTime -> (Severity -> Text -> IO ()) -> IO ExportFailureSink
newExportFailureSink IO UTCTime
now Severity -> Text -> IO ()
surface = do
    throttleRef <- ThrottleState -> IO (IORef ThrottleState)
forall (m :: * -> *) a. MonadIO m => a -> m (IORef a)
newIORef ThrottleState
initialThrottle
    pure ExportFailureSink{sinkNow = now, sinkState = throttleRef, sinkSurface = surface}

{- | The production sink: the wall clock and the composition-root 'LogEnv' as the @katip@
target, tagged with this module (the plain-'IO' @katip@ path the boot phase uses).
-}
exportFailureSink :: LogEnv -> IO ExportFailureSink
exportFailureSink :: LogEnv -> IO ExportFailureSink
exportFailureSink LogEnv
logEnv = IO UTCTime -> (Severity -> Text -> IO ()) -> IO ExportFailureSink
newExportFailureSink IO UTCTime
getCurrentTime (LogEnv -> Severity -> Text -> IO ()
logResolve LogEnv
logEnv)

{- | Route one export-failure diagnostic through the shared throttle into @katip@: the
first surfaced plainly, a heartbeat carrying the suppressed count once 'throttleInterval'
has elapsed since the last surfaced one, otherwise suppressed and counted.
-}
routeExportFailure :: ExportFailureSink -> Text -> IO ()
routeExportFailure :: ExportFailureSink -> Text -> IO ()
routeExportFailure ExportFailureSink
sink Text
diagnostic = do
    now <- ExportFailureSink -> IO UTCTime
sinkNow ExportFailureSink
sink
    emit <- atomicModifyIORef' (sinkState sink) (throttleStep throttleInterval now)
    case emit of
        ThrottleEmit
EmitFirst -> ExportFailureSink -> Severity -> Text -> IO ()
sinkSurface ExportFailureSink
sink Severity
WarningS (Text -> Text
firstErrorMessage Text
diagnostic)
        EmitHeartbeat Int
suppressed -> ExportFailureSink -> Severity -> Text -> IO ()
sinkSurface ExportFailureSink
sink Severity
WarningS (Int -> Text -> Text
heartbeatMessage Int
suppressed Text
diagnostic)
        ThrottleEmit
EmitSuppress -> IO ()
forall (f :: * -> *). Applicative f => f ()
pass

{- | Observe one exporter's 'ExportResult', routing a 'Failure' through the sink and
ignoring a 'Success'. This only /observes/ the failure -- the inner result is the
caller's to return unchanged, so export semantics are untouched (a failed export stays
off the request path). @signal@ names the failing exporter (@span@ \/ @metric@).
-}
observeExportResult :: ExportFailureSink -> Text -> ExportResult -> IO ()
observeExportResult :: ExportFailureSink -> Text -> ExportResult -> IO ()
observeExportResult ExportFailureSink
sink Text
signal = \case
    ExportResult
Success -> IO ()
forall (f :: * -> *). Applicative f => f ()
pass
    Failure Maybe SomeException
mErr -> ExportFailureSink -> Text -> IO ()
routeExportFailure ExportFailureSink
sink (Text
signal Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
" export failed" Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text -> (SomeException -> Text) -> Maybe SomeException -> Text
forall b a. b -> (a -> b) -> Maybe a -> b
maybe Text
"" ((Text
": " Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<>) (Text -> Text) -> (SomeException -> Text) -> SomeException -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. SomeException -> Text
forall b a. (Show a, IsString b) => a -> b
show) Maybe SomeException
mErr)

{- | Install a process-global handler for the SDK's own diagnostic stream, routed through
the shared sink so it coalesces with the exporter-failure feed. In @hs-opentelemetry
1.0.0.0@ the only caller of this handler is the SDK's internal logging -- a failed OTLP
export is dropped there rather than routed here -- so the export-failure feed comes from
the exporter wrappers ('observeExportResult'); this handler is kept for the SDK-internal
diagnostics it still serves.

The forwarded diagnostic 'String' is the SDK's own text and is trusted not to carry
secrets: this module never reads the credential-bearing telemetry inputs
(@OTEL_EXPORTER_OTLP_HEADERS@, @DD_API_KEY@, @DD_SITE@), so the only residual channel is
whatever the SDK itself chooses to log, which the upstream exporter keeps to
endpoint/status diagnostics.
-}
installExportErrorHandler :: ExportFailureSink -> IO ()
installExportErrorHandler :: ExportFailureSink -> IO ()
installExportErrorHandler ExportFailureSink
sink = (String -> IO ()) -> IO ()
setGlobalErrorHandler (ExportFailureSink -> Text -> IO ()
routeExportFailure ExportFailureSink
sink (Text -> IO ()) -> (String -> Text) -> String -> IO ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Text
forall a. ToText a => a -> Text
toText)

firstErrorMessage :: Text -> Text
firstErrorMessage :: Text -> Text
firstErrorMessage Text
diagnostic =
    Text
"telemetry export error (subsequent identical errors are throttled): " Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
diagnostic

heartbeatMessage :: Int -> Text -> Text
heartbeatMessage :: Int -> Text -> Text
heartbeatMessage Int
suppressed Text
diagnostic =
    Text
"telemetry export still failing: "
        Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Int -> Text
forall b a. (Show a, IsString b) => a -> b
show Int
suppressed
        Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
" export errors since the last report. Latest: "
        Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
diagnostic

-- Log one line through the composition-root 'LogEnv', tagged with this module -- the
-- plain-'IO' katip path the boot phase uses (it holds no 'Handler' reader).
logResolve :: LogEnv -> Severity -> Text -> IO ()
logResolve :: LogEnv -> Severity -> Text -> IO ()
logResolve LogEnv
logEnv Severity
severity Text
message =
    LogEnv
-> SimpleLogPayload -> Namespace -> KatipContextT IO () -> IO ()
forall c (m :: * -> *) a.
LogItem c =>
LogEnv -> c -> Namespace -> KatipContextT m a -> m a
runKatipContextT LogEnv
logEnv (Text -> SimpleLogPayload
moduleField Text
"Ecluse.Telemetry.Resolve") Namespace
forall a. Monoid a => a
mempty (KatipContextT IO () -> IO ()) -> KatipContextT IO () -> IO ()
forall a b. (a -> b) -> a -> b
$
        Severity -> LogStr -> KatipContextT IO ()
forall (m :: * -> *).
(Applicative m, KatipContext m) =>
Severity -> LogStr -> m ()
logFM Severity
severity (Text -> LogStr
forall a. StringConv a Text => a -> LogStr
ls Text
message)