{- | The OpenTelemetry substrate: the tracer and meter providers the rest of the
proxy will hang spans and metrics on, behind a master switch that defaults to
__off__.

Écluse is a self-hosted proxy operators run inside their own infrastructure, so
observability is __opt-in and vendor-neutral__: the substrate is OpenTelemetry,
emitting OTLP that any compatible backend can receive. The maintainer's choice of backend (Datadog)
must never become every consumer's obligation, so __with @ECLUSE_TELEMETRY@ unset
nothing is wired and no telemetry is emitted__ -- the SDK is not even initialised.

This module is purely the __substrate__: it stands up (or, by default, declines to
stand up) the providers and brackets their lifecycle. The spans on the request
lifecycle and the metric instruments layer on top of this substrate; nothing
here instruments the hot path.

== The switch and the handle

'TelemetrySwitch' is the @ECLUSE_TELEMETRY@ master switch, parsed at the
configuration boundary ("Ecluse.Config") in the same strict, fail-loud style as
the other enums. The 'Telemetry' handle it produces is one of two shapes:

* __'telemetryDisabled'__ -- the off-by-default no-op. It holds no providers, the
  SDK is never initialised, and nothing is exported. This is what an unset
  @ECLUSE_TELEMETRY@ yields.
* an __enabled__ handle carrying the SDK's tracer and meter providers, built from
  the standard @OTEL_*@ environment variables the SDK reads directly
  (@OTEL_SERVICE_NAME@, @OTEL_RESOURCE_ATTRIBUTES@, @OTEL_EXPORTER_OTLP_ENDPOINT@,
  @OTEL_EXPORTER_OTLP_PROTOCOL@, the sampler). The OTLP exporter defaults to
  HTTP\/protobuf; gRPC stays behind the exporter's cabal flag, off.

'withTelemetry' is the lifecycle bracket the composition root ("Ecluse.Env") runs
the proxy within: when enabled it initialises the providers and tears them down --
flushing buffered spans and metrics -- along every exit path; when disabled it is a
pure pass-through that opens nothing to tear down.

When enabled it also makes export failures __visible__: the OTLP span and metric
exporters are wrapped so a failed export -- which @hs-opentelemetry 1.0.0.0@ otherwise
drops silently -- is observed and routed through the shared @katip@ throttle
("Ecluse.Telemetry.Resolve"), the first failure logged plainly then a periodic
heartbeat. The wrappers only /observe/; export semantics are unchanged, so an
unreachable collector still degrades off the request path.

The configuration model and the signal catalogue are described in
@docs\/architecture\/observability.md@.
-}
module Ecluse.Telemetry (
    -- * Master switch
    TelemetrySwitch (..),
    parseTelemetrySwitch,
    renderTelemetrySwitch,

    -- * The telemetry handle
    Telemetry (..),
    TelemetryProviders (..),
    telemetryDisabled,
    telemetryEnabled,
    telemetryTracerProvider,
    telemetryMeterProvider,

    -- * Lifecycle
    withTelemetry,

    -- * Export-failure observation (exporter wrappers)
    observeSpanExporter,
    observeMetricExporter,
) where

import Katip (LogEnv)
import OpenTelemetry.Environment (lookupBooleanEnv)
import OpenTelemetry.Exporter.Metric (MetricExporter (..))
import OpenTelemetry.Exporter.OTLP.Span (loadExporterEnvironmentVariables, otlpExporter)
import OpenTelemetry.Exporter.Span (SpanExporter (..))
import OpenTelemetry.Log (initializeGlobalLoggerProvider, shutdownLoggerProvider)
import OpenTelemetry.Metric (
    MeterProvider (..),
    PeriodicMetricReaderHandle (..),
    createMeterProvider,
    defaultSdkMeterProviderOptions,
    forkPeriodicMetricReader,
    noopMeterProvider,
    periodicMetricReaderOptionsFromEnv,
    resolveMetricExporter,
    setGlobalMeterProvider,
    shutdownMeterProvider,
 )
import OpenTelemetry.Registry (registerSpanExporterFactory)
import OpenTelemetry.Resource (materializeResources, mergeResources, mkResource)
import OpenTelemetry.Resource.Detect (detectBuiltInResources, detectResourceAttributes)
import OpenTelemetry.SDK (OTelSignals (..))
import OpenTelemetry.Trace (TracerProvider, initializeGlobalTracerProvider, shutdownTracerProvider)
import UnliftIO (bracket)
import UnliftIO.Exception (catchAny)

import Ecluse.Telemetry.Resolve (
    ExportFailureSink,
    exportFailureSink,
    installExportErrorHandler,
    observeExportResult,
 )

import Ecluse.Core.Wire (WireVocab (..), parseWire, renderWire)

{- | The @ECLUSE_TELEMETRY@ master switch: telemetry is opt-in, so 'TelemetryOff' is
the default and the FOSS posture. A sum type rather than a 'Bool' so each case
names its intent and a future mode (e.g. a metrics-only switch) is a new
constructor, not a second flag.
-}
data TelemetrySwitch
    = -- | Telemetry is disabled (the default): nothing is wired and nothing is emitted.
      TelemetryOff
    | {- | Telemetry is enabled: the SDK providers are built from the standard
      @OTEL_*@ environment and the OTLP exporter is active.
      -}
      TelemetryOn
    deriving stock (TelemetrySwitch -> TelemetrySwitch -> Bool
(TelemetrySwitch -> TelemetrySwitch -> Bool)
-> (TelemetrySwitch -> TelemetrySwitch -> Bool)
-> Eq TelemetrySwitch
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: TelemetrySwitch -> TelemetrySwitch -> Bool
== :: TelemetrySwitch -> TelemetrySwitch -> Bool
$c/= :: TelemetrySwitch -> TelemetrySwitch -> Bool
/= :: TelemetrySwitch -> TelemetrySwitch -> Bool
Eq, Int -> TelemetrySwitch -> ShowS
[TelemetrySwitch] -> ShowS
TelemetrySwitch -> String
(Int -> TelemetrySwitch -> ShowS)
-> (TelemetrySwitch -> String)
-> ([TelemetrySwitch] -> ShowS)
-> Show TelemetrySwitch
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> TelemetrySwitch -> ShowS
showsPrec :: Int -> TelemetrySwitch -> ShowS
$cshow :: TelemetrySwitch -> String
show :: TelemetrySwitch -> String
$cshowList :: [TelemetrySwitch] -> ShowS
showList :: [TelemetrySwitch] -> ShowS
Show)

-- The wire vocabulary of a 'TelemetrySwitch': the single source both 'parseWire' and
-- 'renderWire' derive from for this type. Listed @on@ before @off@, the order the
-- accepted-set message has always named them.
instance WireVocab TelemetrySwitch where
    wireKind :: Text
wireKind = Text
"telemetry switch"
    wireTable :: NonEmpty (TelemetrySwitch, Text)
wireTable =
        (TelemetrySwitch
TelemetryOn, Text
"on")
            (TelemetrySwitch, Text)
-> [(TelemetrySwitch, Text)] -> NonEmpty (TelemetrySwitch, Text)
forall a. a -> [a] -> NonEmpty a
:| [(TelemetrySwitch
TelemetryOff, Text
"off")]

{- | Parse a 'TelemetrySwitch' from its wire name, naming the accepted set on
failure. The same strict, fail-loud style as the other configuration enums
("Ecluse.Config"): an unrecognised value is a loud failure, never a silent
fallback to one mode or the other.

>>> parseTelemetrySwitch "off"
Right TelemetryOff

>>> parseTelemetrySwitch "on"
Right TelemetryOn

>>> parseTelemetrySwitch "maybe"
Left "unknown telemetry switch \"maybe\" (expected one of: on, off)"
-}
parseTelemetrySwitch :: Text -> Either Text TelemetrySwitch
parseTelemetrySwitch :: Text -> Either Text TelemetrySwitch
parseTelemetrySwitch = Text -> Either Text TelemetrySwitch
forall a. WireVocab a => Text -> Either Text a
parseWire

-- | The wire name of a 'TelemetrySwitch' (the inverse of 'parseTelemetrySwitch').
renderTelemetrySwitch :: TelemetrySwitch -> Text
renderTelemetrySwitch :: TelemetrySwitch -> Text
renderTelemetrySwitch = TelemetrySwitch -> Text
forall a. (Eq a, WireVocab a) => a -> Text
renderWire

{- | The telemetry handle held in the composition root: either the off-by-default
no-op or the enabled providers. Spans and metric instruments are derived from the
providers it carries; the disabled case carries none, so a layer that reaches for a
provider finds nothing to emit through -- telemetry is inert, not merely
unsampled.
-}
data Telemetry
    = -- | The off-by-default no-op: no providers, nothing emitted.
      TelemetryDisabled
    | {- | The enabled handle carrying the SDK's providers, built from the standard
      @OTEL_*@ environment. The providers live in a 'TelemetryProviders' product so
      neither field is a partial record selector on this sum.
      -}
      TelemetryEnabled TelemetryProviders

{- | The SDK providers an enabled 'Telemetry' handle carries -- a total product, so
its fields are not partial selectors over the 'Telemetry' sum.
-}
data TelemetryProviders = TelemetryProviders
    { TelemetryProviders -> TracerProvider
tpTracerProvider :: TracerProvider
    -- ^ The SDK tracer provider spans are created through.
    , TelemetryProviders -> MeterProvider
tpMeterProvider :: MeterProvider
    -- ^ The SDK meter provider metric instruments are created through.
    }

{- | The disabled telemetry handle: the off-by-default no-op that holds no
providers and emits nothing. This is what an unset @ECLUSE_TELEMETRY@ resolves to.
-}
telemetryDisabled :: Telemetry
telemetryDisabled :: Telemetry
telemetryDisabled = Telemetry
TelemetryDisabled

{- | Build an enabled telemetry handle from the SDK signals -- the tracer and meter
providers. The disabled case has no constructor argument, so this is the only way
to obtain an enabled handle, keeping its providers' origin (the bracketed SDK
lifecycle) explicit.
-}
telemetryEnabled :: OTelSignals -> Telemetry
telemetryEnabled :: OTelSignals -> Telemetry
telemetryEnabled OTelSignals
signals =
    TelemetryProviders -> Telemetry
TelemetryEnabled
        TelemetryProviders
            { tpTracerProvider :: TracerProvider
tpTracerProvider = OTelSignals -> TracerProvider
otelTracerProvider OTelSignals
signals
            , tpMeterProvider :: MeterProvider
tpMeterProvider = OTelSignals -> MeterProvider
otelMeterProvider OTelSignals
signals
            }

{- | The tracer provider a 'Telemetry' handle exposes, 'Nothing' when telemetry is
disabled. A caller that wants to create a span resolves this first; 'Nothing' is
the signal to emit nothing rather than to fabricate a no-op provider at the edge.
-}
telemetryTracerProvider :: Telemetry -> Maybe TracerProvider
telemetryTracerProvider :: Telemetry -> Maybe TracerProvider
telemetryTracerProvider = \case
    Telemetry
TelemetryDisabled -> Maybe TracerProvider
forall a. Maybe a
Nothing
    TelemetryEnabled TelemetryProviders
providers -> TracerProvider -> Maybe TracerProvider
forall a. a -> Maybe a
Just (TelemetryProviders -> TracerProvider
tpTracerProvider TelemetryProviders
providers)

{- | The meter provider a 'Telemetry' handle exposes, 'Nothing' when telemetry is
disabled (the dual of 'telemetryTracerProvider' for metric instruments).
-}
telemetryMeterProvider :: Telemetry -> Maybe MeterProvider
telemetryMeterProvider :: Telemetry -> Maybe MeterProvider
telemetryMeterProvider = \case
    Telemetry
TelemetryDisabled -> Maybe MeterProvider
forall a. Maybe a
Nothing
    TelemetryEnabled TelemetryProviders
providers -> MeterProvider -> Maybe MeterProvider
forall a. a -> Maybe a
Just (TelemetryProviders -> MeterProvider
tpMeterProvider TelemetryProviders
providers)

{- | Run an action with a 'Telemetry' handle whose lifecycle is bracketed by the
'TelemetrySwitch', tearing the providers down -- flushing buffered spans and
metrics -- along every exit path.

* __'TelemetryOff'__ (the default) is a pure pass-through: the SDK is __never
  initialised__, the body runs against 'telemetryDisabled', the 'LogEnv' is unused,
  and there is nothing to tear down. An unset @ECLUSE_TELEMETRY@ therefore opens no
  exporter and emits nothing.
* __'TelemetryOn'__ initialises the SDK from the standard @OTEL_*@ environment with the
  OTLP exporters wrapped for failure observation (the shared throttle feeds the supplied
  'LogEnv'), runs the body against the enabled handle, and shuts the providers down on
  exit.

This is the scope the composition root ("Ecluse.Env") runs the server and worker
within, so telemetry is established once and flushed on shutdown.
-}
withTelemetry :: TelemetrySwitch -> LogEnv -> (Telemetry -> IO a) -> IO a
withTelemetry :: forall a. TelemetrySwitch -> LogEnv -> (Telemetry -> IO a) -> IO a
withTelemetry TelemetrySwitch
switch LogEnv
logEnv Telemetry -> IO a
use = case TelemetrySwitch
switch of
    TelemetrySwitch
TelemetryOff -> Telemetry -> IO a
use Telemetry
telemetryDisabled
    TelemetrySwitch
TelemetryOn -> do
        sink <- LogEnv -> IO ExportFailureSink
exportFailureSink LogEnv
logEnv
        installExportErrorHandler sink
        registerObservedSpanExporter sink
        bracket (initializeObservedOpenTelemetry sink) otelShutdown (use . telemetryEnabled)

{- Wrap the OTLP span exporter so a failed export is observed -- routed through the shared
'ExportFailureSink' into @katip@ under a throttle -- and the inner result returned
unchanged. @hs-opentelemetry 1.0.0.0@ drops a failed export silently (the batch processor
discards the 'ExportResult'), so this wrapper is where Écluse learns the export failed
without changing export semantics: the failure stays off the request path. -}
observeSpanExporter :: ExportFailureSink -> SpanExporter -> SpanExporter
observeSpanExporter :: ExportFailureSink -> SpanExporter -> SpanExporter
observeSpanExporter ExportFailureSink
sink SpanExporter
inner =
    SpanExporter
inner
        { spanExporterExport = \HashMap InstrumentationLibrary (Vector ImmutableSpan)
completedSpans -> do
            result <- SpanExporter
-> HashMap InstrumentationLibrary (Vector ImmutableSpan)
-> IO ExportResult
spanExporterExport SpanExporter
inner HashMap InstrumentationLibrary (Vector ImmutableSpan)
completedSpans
            observeExportResult sink "span" result
            pure result
        }

-- Dual of 'observeSpanExporter' for the periodic metric reader's exporter (which likewise
-- discards the 'ExportResult').
observeMetricExporter :: ExportFailureSink -> MetricExporter -> MetricExporter
observeMetricExporter :: ExportFailureSink -> MetricExporter -> MetricExporter
observeMetricExporter ExportFailureSink
sink MetricExporter
inner =
    MetricExporter
inner
        { metricExporterExport = \Vector ResourceMetricsExport
batches -> do
            result <- MetricExporter -> Vector ResourceMetricsExport -> IO ExportResult
metricExporterExport MetricExporter
inner Vector ResourceMetricsExport
batches
            observeExportResult sink "metric" result
            pure result
        }

{- Register the observed OTLP span exporter under the @otlp@ key before the SDK's
env-driven tracer init runs: 'initializeGlobalTracerProvider' resolves
@OTEL_TRACES_EXPORTER@ through the exporter 'OpenTelemetry.Registry', which prefers a
registered factory over the built-in default, so the wrapped exporter is the one the
batch processor drives. The metric path has no such registry hook, so it is wrapped
directly in 'initializeObservedMeterProvider'. -}
registerObservedSpanExporter :: ExportFailureSink -> IO ()
registerObservedSpanExporter :: ExportFailureSink -> IO ()
registerObservedSpanExporter ExportFailureSink
sink =
    Text -> IO SpanExporter -> IO ()
registerSpanExporterFactory
        Text
"otlp"
        (ExportFailureSink -> SpanExporter -> SpanExporter
observeSpanExporter ExportFailureSink
sink (SpanExporter -> SpanExporter)
-> IO SpanExporter -> IO SpanExporter
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (OTLPExporterConfig -> IO SpanExporter
forall (m :: * -> *).
MonadIO m =>
OTLPExporterConfig -> m SpanExporter
otlpExporter (OTLPExporterConfig -> IO SpanExporter)
-> IO OTLPExporterConfig -> IO SpanExporter
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< IO OTLPExporterConfig
forall (m :: * -> *). MonadIO m => m OTLPExporterConfig
loadExporterEnvironmentVariables))

{- Stand up the three SDK signal providers from the @OTEL_*@ environment with the OTLP
exporters wrapped for failure observation, mirroring @hs-opentelemetry-sdk@'s own
@initializeOpenTelemetry@. The tracer picks up the observed span exporter through the
registry ('registerObservedSpanExporter', run before this); the meter is built here
because the SDK's metric init exposes no registry hook for its exporter. This and
'initializeObservedMeterProvider' are pinned to @hs-opentelemetry-sdk 1.0.0.0@; re-diff
both against the SDK on any version bump. -}
initializeObservedOpenTelemetry :: ExportFailureSink -> IO OTelSignals
initializeObservedOpenTelemetry :: ExportFailureSink -> IO OTelSignals
initializeObservedOpenTelemetry ExportFailureSink
sink = do
    tracerProvider <- IO TracerProvider
initializeGlobalTracerProvider
    meterProvider <- initializeObservedMeterProvider sink
    loggerProvider <- initializeGlobalLoggerProvider
    let shutdown = do
            IO ShutdownResult -> IO ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (TracerProvider -> Maybe Int -> IO ShutdownResult
forall (m :: * -> *).
MonadIO m =>
TracerProvider -> Maybe Int -> m ShutdownResult
shutdownTracerProvider TracerProvider
tracerProvider Maybe Int
forall a. Maybe a
Nothing) IO () -> (SomeException -> IO ()) -> IO ()
forall (m :: * -> *) a.
MonadUnliftIO m =>
m a -> (SomeException -> m a) -> m a
`catchAny` IO () -> SomeException -> IO ()
forall a b. a -> b -> a
const IO ()
forall (f :: * -> *). Applicative f => f ()
pass
            IO ShutdownResult -> IO ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (MeterProvider -> Maybe Int -> IO ShutdownResult
shutdownMeterProvider MeterProvider
meterProvider Maybe Int
forall a. Maybe a
Nothing) IO () -> (SomeException -> IO ()) -> IO ()
forall (m :: * -> *) a.
MonadUnliftIO m =>
m a -> (SomeException -> m a) -> m a
`catchAny` IO () -> SomeException -> IO ()
forall a b. a -> b -> a
const IO ()
forall (f :: * -> *). Applicative f => f ()
pass
            IO ShutdownResult -> IO ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (LoggerProvider -> Maybe Int -> IO ShutdownResult
forall (m :: * -> *).
MonadIO m =>
LoggerProvider -> Maybe Int -> m ShutdownResult
shutdownLoggerProvider LoggerProvider
loggerProvider Maybe Int
forall a. Maybe a
Nothing) IO () -> (SomeException -> IO ()) -> IO ()
forall (m :: * -> *) a.
MonadUnliftIO m =>
m a -> (SomeException -> m a) -> m a
`catchAny` IO () -> SomeException -> IO ()
forall a b. a -> b -> a
const IO ()
forall (f :: * -> *). Applicative f => f ()
pass
    pure
        OTelSignals
            { otelTracerProvider = tracerProvider
            , otelMeterProvider = meterProvider
            , otelLoggerProvider = loggerProvider
            , otelPropagators = mempty
            , otelShutdown = shutdown
            }

{- Build the global meter provider with the OTLP metric exporter wrapped for failure
observation. Mirrors @hs-opentelemetry-sdk@'s @initializeGlobalMeterProvider@ exactly,
differing only in wrapping the exporter the periodic reader drives -- the SDK's metric
init takes the exporter directly ('resolveMetricExporter') with no registry injection
point, unlike the span path. Pinned to @hs-opentelemetry-sdk 1.0.0.0@; re-verify against
the SDK's @initializeGlobalMeterProvider@ on any version bump. -}
initializeObservedMeterProvider :: ExportFailureSink -> IO MeterProvider
initializeObservedMeterProvider :: ExportFailureSink -> IO MeterProvider
initializeObservedMeterProvider ExportFailureSink
sink = do
    disabled <- String -> IO Bool
lookupBooleanEnv String
"OTEL_SDK_DISABLED"
    if disabled
        then noopMeterProvider <$ setGlobalMeterProvider noopMeterProvider
        else do
            exporter <- observeMetricExporter sink <$> resolveMetricExporter
            readerOptions <- periodicMetricReaderOptionsFromEnv
            builtInResources <- detectBuiltInResources
            envResources <- mkResource . map Just <$> detectResourceAttributes
            let resources = Resource -> MaterializedResources
materializeResources (Resource -> Resource -> Resource
mergeResources Resource
envResources Resource
builtInResources)
            (provider, env) <- createMeterProvider resources defaultSdkMeterProviderOptions
            readerHandle <- forkPeriodicMetricReader env exporter readerOptions
            let provider' = PeriodicMetricReaderHandle -> MeterProvider -> MeterProvider
stopReaderOnShutdown PeriodicMetricReaderHandle
readerHandle MeterProvider
provider
            setGlobalMeterProvider provider'
            pure provider'

{- Wrap a meter provider so its shutdown stops the periodic metric reader before the
provider's own shutdown, as the mirrored SDK init does; part of the same version-pin
re-diff surface as 'initializeObservedMeterProvider'. -}
stopReaderOnShutdown :: PeriodicMetricReaderHandle -> MeterProvider -> MeterProvider
stopReaderOnShutdown :: PeriodicMetricReaderHandle -> MeterProvider -> MeterProvider
stopReaderOnShutdown PeriodicMetricReaderHandle
readerHandle MeterProvider
provider =
    MeterProvider
provider
        { meterProviderShutdown = \Maybe Int
timeout -> do
            PeriodicMetricReaderHandle -> IO ()
stopPeriodicMetricReader PeriodicMetricReaderHandle
readerHandle
            MeterProvider -> Maybe Int -> IO ShutdownResult
meterProviderShutdown MeterProvider
provider Maybe Int
timeout
        }