Skip to content

Commit

Permalink
Migrate monotonic counter metrics to u64_counter! (#6350)
Browse files Browse the repository at this point in the history
Co-authored-by: Coenen Benjamin <[email protected]>
  • Loading branch information
goto-bus-stop and bnjjj authored Dec 3, 2024
1 parent 59d8cc9 commit 89795a7
Show file tree
Hide file tree
Showing 20 changed files with 285 additions and 147 deletions.
13 changes: 13 additions & 0 deletions .changesets/maint_renee_router_297_monotonic_counters.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
### Docs-deprecate several metrics ([PR #6350](https://github.com/apollographql/router/pull/6350))

These metrics are deprecated in favor of better, OTel-compatible alternatives:

- `apollo_router_deduplicated_subscriptions_total` - use the `apollo.router.operations.subscriptions` metric's `subscriptions.deduplicated` attribute.
- `apollo_authentication_failure_count` - use the `apollo.router.operations.authentication.jwt` metric's `authentication.jwt.failed` attribute.
- `apollo_authentication_success_count` - use the `apollo.router.operations.authentication.jwt` metric instead. If the `authentication.jwt.failed` attribute is *absent* or `false`, the authentication succeeded.
- `apollo_require_authentication_failure_count` - use the `http.server.request.duration` metric's `http.response.status_code` attribute. Requests with authentication failures have HTTP status code 401.
- `apollo_router_timeout` - this metric conflates timed-out requests from client to the router, and requests from the router to subgraphs. Timed-out requests have HTTP status code 504. Use the `http.response.status_code` attribute on the `http.server.request.duration` metric to identify timed-out router requests, and the same attribute on the `http.client.request.duration` metric to identify timed-out subgraph requests.

The deprecated metrics will continue to work in the 1.x release line.

By [@goto-bus-stop](https://github.com/goto-bus-stop) in https://github.com/apollographql/router/pull/6350
40 changes: 24 additions & 16 deletions apollo-router/src/cache/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,10 +170,12 @@ where

match res {
Some(v) => {
tracing::info!(
monotonic_counter.apollo_router_cache_hit_count = 1u64,
kind = %self.caller,
storage = &tracing::field::display(CacheStorageName::Memory),
u64_counter!(
"apollo_router_cache_hit_count",
"Number of cache hits",
1,
kind = self.caller,
storage = CacheStorageName::Memory.to_string()
);
let duration = instant_memory.elapsed().as_secs_f64();
tracing::info!(
Expand All @@ -190,10 +192,12 @@ where
kind = %self.caller,
storage = &tracing::field::display(CacheStorageName::Memory),
);
tracing::info!(
monotonic_counter.apollo_router_cache_miss_count = 1u64,
kind = %self.caller,
storage = &tracing::field::display(CacheStorageName::Memory),
u64_counter!(
"apollo_router_cache_miss_count",
"Number of cache misses",
1,
kind = self.caller,
storage = CacheStorageName::Memory.to_string()
);

let instant_redis = Instant::now();
Expand All @@ -214,10 +218,12 @@ where
Some(v) => {
self.insert_in_memory(key.clone(), v.0.clone()).await;

tracing::info!(
monotonic_counter.apollo_router_cache_hit_count = 1u64,
kind = %self.caller,
storage = &tracing::field::display(CacheStorageName::Redis),
u64_counter!(
"apollo_router_cache_hit_count",
"Number of cache hits",
1,
kind = self.caller,
storage = CacheStorageName::Redis.to_string()
);
let duration = instant_redis.elapsed().as_secs_f64();
tracing::info!(
Expand All @@ -228,10 +234,12 @@ where
Some(v.0)
}
None => {
tracing::info!(
monotonic_counter.apollo_router_cache_miss_count = 1u64,
kind = %self.caller,
storage = &tracing::field::display(CacheStorageName::Redis),
u64_counter!(
"apollo_router_cache_miss_count",
"Number of cache misses",
1,
kind = self.caller,
storage = CacheStorageName::Redis.to_string()
);
let duration = instant_redis.elapsed().as_secs_f64();
tracing::info!(
Expand Down
16 changes: 10 additions & 6 deletions apollo-router/src/graphql/request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,9 +202,11 @@ impl Request {
mode = %BatchingMode::BatchHttpLink // Only supported mode right now
);

tracing::info!(
monotonic_counter.apollo.router.operations.batching = 1u64,
mode = %BatchingMode::BatchHttpLink // Only supported mode right now
u64_counter!(
"apollo.router.operations.batching",
"Total requests with batched operations",
1,
mode = BatchingMode::BatchHttpLink.to_string() // Only supported mode right now
);
for entry in value
.as_array()
Expand All @@ -229,9 +231,11 @@ impl Request {
mode = "batch_http_link" // Only supported mode right now
);

tracing::info!(
monotonic_counter.apollo.router.operations.batching = 1u64,
mode = "batch_http_link" // Only supported mode right now
u64_counter!(
"apollo.router.operations.batching",
"Total requests with batched operations",
1,
mode = BatchingMode::BatchHttpLink.to_string() // Only supported mode right now
);
for entry in value
.as_array()
Expand Down
31 changes: 30 additions & 1 deletion apollo-router/src/metrics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,9 @@ pub(crate) fn meter_provider() -> AggregateMeterProvider {
}

#[macro_export]
/// Get or create a u64 monotonic counter metric and add a value to it
/// Get or create a `u64` monotonic counter metric and add a value to it.
///
/// Each metric needs a description.
///
/// This macro is a replacement for the telemetry crate's MetricsLayer. We will eventually convert all metrics to use these macros and deprecate the MetricsLayer.
/// The reason for this is that the MetricsLayer has:
Expand All @@ -506,6 +508,33 @@ pub(crate) fn meter_provider() -> AggregateMeterProvider {
/// * Imperfect mapping to metrics API that can only be checked at runtime.
///
/// New metrics should be added using these macros.
///
/// # Examples
/// ```ignore
/// // Count a thing:
/// u64_counter!(
/// "apollo.router.operations.frobbles",
/// "The amount of frobbles we've operated on",
/// 1
/// );
/// // Count a thing with attributes:
/// u64_counter!(
/// "apollo.router.operations.frobbles",
/// "The amount of frobbles we've operated on",
/// 1,
/// frobbles.color = "blue"
/// );
/// // Count a thing with dynamic attributes:
/// let attributes = [
/// opentelemetry::KeyValue::new("frobbles.color".to_string(), "blue".into()),
/// ];
/// u64_counter!(
/// "apollo.router.operations.frobbles",
/// "The amount of frobbles we've operated on",
/// 1,
/// attributes
/// );
/// ```
#[allow(unused_macros)]
macro_rules! u64_counter {
($($name:ident).+, $description:literal, $value: expr, $($attr_key:literal = $attr_value:expr),+) => {
Expand Down
6 changes: 5 additions & 1 deletion apollo-router/src/notification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,11 @@ where

match Pin::new(&mut this.msg_receiver).poll_next(cx) {
Poll::Ready(Some(Err(BroadcastStreamRecvError::Lagged(_)))) => {
tracing::info!(monotonic_counter.apollo_router_skipped_event_count = 1u64,);
u64_counter!(
"apollo_router_skipped_event_count",
"Amount of events dropped from the internal message queue",
1u64
);
self.poll_next(cx)
}
Poll::Ready(None) => Poll::Ready(None),
Expand Down
35 changes: 19 additions & 16 deletions apollo-router/src/plugins/authentication/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -539,8 +539,6 @@ fn authenticate(
jwks_manager: &JwksManager,
request: router::Request,
) -> ControlFlow<router::Response, router::Request> {
const AUTHENTICATION_KIND: &str = "JWT";

// We are going to do a lot of similar checking so let's define a local function
// to help reduce repetition
fn failure_message(
Expand All @@ -549,17 +547,16 @@ fn authenticate(
status: StatusCode,
) -> ControlFlow<router::Response, router::Request> {
// This is a metric and will not appear in the logs
tracing::info!(
monotonic_counter.apollo_authentication_failure_count = 1u64,
kind = %AUTHENTICATION_KIND
u64_counter!(
"apollo_authentication_failure_count",
"Number of requests with failed JWT authentication (deprecated)",
1,
kind = "JWT"
);
tracing::info!(
monotonic_counter
.apollo
.router
.operations
.authentication
.jwt = 1,
u64_counter!(
"apollo.router.operations.authentication.jwt",
"Number of requests with JWT authentication",
1,
authentication.jwt.failed = true
);
tracing::info!(message = %error, "jwt authentication failure");
Expand Down Expand Up @@ -662,11 +659,17 @@ fn authenticate(
);
}
// This is a metric and will not appear in the logs
tracing::info!(
monotonic_counter.apollo_authentication_success_count = 1u64,
kind = %AUTHENTICATION_KIND
u64_counter!(
"apollo_authentication_success_count",
"Number of requests with successful JWT authentication (deprecated)",
1,
kind = "JWT"
);
u64_counter!(
"apollo.router.operations.jwt",
"Number of requests with JWT authentication",
1
);
tracing::info!(monotonic_counter.apollo.router.operations.jwt = 1u64);
return ControlFlow::Continue(request);
}

Expand Down
16 changes: 10 additions & 6 deletions apollo-router/src/plugins/authentication/subgraph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -409,17 +409,21 @@ impl SigningParamsConfig {
}

fn increment_success_counter(subgraph_name: &str) {
tracing::info!(
monotonic_counter.apollo.router.operations.authentication.aws.sigv4 = 1u64,
u64_counter!(
"apollo.router.operations.authentication.aws.sigv4",
"Number of subgraph requests signed with AWS SigV4",
1,
authentication.aws.sigv4.failed = false,
subgraph.service.name = %subgraph_name,
subgraph.service.name = subgraph_name.to_string()
);
}
fn increment_failure_counter(subgraph_name: &str) {
tracing::info!(
monotonic_counter.apollo.router.operations.authentication.aws.sigv4 = 1u64,
u64_counter!(
"apollo.router.operations.authentication.aws.sigv4",
"Number of subgraph requests signed with AWS SigV4",
1,
authentication.aws.sigv4.failed = true,
subgraph.service.name = %subgraph_name,
subgraph.service.name = subgraph_name.to_string()
);
}

Expand Down
14 changes: 9 additions & 5 deletions apollo-router/src/plugins/authorization/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -556,8 +556,10 @@ impl Plugin for AuthorizationPlugin {
Ok(ControlFlow::Continue(request))
} else {
// This is a metric and will not appear in the logs
tracing::info!(
monotonic_counter.apollo_require_authentication_failure_count = 1u64,
u64_counter!(
"apollo_require_authentication_failure_count",
"Number of unauthenticated requests (deprecated)",
1
);
tracing::error!("rejecting unauthenticated request");
let response = supergraph::Response::error_builder()
Expand Down Expand Up @@ -588,11 +590,13 @@ impl Plugin for AuthorizationPlugin {
let needs_requires_scopes = request.context.contains_key(REQUIRED_SCOPES_KEY);

if needs_authenticated || needs_requires_scopes {
tracing::info!(
monotonic_counter.apollo.router.operations.authorization = 1u64,
u64_counter!(
"apollo.router.operations.authorization",
"Number of subgraph requests requiring authorization",
1,
authorization.filtered = filtered,
authorization.needs_authenticated = needs_authenticated,
authorization.needs_requires_scopes = needs_requires_scopes,
authorization.needs_requires_scopes = needs_requires_scopes
);
}

Expand Down
20 changes: 12 additions & 8 deletions apollo-router/src/plugins/subscription.rs
Original file line number Diff line number Diff line change
Expand Up @@ -503,10 +503,12 @@ impl Service<router::Request> for CallbackService {
};
// Keep the subscription to the client opened
payload.subscribed = Some(true);
tracing::info!(
monotonic_counter.apollo.router.operations.subscriptions.events = 1u64,
subscriptions.mode="callback"
);
u64_counter!(
"apollo.router.operations.subscriptions.events",
"Number of subscription events",
1,
subscriptions.mode = "callback"
);
handle.send_sync(payload)?;

Ok(router::Response {
Expand Down Expand Up @@ -626,10 +628,12 @@ impl Service<router::Request> for CallbackService {
});
}
};
tracing::info!(
monotonic_counter.apollo.router.operations.subscriptions.events = 1u64,
subscriptions.mode="callback",
subscriptions.complete=true
u64_counter!(
"apollo.router.operations.subscriptions.events",
"Number of subscription events",
1,
subscriptions.mode = "callback",
subscriptions.complete = true
);
if let Err(_err) = handle.send_sync(
graphql::Response::builder().errors(errors).build(),
Expand Down
49 changes: 26 additions & 23 deletions apollo-router/src/plugins/telemetry/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,6 @@ use crate::plugins::telemetry::reload::OPENTELEMETRY_TRACER_HANDLE;
use crate::plugins::telemetry::tracing::apollo_telemetry::decode_ftv1_trace;
use crate::plugins::telemetry::tracing::apollo_telemetry::APOLLO_PRIVATE_OPERATION_SIGNATURE;
use crate::plugins::telemetry::tracing::TracingConfigurator;
use crate::plugins::telemetry::utils::TracingUtils;
use crate::query_planner::OperationKind;
use crate::register_private_plugin;
use crate::router_factory::Endpoint;
Expand Down Expand Up @@ -1747,28 +1746,32 @@ impl Telemetry {
}

fn plugin_metrics(config: &Arc<Conf>) {
let metrics_prom_used = config.exporters.metrics.prometheus.enabled;
let metrics_otlp_used = MetricsConfigurator::enabled(&config.exporters.metrics.otlp);
let tracing_otlp_used = TracingConfigurator::enabled(&config.exporters.tracing.otlp);
let tracing_datadog_used = config.exporters.tracing.datadog.enabled();
let tracing_jaeger_used = config.exporters.tracing.jaeger.enabled();
let tracing_zipkin_used = config.exporters.tracing.zipkin.enabled();

if metrics_prom_used
|| metrics_otlp_used
|| tracing_jaeger_used
|| tracing_otlp_used
|| tracing_zipkin_used
|| tracing_datadog_used
{
::tracing::info!(
monotonic_counter.apollo.router.operations.telemetry = 1u64,
telemetry.metrics.otlp = metrics_otlp_used.or_empty(),
telemetry.metrics.prometheus = metrics_prom_used.or_empty(),
telemetry.tracing.otlp = tracing_otlp_used.or_empty(),
telemetry.tracing.datadog = tracing_datadog_used.or_empty(),
telemetry.tracing.jaeger = tracing_jaeger_used.or_empty(),
telemetry.tracing.zipkin = tracing_zipkin_used.or_empty(),
let mut attributes = Vec::new();
if MetricsConfigurator::enabled(&config.exporters.metrics.otlp) {
attributes.push(KeyValue::new("telemetry.metrics.otlp", true));
}
if config.exporters.metrics.prometheus.enabled {
attributes.push(KeyValue::new("telemetry.metrics.prometheus", true));
}
if TracingConfigurator::enabled(&config.exporters.tracing.otlp) {
attributes.push(KeyValue::new("telemetry.tracing.otlp", true));
}
if config.exporters.tracing.datadog.enabled() {
attributes.push(KeyValue::new("telemetry.tracing.datadog", true));
}
if config.exporters.tracing.jaeger.enabled() {
attributes.push(KeyValue::new("telemetry.tracing.jaeger", true));
}
if config.exporters.tracing.zipkin.enabled() {
attributes.push(KeyValue::new("telemetry.tracing.zipkin", true));
}

if !attributes.is_empty() {
u64_counter!(
"apollo.router.operations.telemetry",
"Telemetry exporters enabled",
1,
attributes
);
}
}
Expand Down
6 changes: 5 additions & 1 deletion apollo-router/src/plugins/traffic_shaping/timeout/future.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,11 @@ where
match Pin::new(&mut this.sleep).poll(cx) {
Poll::Pending => Poll::Pending,
Poll::Ready(_) => {
tracing::info!(monotonic_counter.apollo_router_timeout = 1u64,);
u64_counter!(
"apollo_router_timeout",
"Number of timed out client requests",
1
);
Poll::Ready(Err(Elapsed::new().into()))
}
}
Expand Down
Loading

0 comments on commit 89795a7

Please sign in to comment.