Skip to content

Commit

Permalink
Enable force datadog agent sampling disabled (#6057)
Browse files Browse the repository at this point in the history
  • Loading branch information
BrynCooke authored Sep 25, 2024
2 parents 116427d + 49d7e4c commit 6d8fb94
Show file tree
Hide file tree
Showing 14 changed files with 127 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7267,8 +7267,9 @@ expression: "&schema"
"additionalProperties": false,
"properties": {
"datadog_agent_sampling": {
"default": false,
"description": "Use datadog agent sampling. This means that all spans will be sent to the Datadog agent and the `sampling.priority` attribute will be used to control if the span will then be sent to Datadog.",
"default": null,
"description": "Use datadog agent sampling. This means that all spans will be sent to the Datadog agent and the `sampling.priority` attribute will be used to control if the span will then be sent to Datadog. This option is enabled by default if you are using the Datadog native exporter but can be explicitly disabled.",
"nullable": true,
"type": "boolean"
},
"max_attributes_per_event": {
Expand Down
15 changes: 11 additions & 4 deletions apollo-router/src/plugins/telemetry/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,8 @@ pub(crate) struct TracingCommon {
pub(crate) sampler: SamplerOption,
/// Use datadog agent sampling. This means that all spans will be sent to the Datadog agent
/// and the `sampling.priority` attribute will be used to control if the span will then be sent to Datadog.
pub(crate) datadog_agent_sampling: bool,
/// This option is enabled by default if you are using the Datadog native exporter but can be explicitly disabled.
pub(crate) datadog_agent_sampling: Option<bool>,
/// Whether to use parent based sampling
pub(crate) parent_based_sampler: bool,
/// The maximum events per span before discarding
Expand Down Expand Up @@ -405,7 +406,7 @@ impl Default for TracingCommon {
service_name: Default::default(),
service_namespace: Default::default(),
sampler: default_sampler(),
datadog_agent_sampling: false,
datadog_agent_sampling: None,
parent_based_sampler: default_parent_based_sampler(),
max_events_per_span: default_max_events_per_span(),
max_attributes_per_span: default_max_attributes_per_span(),
Expand Down Expand Up @@ -673,7 +674,7 @@ impl From<&TracingCommon> for opentelemetry::sdk::trace::Config {
if config.parent_based_sampler {
sampler = parent_based(sampler);
}
if config.datadog_agent_sampling {
if config.datadog_agent_sampling.unwrap_or_default() {
common = common.with_sampler(DatadogAgentSampling::new(
sampler,
config.parent_based_sampler,
Expand Down Expand Up @@ -701,7 +702,13 @@ fn parent_based(sampler: opentelemetry::sdk::trace::Sampler) -> opentelemetry::s
impl Conf {
pub(crate) fn calculate_field_level_instrumentation_ratio(&self) -> Result<f64, Error> {
// Because when datadog is enabled the global sampling is overriden to always_on
if self.exporters.tracing.common.datadog_agent_sampling {
if self
.exporters
.tracing
.common
.datadog_agent_sampling
.unwrap_or_default()
{
let field_ratio = match &self.apollo.field_level_instrumentation_sampler {
SamplerOption::TraceIdRatioBased(ratio) => *ratio,
SamplerOption::Always(Sampler::AlwaysOn) => 1.0,
Expand Down
22 changes: 18 additions & 4 deletions apollo-router/src/plugins/telemetry/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -288,8 +288,15 @@ impl Plugin for Telemetry {
let mut config = init.config;
// If the datadog exporter is enabled then enable the agent sampler.
// If users are using otlp export then they will need to set this explicitly in their config.
if config.exporters.tracing.datadog.enabled() {
config.exporters.tracing.common.datadog_agent_sampling = true;
if config.exporters.tracing.datadog.enabled()
&& config
.exporters
.tracing
.common
.datadog_agent_sampling
.is_none()
{
config.exporters.tracing.common.datadog_agent_sampling = Some(true);
}
config.instrumentation.spans.update_defaults();
config.instrumentation.instruments.update_defaults();
Expand Down Expand Up @@ -874,7 +881,14 @@ impl Telemetry {
// If the datadog agent sampling is enabled, then we cannot presample the spans
// Therefore we set presampling to always on and let the regular sampler do the work.
// Effectively, we are disabling the presampling.
if self.config.exporters.tracing.common.datadog_agent_sampling {
if self
.config
.exporters
.tracing
.common
.datadog_agent_sampling
.unwrap_or_default()
{
otel::layer::configure(&SamplerOption::Always(Sampler::AlwaysOn));
} else {
otel::layer::configure(&self.sampling_filter_ratio);
Expand Down Expand Up @@ -975,7 +989,7 @@ impl Telemetry {
// This is because the pre-sampler will sample the spans before they sent to the regular sampler
// If the datadog agent sampling is enabled, then we cannot pre-sample the spans because even if the sampling decision is made to drop
// DatadogAgentSampler will modify the decision to RecordAndSample and instead use the sampling.priority attribute to decide if the span should be sampled or not.
if !common.datadog_agent_sampling {
if !common.datadog_agent_sampling.unwrap_or_default() {
common.sampler = SamplerOption::Always(Sampler::AlwaysOn);
}

Expand Down
59 changes: 59 additions & 0 deletions apollo-router/tests/integration/telemetry/datadog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ struct TraceSpec {
measured_spans: HashSet<&'static str>,
unmeasured_spans: HashSet<&'static str>,
priority_sampled: Option<&'static str>,
// Not the metrics but the otel attribute
no_priority_sampled_attribute: Option<bool>,
}

#[tokio::test(flavor = "multi_thread")]
Expand Down Expand Up @@ -73,6 +75,49 @@ async fn test_no_sample() -> Result<(), BoxError> {
Ok(())
}

// We want to check we're able to override the behavior of datadog_agent_sampling configuration even if we set a datadog exporter
#[tokio::test(flavor = "multi_thread")]
async fn test_sampling_datadog_agent_disabled() -> Result<(), BoxError> {
if !graph_os_enabled() {
return Ok(());
}
let context = std::sync::Arc::new(std::sync::Mutex::new(None));
let context_clone = context.clone();
let mut router = IntegrationTest::builder()
.telemetry(Telemetry::Datadog)
.config(include_str!(
"fixtures/datadog_agent_sampling_disabled.router.yaml"
))
.responder(ResponseTemplate::new(200).set_body_json(
json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}),
))
.subgraph_callback(Box::new(move || {
let context = Context::current();
let span = context.span();
let span_context = span.span_context();
*context_clone.lock().expect("poisoned") = Some(span_context.clone());
}))
.build()
.await;

router.start().await;
router.assert_started().await;
let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}});
let (id, result) = router.execute_query(&query).await;
assert!(result.status().is_success());

TraceSpec::builder()
.services(["client", "router", "subgraph"].into())
.no_priority_sampled_attribute(true)
.build()
.validate_trace(id)
.await?;

router.graceful_shutdown().await;

Ok(())
}

#[tokio::test(flavor = "multi_thread")]
async fn test_priority_sampling_propagated() -> Result<(), BoxError> {
if !graph_os_enabled() {
Expand Down Expand Up @@ -795,6 +840,7 @@ impl TraceSpec {
self.verify_measured_spans(&trace)?;
self.verify_operation_name(&trace)?;
self.verify_priority_sampled(&trace)?;
self.verify_priority_sampled_attribute(&trace)?;
self.verify_version(&trace)?;
self.verify_span_kinds(&trace)?;
Ok(())
Expand Down Expand Up @@ -967,4 +1013,17 @@ impl TraceSpec {
}
Ok(())
}

fn verify_priority_sampled_attribute(&self, trace: &Value) -> Result<(), BoxError> {
if self.no_priority_sampled_attribute.unwrap_or_default() {
let binding =
trace.select_path("$..[?(@.service=='router')].meta['sampling.priority']")?;
if binding.is_empty() {
return Ok(());
} else {
return Err(BoxError::from("sampling priority attribute exists"));
}
}
Ok(())
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ telemetry:
resource:
env: local1
service.version: router_version_override
datadog_agent_sampling: true
datadog:
enabled: true
batch_processor:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
telemetry:
apollo:
field_level_instrumentation_sampler: always_off
exporters:
tracing:
experimental_response_trace_id:
enabled: true
header_name: apollo-custom-trace-id
format: datadog
propagation:
trace_context: true
jaeger: true
common:
service_name: router
resource:
env: local1
service.version: router_version_override
datadog_agent_sampling: false
sampler: always_off
datadog:
enabled: true
batch_processor:
scheduled_delay: 100ms
instrumentation:
spans:
mode: spec_compliant
supergraph:
attributes:
graphql.operation.name: true

Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ telemetry:
format: datadog
common:
service_name: router
datadog_agent_sampling: true
datadog:
enabled: true
batch_processor:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ telemetry:
service_name: router
# NOT always_off to allow us to test a sampling probability of zero
sampler: 0.0
datadog_agent_sampling: true
datadog:
enabled: true
batch_processor:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ telemetry:
format: datadog
common:
service_name: router
datadog_agent_sampling: true
datadog:
enabled: true
# Span mapping will always override the span name as far as the test agent is concerned
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ telemetry:
format: datadog
common:
service_name: router
datadog_agent_sampling: true
datadog:
enabled: true
# Span mapping will always override the span name as far as the test agent is concerned
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ telemetry:
format: datadog
common:
service_name: router
datadog_agent_sampling: true
datadog:
enabled: true
enable_span_mapping: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ telemetry:
format: datadog
common:
service_name: router
datadog_agent_sampling: true
datadog:
enabled: true
enable_span_mapping: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ telemetry:
common:
# Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you!
sampler: 0.1
# This is set to true by default if the Datadog exporter is enabled.
datadog_agent_sampling: true
datadog:
enabled: true
# Optional endpoint, either 'default' or a URL (Defaults to http://127.0.0.1:8126)
Expand All @@ -110,7 +114,7 @@ telemetry:
graphql.operation.name: true
```

Note that `datadog_agent_sampling` is automatically enabled when using the native Datadog exporter and cannot be disabled.
Note that `datadog_agent_sampling` is automatically enabled when using the native Datadog exporter but can be overridden if sampled APM metrics are desired.

> [!IMPORTANT]
> Sending all spans to the datadog agent may require that you tweak the `batch_processor` settings in your exporter config. This applies to both OTLP and the Datadog native exporter.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ telemetry:
sampler: 0.1
# Send all spans to the Datadog agent.
# This is enabled by default Datadog exporter is enabled, but may be overridden to false if you want to only have sampled traces.
datadog_agent_sampling: true
```

Expand Down

0 comments on commit 6d8fb94

Please sign in to comment.