Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/api-core/src/logging/service_health_metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ pub fn start_export_service_health_metrics(health_context: ServiceHealthContext)
health_context
.meter
.u64_observable_gauge("carbide_api_ready")
.with_description("Whether the Forge Site Controller API is running")
.with_description("Whether the NICo API is running")
.with_callback(|observer| {
observer.observe(1, &[]);
})
Expand Down
2 changes: 1 addition & 1 deletion crates/api-core/src/logging/setup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ pub fn create_metric_for_spancount_reader(
) {
meter
.u64_observable_gauge("carbide_api_tracing_spans_open")
.with_description("Whether the Forge Site Controller API is running")
.with_description("Number of open logging/tracing spans")
.with_callback(move |observer| {
let open_spans = if let Some(spancount_reader) = &spancount_reader {
spancount_reader.open_spans()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# HELP carbide_api_ready Whether the Forge Site Controller API is running
# HELP carbide_api_ready Whether the NICo API is running
# TYPE carbide_api_ready gauge
carbide_api_ready 1
# HELP carbide_api_version Version (git sha, build date, etc) of this service
Expand Down
6 changes: 3 additions & 3 deletions crates/api-integration-tests/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,14 +218,14 @@ fn generate_core_metric_docs(metrics_endpoints: &[SocketAddr]) {
.into_iter()
.filter(|metric| !metric.name.starts_with("alt_metric"))
.collect();
let mut docs = "# NCX Infra Controller (NICo) core metrics\n\n".to_string();
let mut docs = "# NVIDIA Infra Controller (NICo) Core Metrics\n\n".to_string();
use std::fmt::Write;

use askama_escape::Escaper;

writeln!(
&mut docs,
"This file contains a list of metrics exported by NCX Infra Controller (NICo). \
"This file contains a list of metrics exported by NVIDIA Infra Controller (NICo). \
The list is auto-generated from an integration test (`test_integration`). \
Metrics for workflows which are not exercised by the test are missing."
)
Expand All @@ -249,7 +249,7 @@ fn generate_core_metric_docs(metrics_endpoints: &[SocketAddr]) {
write!(&mut docs, "</td>").unwrap();
writeln!(&mut docs, "</tr>").unwrap();
}
writeln!(&mut docs, "<table>").unwrap();
writeln!(&mut docs, "</table>").unwrap();

let path = std::path::Path::new(METRIC_DOC_PATH);
assert!(
Expand Down
3 changes: 2 additions & 1 deletion crates/health-metrics/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ impl<D: HealthMetricDimension> HealthIterationMetrics<D> {
pub fn register_health_gauges<T, D, F>(
metric_prefix: &str,
suppressed_label_key: &'static str,
display_name_plural: &str,
meter: &Meter,
shared: SharedMetricsHolder<T>,
project: F,
Expand All @@ -147,7 +148,7 @@ pub fn register_health_gauges<T, D, F>(
meter
.u64_observable_gauge(format!("{metric_prefix}_health_status_count"))
.with_description(
"The total number of objects in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts",
format!("The total number of {display_name_plural} in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts"),
)
.with_callback(move |observer| {
metrics.if_available(|metrics, attrs| {
Expand Down
15 changes: 7 additions & 8 deletions crates/machine-controller/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ impl MetricsEmitter for MachineMetricsEmitter {
let metrics = shared_metrics.clone();
meter
.u64_observable_gauge("carbide_gpus_total_count")
.with_description("The total number of GPUs available in the Forge site")
.with_description("The total number of GPUs available in the NICo deployment")
.with_callback(move |observer| {
metrics.if_available(|metrics, attrs| {
observer.observe(metrics.gpus_total as u64, attrs);
Expand All @@ -136,7 +136,7 @@ impl MetricsEmitter for MachineMetricsEmitter {
let metrics = shared_metrics.clone();
meter
.u64_observable_gauge("carbide_hosts_usable_count")
.with_description("The remaining number of hosts in the Forge site which are available for immediate instance creation")
.with_description("The remaining number of hosts in the NICo deployment which are available for immediate instance creation")
.with_callback(move |observer| {
metrics.if_available(|metrics, attrs| {
observer.observe(
Expand All @@ -151,7 +151,7 @@ impl MetricsEmitter for MachineMetricsEmitter {
let metrics = shared_metrics.clone();
meter
.u64_observable_gauge("carbide_gpus_usable_count")
.with_description("The remaining number of GPUs in the Forge site which are available for immediate instance creation")
.with_description("The remaining number of GPUs in the NICo deployment which are available for immediate instance creation")
.with_callback(move |observer| {
metrics.if_available(|metrics, attrs| {
observer.observe(
Expand All @@ -166,7 +166,7 @@ impl MetricsEmitter for MachineMetricsEmitter {
let metrics = shared_metrics.clone();
meter
.u64_observable_gauge("carbide_gpus_in_use_count")
.with_description("The total number of GPUs that are actively used by tenants in instances in the Forge site")
.with_description("The total number of GPUs that are actively used by tenants in instances in the NICo deployment")
.with_callback(move |observer| {
metrics.if_available(|metrics, attrs| {
let total_in_use_gpus = metrics.gpus_in_use_by_tenant.values().copied().reduce(|a,b| a + b).unwrap_or_default();
Expand All @@ -182,7 +182,7 @@ impl MetricsEmitter for MachineMetricsEmitter {
let metrics = shared_metrics.clone();
meter
.u64_observable_gauge("carbide_hosts_in_use_count")
.with_description("The total number of hosts that are actively used by tenants as instances in the Forge site")
.with_description("The total number of hosts that are actively used by tenants as instances in the NICo deployment")
.with_callback(move |observer| {
metrics.if_available(|metrics, attrs| {
let total_in_use_hosts = metrics.hosts_in_use_by_tenant.values().copied().reduce(|a,b| a + b).unwrap_or_default();
Expand Down Expand Up @@ -266,6 +266,7 @@ impl MetricsEmitter for MachineMetricsEmitter {
register_health_gauges::<_, IsInUseByTenant, _>(
"carbide_hosts",
"machine_id",
"Managed Hosts",
meter,
shared_metrics.clone(),
|m| &m.health,
Expand Down Expand Up @@ -347,9 +348,7 @@ impl MetricsEmitter for MachineMetricsEmitter {
let metrics = shared_metrics.clone();
meter
.u64_observable_gauge("carbide_dpu_agent_version_count")
.with_description(
"The amount of Forge DPU agents which have reported a certain version.",
)
.with_description("The amount of DPU agents which have reported a certain version.")
.with_callback(move |observer| {
metrics.if_available(|metrics, attrs| {
for (version, count) in &metrics.agent_versions {
Expand Down
1 change: 1 addition & 0 deletions crates/power-shelf-controller/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ impl MetricsEmitter for PowerShelfMetricsEmitter {
register_health_gauges::<_, (), _>(
"carbide_power_shelves",
"power_shelf_id",
"Power Shelves",
meter,
shared_metrics,
|m| &m.health,
Expand Down
1 change: 1 addition & 0 deletions crates/rack-controller/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ impl MetricsEmitter for RackMetricsEmitter {
register_health_gauges::<_, (), _>(
"carbide_racks",
"rack_id",
"Racks",
meter,
shared_metrics,
|m| &m.health,
Expand Down
1 change: 1 addition & 0 deletions crates/switch-controller/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ impl MetricsEmitter for SwitchMetricsEmitter {
register_health_gauges::<_, (), _>(
"carbide_switches",
"switch_id",
"Switches",
meter,
shared_metrics,
|m| &m.health,
Expand Down
Loading
Loading