diff --git a/crates/api-core/src/logging/service_health_metrics.rs b/crates/api-core/src/logging/service_health_metrics.rs index 32b2a4cbe3..5c985202a5 100644 --- a/crates/api-core/src/logging/service_health_metrics.rs +++ b/crates/api-core/src/logging/service_health_metrics.rs @@ -33,7 +33,7 @@ pub fn start_export_service_health_metrics(health_context: ServiceHealthContext) health_context .meter .u64_observable_gauge("carbide_api_ready") - .with_description("Whether the Forge Site Controller API is running") + .with_description("Whether the NICo API is running") .with_callback(|observer| { observer.observe(1, &[]); }) diff --git a/crates/api-core/src/logging/setup.rs b/crates/api-core/src/logging/setup.rs index 24a380f260..0e27d387be 100644 --- a/crates/api-core/src/logging/setup.rs +++ b/crates/api-core/src/logging/setup.rs @@ -309,7 +309,7 @@ pub fn create_metric_for_spancount_reader( ) { meter .u64_observable_gauge("carbide_api_tracing_spans_open") - .with_description("Whether the Forge Site Controller API is running") + .with_description("Number of open logging/tracing spans") .with_callback(move |observer| { let open_spans = if let Some(spancount_reader) = &spancount_reader { spancount_reader.open_spans() diff --git a/crates/api-core/src/tests/metrics_fixtures/test_service_health_metrics.txt b/crates/api-core/src/tests/metrics_fixtures/test_service_health_metrics.txt index d519e8f86e..def27a53be 100644 --- a/crates/api-core/src/tests/metrics_fixtures/test_service_health_metrics.txt +++ b/crates/api-core/src/tests/metrics_fixtures/test_service_health_metrics.txt @@ -1,4 +1,4 @@ -# HELP carbide_api_ready Whether the Forge Site Controller API is running +# HELP carbide_api_ready Whether the NICo API is running # TYPE carbide_api_ready gauge carbide_api_ready 1 # HELP carbide_api_version Version (git sha, build date, etc) of this service diff --git a/crates/api-integration-tests/tests/lib.rs b/crates/api-integration-tests/tests/lib.rs index 26b2ed668d..0c6d2f084f 
100644 --- a/crates/api-integration-tests/tests/lib.rs +++ b/crates/api-integration-tests/tests/lib.rs @@ -218,14 +218,14 @@ fn generate_core_metric_docs(metrics_endpoints: &[SocketAddr]) { .into_iter() .filter(|metric| !metric.name.starts_with("alt_metric")) .collect(); - let mut docs = "# NCX Infra Controller (NICo) core metrics\n\n".to_string(); + let mut docs = "# NVIDIA Infra Controller (NICo) Core Metrics\n\n".to_string(); use std::fmt::Write; use askama_escape::Escaper; writeln!( &mut docs, - "This file contains a list of metrics exported by NCX Infra Controller (NICo). \ + "This file contains a list of metrics exported by NVIDIA Infra Controller (NICo). \ The list is auto-generated from an integration test (`test_integration`). \ Metrics for workflows which are not exercised by the test are missing." ) @@ -249,7 +249,7 @@ fn generate_core_metric_docs(metrics_endpoints: &[SocketAddr]) { write!(&mut docs, "").unwrap(); writeln!(&mut docs, "").unwrap(); } - writeln!(&mut docs, "").unwrap(); + writeln!(&mut docs, "
").unwrap(); let path = std::path::Path::new(METRIC_DOC_PATH); assert!( diff --git a/crates/health-metrics/src/lib.rs b/crates/health-metrics/src/lib.rs index f8ed29521b..4c77ef2ff1 100644 --- a/crates/health-metrics/src/lib.rs +++ b/crates/health-metrics/src/lib.rs @@ -130,6 +130,7 @@ impl HealthIterationMetrics { pub fn register_health_gauges( metric_prefix: &str, suppressed_label_key: &'static str, + display_name_plural: &str, meter: &Meter, shared: SharedMetricsHolder, project: F, @@ -147,7 +148,7 @@ pub fn register_health_gauges( meter .u64_observable_gauge(format!("{metric_prefix}_health_status_count")) .with_description( - "The total number of objects in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts", + format!("The total number of {display_name_plural} in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts"), ) .with_callback(move |observer| { metrics.if_available(|metrics, attrs| { diff --git a/crates/machine-controller/src/metrics.rs b/crates/machine-controller/src/metrics.rs index 79e9e486ec..54f93c60cd 100644 --- a/crates/machine-controller/src/metrics.rs +++ b/crates/machine-controller/src/metrics.rs @@ -124,7 +124,7 @@ impl MetricsEmitter for MachineMetricsEmitter { let metrics = shared_metrics.clone(); meter .u64_observable_gauge("carbide_gpus_total_count") - .with_description("The total number of GPUs available in the Forge site") + .with_description("The total number of GPUs available in the NICo deployment") .with_callback(move |observer| { metrics.if_available(|metrics, attrs| { observer.observe(metrics.gpus_total as u64, attrs); @@ -136,7 +136,7 @@ impl MetricsEmitter for MachineMetricsEmitter { let metrics = shared_metrics.clone(); meter .u64_observable_gauge("carbide_hosts_usable_count") - .with_description("The remaining number of hosts in the Forge site which are available for immediate instance creation") + 
.with_description("The remaining number of hosts in the NICo deployment which are available for immediate instance creation") .with_callback(move |observer| { metrics.if_available(|metrics, attrs| { observer.observe( @@ -151,7 +151,7 @@ impl MetricsEmitter for MachineMetricsEmitter { let metrics = shared_metrics.clone(); meter .u64_observable_gauge("carbide_gpus_usable_count") - .with_description("The remaining number of GPUs in the Forge site which are available for immediate instance creation") + .with_description("The remaining number of GPUs in the NICo deployment which are available for immediate instance creation") .with_callback(move |observer| { metrics.if_available(|metrics, attrs| { observer.observe( @@ -166,7 +166,7 @@ impl MetricsEmitter for MachineMetricsEmitter { let metrics = shared_metrics.clone(); meter .u64_observable_gauge("carbide_gpus_in_use_count") - .with_description("The total number of GPUs that are actively used by tenants in instances in the Forge site") + .with_description("The total number of GPUs that are actively used by tenants in instances in the NICo deployment") .with_callback(move |observer| { metrics.if_available(|metrics, attrs| { let total_in_use_gpus = metrics.gpus_in_use_by_tenant.values().copied().reduce(|a,b| a + b).unwrap_or_default(); @@ -182,7 +182,7 @@ impl MetricsEmitter for MachineMetricsEmitter { let metrics = shared_metrics.clone(); meter .u64_observable_gauge("carbide_hosts_in_use_count") - .with_description("The total number of hosts that are actively used by tenants as instances in the Forge site") + .with_description("The total number of hosts that are actively used by tenants as instances in the NICo deployment") .with_callback(move |observer| { metrics.if_available(|metrics, attrs| { let total_in_use_hosts = metrics.hosts_in_use_by_tenant.values().copied().reduce(|a,b| a + b).unwrap_or_default(); @@ -266,6 +266,7 @@ impl MetricsEmitter for MachineMetricsEmitter { register_health_gauges::<_, IsInUseByTenant, 
_>( "carbide_hosts", "machine_id", + "Managed Hosts", meter, shared_metrics.clone(), |m| &m.health, @@ -347,9 +348,7 @@ impl MetricsEmitter for MachineMetricsEmitter { let metrics = shared_metrics.clone(); meter .u64_observable_gauge("carbide_dpu_agent_version_count") - .with_description( - "The amount of Forge DPU agents which have reported a certain version.", - ) + .with_description("The amount of DPU agents which have reported a certain version.") .with_callback(move |observer| { metrics.if_available(|metrics, attrs| { for (version, count) in &metrics.agent_versions { diff --git a/crates/power-shelf-controller/src/metrics.rs b/crates/power-shelf-controller/src/metrics.rs index 1bf982fae7..16e3d5878a 100644 --- a/crates/power-shelf-controller/src/metrics.rs +++ b/crates/power-shelf-controller/src/metrics.rs @@ -45,6 +45,7 @@ impl MetricsEmitter for PowerShelfMetricsEmitter { register_health_gauges::<_, (), _>( "carbide_power_shelves", "power_shelf_id", + "Power Shelves", meter, shared_metrics, |m| &m.health, diff --git a/crates/rack-controller/src/metrics.rs b/crates/rack-controller/src/metrics.rs index ca71b31bcf..215fcd4d43 100644 --- a/crates/rack-controller/src/metrics.rs +++ b/crates/rack-controller/src/metrics.rs @@ -45,6 +45,7 @@ impl MetricsEmitter for RackMetricsEmitter { register_health_gauges::<_, (), _>( "carbide_racks", "rack_id", + "Racks", meter, shared_metrics, |m| &m.health, diff --git a/crates/switch-controller/src/metrics.rs b/crates/switch-controller/src/metrics.rs index 7a8b8c3e4b..3de7dca29d 100644 --- a/crates/switch-controller/src/metrics.rs +++ b/crates/switch-controller/src/metrics.rs @@ -45,6 +45,7 @@ impl MetricsEmitter for SwitchMetricsEmitter { register_health_gauges::<_, (), _>( "carbide_switches", "switch_id", + "Switches", meter, shared_metrics, |m| &m.health, diff --git a/docs/manuals/metrics/core_metrics.md b/docs/manuals/metrics/core_metrics.md index 49da640d8e..ab934b9e46 100644 --- a/docs/manuals/metrics/core_metrics.md +++ 
b/docs/manuals/metrics/core_metrics.md @@ -4,120 +4,126 @@ This file contains a list of metrics exported by NVIDIA Infra Controller (NICo). - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
nico_active_host_firmware_update_countgaugeThe number of host machines in the system currently working on updating their firmware.
nico_api_db_queries_totalcounterThe amount of database queries that occurred inside a span
nico_api_db_span_query_time_millisecondshistogramTotal time the request spent inside a span on database transactions
nico_api_grpc_server_duration_millisecondshistogramProcessing time for a request on the nico API server
nico_api_readygaugeWhether the NICo API is running
nico_api_tls_connection_attempted_totalcounterThe amount of tls connections that were attempted
nico_api_tls_connection_success_totalcounterThe amount of tls connections that were successful
nico_api_tracing_spans_opengaugeWhether the NICo API is running
nico_api_vault_request_duration_millisecondshistogramthe duration of outbound vault requests, in milliseconds
nico_api_vault_requests_attempted_totalcounterThe amount of tls connections that were attempted
nico_api_vault_requests_failed_totalcounterThe amount of tcp connections that were failures
nico_api_vault_requests_succeeded_totalcounterThe amount of tls connections that were successful
nico_api_vault_token_time_until_refresh_secondsgaugeThe amount of time, in seconds, until the vault token is required to be refreshed
nico_api_versiongaugeVersion (git sha, build date, etc) of this service
nico_available_ips_countgaugeThe total number of available ips in the site
nico_concurrent_machine_updates_availablegaugeThe number of machines in the system that we will update concurrently.
nico_db_pool_idle_connsgaugeThe amount of idle connections in the nico database pool
nico_db_pool_total_connsgaugeThe amount of total (active + idle) connections in the nico database pool
nico_dpu_agent_version_countgaugeThe amount of DPU agents which have reported a certain version.
nico_dpu_firmware_version_countgaugeThe amount of DPUs which have reported a certain firmware version.
nico_dpus_healthy_countgaugeThe total number of DPUs in the system that have reported healthy in the last report. Healthy does not imply up - the report from the DPU might be outdated.
nico_dpus_up_countgaugeThe total number of DPUs in the system that are up. Up means we have received a health report less than 5 minutes ago.
nico_endpoint_exploration_duration_millisecondshistogramThe time it took to explore an endpoint
nico_endpoint_exploration_expected_machines_missing_overall_countgaugeThe total number of machines that were expected but not identified
nico_endpoint_exploration_expected_power_shelves_missing_overall_countgaugeThe total number of power shelves that were expected but not identified
nico_endpoint_exploration_identified_managed_hosts_overall_countgaugeThe total number of managed hosts identified by expectation
nico_endpoint_exploration_machines_explored_overall_countgaugeThe total number of machines explored by machine type
nico_endpoint_exploration_success_countgaugeThe amount of endpoint explorations that have been successful
nico_endpoint_explorations_countgaugeThe amount of endpoint explorations that have been attempted
nico_gpus_in_use_countgaugeThe total number of GPUs that are actively used by tenants in instances in the NICo deployment
nico_gpus_total_countgaugeThe total number of GPUs available in the NICo deployment
nico_gpus_usable_countgaugeThe remaining number of GPUs in the NICo deployment which are available for immediate instance creation
nico_hosts_by_sku_countgaugeThe amount of hosts by SKU and device type ('unknown' for hosts without SKU)
nico_hosts_health_overrides_countgaugeThe amount of health overrides that are configured in the site
nico_hosts_health_status_countgaugeThe total number of Managed Hosts in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts
nico_hosts_in_use_countgaugeThe total number of hosts that are actively used by tenants as instances in the NICo deployment
nico_hosts_usable_countgaugeThe remaining number of hosts in the NICo deployment which are available for immediate instance creation
nico_hosts_with_bios_password_setgaugeThe total number of Hosts in the system that have their BIOS password set.
nico_ib_partitions_enqueuer_iteration_latency_millisecondshistogramThe overall time it took to enqueue state handling tasks for all nico_ib_partitions in the system
nico_ib_partitions_iteration_latency_millisecondshistogramThe elapsed time in the last state processor iteration to handle objects of type nico_ib_partitions
nico_ib_partitions_object_tasks_enqueued_totalcounterThe amount of types that object handling tasks that have been freshly enqueued for objects of type nico_ib_partitions
nico_ib_partitions_totalgaugeThe total number of nico_ib_partitions in the system
nico_machine_reboot_duration_secondshistogramTime taken for machine/host to reboot in seconds
nico_machine_updates_started_countgaugeThe number of machines in the system that are in the process of updating.
nico_machine_validation_completedgaugeCount of machine validation that have completed successfully
nico_machine_validation_failedgaugeCount of machine validation that have failed
nico_machine_validation_in_progressgaugeCount of machine validation that are in progress
nico_machine_validation_testsgaugeThe details of machine validation tests
nico_machines_enqueuer_iteration_latency_millisecondshistogramThe overall time it took to enqueue state handling tasks for all nico_machines in the system
nico_machines_handler_latency_in_state_millisecondshistogramThe amount of time it took to invoke the state handler for objects of type nico_machines in a certain state
nico_machines_in_maintenance_countgaugeThe total number of machines in the system that are in maintenance.
nico_machines_iteration_latency_millisecondshistogramThe elapsed time in the last state processor iteration to handle objects of type nico_machines
nico_machines_object_tasks_completed_totalcounterThe amount of object handling tasks that have been completed for objects of type nico_machines
nico_machines_object_tasks_dispatched_totalcounterThe amount of types that object handling tasks that have been dequeued and dispatched for processing for objects of type nico_machines
nico_machines_object_tasks_enqueued_totalcounterThe amount of types that object handling tasks that have been freshly enqueued for objects of type nico_machines
nico_machines_object_tasks_requeued_totalcounterThe amount of object handling tasks that have been requeued for objects of type nico_machines
nico_machines_per_stategaugeThe number of nico_machines in the system with a given state
nico_machines_per_state_above_slagaugeThe number of nico_machines in the system which had been longer in a state than allowed per SLA
nico_machines_state_entered_totalcounterThe amount of types that objects of type nico_machines have entered a certain state
nico_machines_state_exited_totalcounterThe amount of types that objects of type nico_machines have exited a certain state
nico_machines_time_in_state_secondshistogramThe amount of time objects of type nico_machines have spent in a certain state
nico_machines_totalgaugeThe total number of nico_machines in the system
nico_machines_with_state_handling_errors_per_stategaugeThe number of nico_machines in the system with a given state that failed state handling
nico_measured_boot_bundles_totalgaugeThe total number of measured boot bundles.
nico_measured_boot_machines_per_bundle_state_totalgaugeThe total number of machines per a given measured boot bundle state.
nico_measured_boot_machines_per_machine_state_totalgaugeThe total number of machines per a given measured boot machine state.
nico_measured_boot_machines_totalgaugeThe total number of machines reporting measurements.
nico_measured_boot_profiles_totalgaugeThe total number of measured boot profiles.
nico_network_segments_enqueuer_iteration_latency_millisecondshistogramThe overall time it took to enqueue state handling tasks for all nico_network_segments in the system
nico_network_segments_handler_latency_in_state_millisecondshistogramThe amount of time it took to invoke the state handler for objects of type nico_network_segments in a certain state
nico_network_segments_iteration_latency_millisecondshistogramThe elapsed time in the last state processor iteration to handle objects of type nico_network_segments
nico_network_segments_object_tasks_completed_totalcounterThe amount of object handling tasks that have been completed for objects of type nico_network_segments
nico_network_segments_object_tasks_dispatched_totalcounterThe amount of types that object handling tasks that have been dequeued and dispatched for processing for objects of type nico_network_segments
nico_network_segments_object_tasks_enqueued_totalcounterThe amount of types that object handling tasks that have been freshly enqueued for objects of type nico_network_segments
nico_network_segments_object_tasks_requeued_totalcounterThe amount of object handling tasks that have been requeued for objects of type nico_network_segments
nico_network_segments_per_stategaugeThe number of nico_network_segments in the system with a given state
nico_network_segments_per_state_above_slagaugeThe number of nico_network_segments in the system which had been longer in a state than allowed per SLA
nico_network_segments_state_entered_totalcounterThe amount of types that objects of type nico_network_segments have entered a certain state
nico_network_segments_state_exited_totalcounterThe amount of types that objects of type nico_network_segments have exited a certain state
nico_network_segments_time_in_state_secondshistogramThe amount of time objects of type nico_network_segments have spent in a certain state
nico_network_segments_totalgaugeThe total number of nico_network_segments in the system
nico_network_segments_with_state_handling_errors_per_stategaugeThe number of nico_network_segments in the system with a given state that failed state handling
nico_nvlink_partition_monitor_nmxc_changes_applied_totalcounterNumber of changes requested to NMX-C
nico_pending_dpu_nic_firmware_update_countgaugeThe number of machines in the system that need a firmware update.
nico_pending_host_firmware_update_countgaugeThe number of host machines in the system that need a firmware update.
nico_power_shelves_enqueuer_iteration_latency_millisecondshistogramThe overall time it took to enqueue state handling tasks for all nico_power_shelves in the system
nico_power_shelves_iteration_latency_millisecondshistogramThe elapsed time in the last state processor iteration to handle objects of type nico_power_shelves
nico_power_shelves_object_tasks_enqueued_totalcounterThe amount of types that object handling tasks that have been freshly enqueued for objects of type nico_power_shelves
nico_power_shelves_totalgaugeThe total number of nico_power_shelves in the system
nico_preingestion_totalgaugeThe amount of known machines currently being evaluated prior to ingestion
nico_preingestion_waiting_downloadgaugeThe amount of machines that are waiting for firmware downloads on other machines to complete before doing their own
nico_preingestion_waiting_installationgaugeThe amount of machines which have had firmware uploaded to them and are currently in the process of installing that firmware
nico_racks_enqueuer_iteration_latency_millisecondshistogramThe overall time it took to enqueue state handling tasks for all nico_racks in the system
nico_racks_iteration_latency_millisecondshistogramThe elapsed time in the last state processor iteration to handle objects of type nico_racks
nico_racks_object_tasks_enqueued_totalcounterThe amount of types that object handling tasks that have been freshly enqueued for objects of type nico_racks
nico_racks_totalgaugeThe total number of nico_racks in the system
nico_reboot_attempts_in_booting_with_discovery_imagehistogramThe amount of machines rebooted again in BootingWithDiscoveryImage since there is no response after a certain time from host.
nico_reserved_ips_countgaugeThe total number of reserved ips in the site
nico_resourcepool_free_countgaugeCount of values in the pool currently available for allocation
nico_resourcepool_used_countgaugeCount of values in the pool currently allocated
nico_running_dpu_updates_countgaugeThe number of machines in the system that are running a firmware update.
nico_site_exploration_expected_machines_sku_countgaugeThe total count of expected machines by SKU ID and device type
nico_site_exploration_identified_managed_hosts_countgaugeThe amount of Host+DPU pairs that has been identified in the last SiteExplorer run
nico_site_explorer_bmc_reset_countgaugeThe amount of BMC resets initiated in the last SiteExplorer run
nico_site_explorer_create_machinesgaugeWhether site-explorer machine creation is enabled (1) or disabled (0)
nico_site_explorer_create_machines_latency_millisecondshistogramThe time it took to perform create_machines inside site-explorer
nico_site_explorer_created_machines_countgaugeThe amount of Machine pairs that had been created by Site Explorer after being identified
nico_site_explorer_created_power_shelves_countgaugeThe amount of Power Shelves that had been created by Site Explorer after being identified
nico_site_explorer_enabledgaugeWhether site-explorer is enabled (1) or paused (0)
nico_site_explorer_iteration_latency_millisecondshistogramThe time it took to perform one site explorer iteration
nico_switches_enqueuer_iteration_latency_millisecondshistogramThe overall time it took to enqueue state handling tasks for all nico_switches in the system
nico_switches_iteration_latency_millisecondshistogramThe elapsed time in the last state processor iteration to handle objects of type nico_switches
nico_switches_object_tasks_enqueued_totalcounterThe amount of types that object handling tasks that have been freshly enqueued for objects of type nico_switches
nico_switches_totalgaugeThe total number of nico_switches in the system
nico_total_ips_countgaugeThe total number of ips in the site
nico_unavailable_dpu_nic_firmware_update_countgaugeThe number of machines in the system that need a firmware update but are unavailable for update.
carbide_active_host_firmware_update_countgaugeThe number of host machines in the system currently working on updating their firmware.
carbide_api_db_queries_totalcounterThe amount of database queries that occurred inside a span
carbide_api_db_span_query_time_millisecondshistogramTotal time the request spent inside a span on database transactions
carbide_api_grpc_server_duration_millisecondshistogramProcessing time for a request on the carbide API server
carbide_api_readygaugeWhether the NICo API is running
carbide_api_tls_connection_attempted_totalcounterThe amount of tls connections that were attempted
carbide_api_tls_connection_success_totalcounterThe amount of tls connections that were successful
carbide_api_tracing_spans_opengaugeNumber of open logging/tracing spans
carbide_api_vault_request_duration_millisecondshistogramthe duration of outbound vault requests, in milliseconds
carbide_api_vault_requests_attempted_totalcounterThe amount of tls connections that were attempted
carbide_api_vault_requests_failed_totalcounterThe amount of tcp connections that were failures
carbide_api_vault_requests_succeeded_totalcounterThe amount of tls connections that were successful
carbide_api_vault_token_time_until_refresh_secondsgaugeThe amount of time, in seconds, until the vault token is required to be refreshed
carbide_api_versiongaugeVersion (git sha, build date, etc) of this service
carbide_available_ips_countgaugeThe total number of available ips in the site
carbide_concurrent_machine_updates_availablegaugeThe number of machines in the system that we will update concurrently.
carbide_db_pool_idle_connsgaugeThe amount of idle connections in the carbide database pool
carbide_db_pool_total_connsgaugeThe amount of total (active + idle) connections in the carbide database pool
carbide_dpu_agent_version_countgaugeThe amount of DPU agents which have reported a certain version.
carbide_dpu_firmware_version_countgaugeThe amount of DPUs which have reported a certain firmware version.
carbide_dpus_healthy_countgaugeThe total number of DPUs in the system that have reported healthy in the last report. Healthy does not imply up - the report from the DPU might be outdated.
carbide_dpus_up_countgaugeThe total number of DPUs in the system that are up. Up means we have received a health report less than 5 minutes ago.
carbide_endpoint_exploration_duration_millisecondshistogramThe time it took to explore an endpoint
carbide_endpoint_exploration_expected_machines_missing_overall_countgaugeThe total number of machines that were expected but not identified
carbide_endpoint_exploration_expected_power_shelves_missing_overall_countgaugeThe total number of power shelves that were expected but not identified
carbide_endpoint_exploration_identified_managed_hosts_overall_countgaugeThe total number of managed hosts identified by expectation
carbide_endpoint_exploration_machines_explored_overall_countgaugeThe total number of machines explored by machine type
carbide_endpoint_exploration_success_countgaugeThe amount of endpoint explorations that have been successful
carbide_endpoint_explorations_countgaugeThe amount of endpoint explorations that have been attempted
carbide_gpus_in_use_countgaugeThe total number of GPUs that are actively used by tenants in instances in the NICo deployment
carbide_gpus_total_countgaugeThe total number of GPUs available in the NICo deployment
carbide_gpus_usable_countgaugeThe remaining number of GPUs in the NICo deployment which are available for immediate instance creation
carbide_hosts_by_sku_countgaugeThe amount of hosts by SKU and device type ('unknown' for hosts without SKU)
carbide_hosts_health_overrides_countgaugeThe amount of health overrides that are configured in the site
carbide_hosts_health_status_countgaugeThe total number of Managed Hosts in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts
carbide_hosts_in_use_countgaugeThe total number of hosts that are actively used by tenants as instances in the NICo deployment
carbide_hosts_usable_countgaugeThe remaining number of hosts in the NICo deployment which are available for immediate instance creation
carbide_hosts_with_bios_password_setgaugeThe total number of Hosts in the system that have their BIOS password set.
carbide_ib_partitions_enqueuer_iteration_latency_millisecondshistogramThe overall time it took to enqueue state handling tasks for all carbide_ib_partitions in the system
carbide_ib_partitions_iteration_latency_millisecondshistogramThe elapsed time in the last state processor iteration to handle objects of type carbide_ib_partitions
carbide_ib_partitions_object_tasks_enqueued_totalcounterThe amount of types that object handling tasks that have been freshly enqueued for objects of type carbide_ib_partitions
carbide_ib_partitions_totalgaugeThe total number of carbide_ib_partitions in the system
carbide_machine_reboot_duration_secondshistogramTime taken for machine/host to reboot in seconds
carbide_machine_updates_started_countgaugeThe number of machines in the system that are in the process of updating.
carbide_machine_validation_completedgaugeCount of machine validation that have completed successfully
carbide_machine_validation_failedgaugeCount of machine validation that have failed
carbide_machine_validation_in_progressgaugeCount of machine validation that are in progress
carbide_machine_validation_testsgaugeThe details of machine validation tests
carbide_machines_enqueuer_iteration_latency_millisecondshistogramThe overall time it took to enqueue state handling tasks for all carbide_machines in the system
carbide_machines_handler_latency_in_state_millisecondshistogramThe amount of time it took to invoke the state handler for objects of type carbide_machines in a certain state
carbide_machines_in_maintenance_countgaugeThe total number of machines in the system that are in maintenance.
carbide_machines_iteration_latency_millisecondshistogramThe elapsed time in the last state processor iteration to handle objects of type carbide_machines
carbide_machines_object_tasks_completed_totalcounterThe amount of object handling tasks that have been completed for objects of type carbide_machines
carbide_machines_object_tasks_dispatched_totalcounterThe amount of types that object handling tasks that have been dequeued and dispatched for processing for objects of type carbide_machines
carbide_machines_object_tasks_enqueued_totalcounterThe amount of types that object handling tasks that have been freshly enqueued for objects of type carbide_machines
carbide_machines_object_tasks_requeued_totalcounterThe amount of object handling tasks that have been requeued for objects of type carbide_machines
carbide_machines_per_stategaugeThe number of carbide_machines in the system with a given state
carbide_machines_per_state_above_slagaugeThe number of carbide_machines in the system which had been longer in a state than allowed per SLA
carbide_machines_state_entered_totalcounterThe amount of types that objects of type carbide_machines have entered a certain state
carbide_machines_state_exited_totalcounterThe amount of types that objects of type carbide_machines have exited a certain state
carbide_machines_time_in_state_secondshistogramThe amount of time objects of type carbide_machines have spent in a certain state
carbide_machines_totalgaugeThe total number of carbide_machines in the system
carbide_machines_with_state_handling_errors_per_stategaugeThe number of carbide_machines in the system with a given state that failed state handling
carbide_measured_boot_bundles_totalgaugeThe total number of measured boot bundles.
carbide_measured_boot_machines_per_bundle_state_totalgaugeThe total number of machines per a given measured boot bundle state.
carbide_measured_boot_machines_per_machine_state_totalgaugeThe total number of machines per a given measured boot machine state.
carbide_measured_boot_machines_totalgaugeThe total number of machines reporting measurements.
carbide_measured_boot_profiles_totalgaugeThe total number of measured boot profiles.
carbide_network_segments_enqueuer_iteration_latency_millisecondshistogramThe overall time it took to enqueue state handling tasks for all carbide_network_segments in the system
carbide_network_segments_handler_latency_in_state_millisecondshistogramThe amount of time it took to invoke the state handler for objects of type carbide_network_segments in a certain state
carbide_network_segments_iteration_latency_millisecondshistogramThe elapsed time in the last state processor iteration to handle objects of type carbide_network_segments
carbide_network_segments_object_tasks_completed_totalcounterThe amount of object handling tasks that have been completed for objects of type carbide_network_segments
carbide_network_segments_object_tasks_dispatched_totalcounterThe amount of times that object handling tasks have been dequeued and dispatched for processing for objects of type carbide_network_segments
carbide_network_segments_object_tasks_enqueued_totalcounterThe amount of times that object handling tasks have been freshly enqueued for objects of type carbide_network_segments
carbide_network_segments_object_tasks_requeued_totalcounterThe amount of object handling tasks that have been requeued for objects of type carbide_network_segments
carbide_network_segments_per_stategaugeThe number of carbide_network_segments in the system with a given state
carbide_network_segments_per_state_above_slagaugeThe number of carbide_network_segments in the system which have been in a state longer than allowed per SLA
carbide_network_segments_state_entered_totalcounterThe amount of times that objects of type carbide_network_segments have entered a certain state
carbide_network_segments_state_exited_totalcounterThe amount of times that objects of type carbide_network_segments have exited a certain state
carbide_network_segments_time_in_state_secondshistogramThe amount of time objects of type carbide_network_segments have spent in a certain state
carbide_network_segments_totalgaugeThe total number of carbide_network_segments in the system
carbide_network_segments_with_state_handling_errors_per_stategaugeThe number of carbide_network_segments in the system with a given state that failed state handling
carbide_nvlink_partition_monitor_nmxc_changes_applied_totalcounterNumber of changes requested to NMX-C
carbide_pending_dpu_nic_firmware_update_countgaugeThe number of machines in the system that need a firmware update.
carbide_pending_host_firmware_update_countgaugeThe number of host machines in the system that need a firmware update.
carbide_power_shelves_enqueuer_iteration_latency_millisecondshistogramThe overall time it took to enqueue state handling tasks for all carbide_power_shelves in the system
carbide_power_shelves_health_overrides_countgaugeThe amount of health overrides that are configured in the site
carbide_power_shelves_health_status_countgaugeThe total number of Power Shelves in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts
carbide_power_shelves_iteration_latency_millisecondshistogramThe elapsed time in the last state processor iteration to handle objects of type carbide_power_shelves
carbide_power_shelves_object_tasks_enqueued_totalcounterThe amount of times that object handling tasks have been freshly enqueued for objects of type carbide_power_shelves
carbide_power_shelves_totalgaugeThe total number of carbide_power_shelves in the system
carbide_preingestion_totalgaugeThe amount of known machines currently being evaluated prior to ingestion
carbide_preingestion_waiting_downloadgaugeThe amount of machines that are waiting for firmware downloads on other machines to complete before doing their own
carbide_preingestion_waiting_installationgaugeThe amount of machines which have had firmware uploaded to them and are currently in the process of installing that firmware
carbide_racks_enqueuer_iteration_latency_millisecondshistogramThe overall time it took to enqueue state handling tasks for all carbide_racks in the system
carbide_racks_health_overrides_countgaugeThe amount of health overrides that are configured in the site
carbide_racks_health_status_countgaugeThe total number of Racks in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts
carbide_racks_iteration_latency_millisecondshistogramThe elapsed time in the last state processor iteration to handle objects of type carbide_racks
carbide_racks_object_tasks_enqueued_totalcounterThe amount of times that object handling tasks have been freshly enqueued for objects of type carbide_racks
carbide_racks_totalgaugeThe total number of carbide_racks in the system
carbide_reboot_attempts_in_booting_with_discovery_imagehistogramThe amount of machines rebooted again in BootingWithDiscoveryImage since there is no response after a certain time from host.
carbide_reserved_ips_countgaugeThe total number of reserved ips in the site
carbide_resourcepool_free_countgaugeCount of values in the pool currently available for allocation
carbide_resourcepool_used_countgaugeCount of values in the pool currently allocated
carbide_running_dpu_updates_countgaugeThe number of machines in the system that are running a firmware update.
carbide_site_exploration_expected_machines_sku_countgaugeThe total count of expected machines by SKU ID and device type
carbide_site_exploration_identified_managed_hosts_countgaugeThe amount of Host+DPU pairs that have been identified in the last SiteExplorer run
carbide_site_explorer_bmc_reset_countgaugeThe amount of BMC resets initiated in the last SiteExplorer run
carbide_site_explorer_create_machinesgaugeWhether site-explorer machine creation is enabled (1) or disabled (0)
carbide_site_explorer_create_machines_latency_millisecondshistogramThe time it took to perform create_machines inside site-explorer
carbide_site_explorer_created_machines_countgaugeThe amount of Machine pairs that had been created by Site Explorer after being identified
carbide_site_explorer_created_power_shelves_countgaugeThe amount of Power Shelves that had been created by Site Explorer after being identified
carbide_site_explorer_enabledgaugeWhether site-explorer is enabled (1) or paused (0)
carbide_site_explorer_iteration_latency_millisecondshistogramThe time it took to perform one site explorer iteration
carbide_switches_enqueuer_iteration_latency_millisecondshistogramThe overall time it took to enqueue state handling tasks for all carbide_switches in the system
carbide_switches_health_overrides_countgaugeThe amount of health overrides that are configured in the site
carbide_switches_health_status_countgaugeThe total number of Switches in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts
carbide_switches_iteration_latency_millisecondshistogramThe elapsed time in the last state processor iteration to handle objects of type carbide_switches
carbide_switches_object_tasks_enqueued_totalcounterThe amount of times that object handling tasks have been freshly enqueued for objects of type carbide_switches
carbide_switches_totalgaugeThe total number of carbide_switches in the system
carbide_total_ips_countgaugeThe total number of ips in the site
carbide_unavailable_dpu_nic_firmware_update_countgaugeThe number of machines in the system that need a firmware update but are unavailable for update.