diff --git a/crates/api-core/src/logging/service_health_metrics.rs b/crates/api-core/src/logging/service_health_metrics.rs index 32b2a4cbe3..5c985202a5 100644 --- a/crates/api-core/src/logging/service_health_metrics.rs +++ b/crates/api-core/src/logging/service_health_metrics.rs @@ -33,7 +33,7 @@ pub fn start_export_service_health_metrics(health_context: ServiceHealthContext) health_context .meter .u64_observable_gauge("carbide_api_ready") - .with_description("Whether the Forge Site Controller API is running") + .with_description("Whether the NICo API is running") .with_callback(|observer| { observer.observe(1, &[]); }) diff --git a/crates/api-core/src/logging/setup.rs b/crates/api-core/src/logging/setup.rs index 24a380f260..0e27d387be 100644 --- a/crates/api-core/src/logging/setup.rs +++ b/crates/api-core/src/logging/setup.rs @@ -309,7 +309,7 @@ pub fn create_metric_for_spancount_reader( ) { meter .u64_observable_gauge("carbide_api_tracing_spans_open") - .with_description("Whether the Forge Site Controller API is running") + .with_description("Number of open logging/tracing spans") .with_callback(move |observer| { let open_spans = if let Some(spancount_reader) = &spancount_reader { spancount_reader.open_spans() diff --git a/crates/api-core/src/tests/metrics_fixtures/test_service_health_metrics.txt b/crates/api-core/src/tests/metrics_fixtures/test_service_health_metrics.txt index d519e8f86e..def27a53be 100644 --- a/crates/api-core/src/tests/metrics_fixtures/test_service_health_metrics.txt +++ b/crates/api-core/src/tests/metrics_fixtures/test_service_health_metrics.txt @@ -1,4 +1,4 @@ -# HELP carbide_api_ready Whether the Forge Site Controller API is running +# HELP carbide_api_ready Whether the NICo API is running # TYPE carbide_api_ready gauge carbide_api_ready 1 # HELP carbide_api_version Version (git sha, build date, etc) of this service diff --git a/crates/api-integration-tests/tests/lib.rs b/crates/api-integration-tests/tests/lib.rs index 26b2ed668d..0c6d2f084f 
100644 --- a/crates/api-integration-tests/tests/lib.rs +++ b/crates/api-integration-tests/tests/lib.rs @@ -218,14 +218,14 @@ fn generate_core_metric_docs(metrics_endpoints: &[SocketAddr]) { .into_iter() .filter(|metric| !metric.name.starts_with("alt_metric")) .collect(); - let mut docs = "# NCX Infra Controller (NICo) core metrics\n\n".to_string(); + let mut docs = "# NVIDIA Infra Controller (NICo) Core Metrics\n\n".to_string(); use std::fmt::Write; use askama_escape::Escaper; writeln!( &mut docs, - "This file contains a list of metrics exported by NCX Infra Controller (NICo). \ + "This file contains a list of metrics exported by NVIDIA Infra Controller (NICo). \ The list is auto-generated from an integration test (`test_integration`). \ Metrics for workflows which are not exercised by the test are missing." ) @@ -249,7 +249,7 @@ fn generate_core_metric_docs(metrics_endpoints: &[SocketAddr]) { write!(&mut docs, "").unwrap(); writeln!(&mut docs, "").unwrap(); } - writeln!(&mut docs, "
| Name | Type | Description |
| nico_active_host_firmware_update_count | gauge | The number of host machines in the system currently working on updating their firmware. |
| nico_api_db_queries_total | counter | The amount of database queries that occurred inside a span |
| nico_api_db_span_query_time_milliseconds | histogram | Total time the request spent inside a span on database transactions |
| nico_api_grpc_server_duration_milliseconds | histogram | Processing time for a request on the nico API server |
| nico_api_ready | gauge | Whether the NICo API is running |
| nico_api_tls_connection_attempted_total | counter | The amount of tls connections that were attempted |
| nico_api_tls_connection_success_total | counter | The amount of tls connections that were successful |
| nico_api_tracing_spans_open | gauge | Number of open logging/tracing spans |
| nico_api_vault_request_duration_milliseconds | histogram | the duration of outbound vault requests, in milliseconds |
| nico_api_vault_requests_attempted_total | counter | The amount of vault requests that were attempted |
| nico_api_vault_requests_failed_total | counter | The amount of vault requests that failed |
| nico_api_vault_requests_succeeded_total | counter | The amount of vault requests that were successful |
| nico_api_vault_token_time_until_refresh_seconds | gauge | The amount of time, in seconds, until the vault token is required to be refreshed |
| nico_api_version | gauge | Version (git sha, build date, etc) of this service |
| nico_available_ips_count | gauge | The total number of available ips in the site |
| nico_concurrent_machine_updates_available | gauge | The number of machines in the system that we will update concurrently. |
| nico_db_pool_idle_conns | gauge | The amount of idle connections in the nico database pool |
| nico_db_pool_total_conns | gauge | The amount of total (active + idle) connections in the nico database pool |
| nico_dpu_agent_version_count | gauge | The amount of DPU agents which have reported a certain version. |
| nico_dpu_firmware_version_count | gauge | The amount of DPUs which have reported a certain firmware version. |
| nico_dpus_healthy_count | gauge | The total number of DPUs in the system that have reported healthy in the last report. Healthy does not imply up - the report from the DPU might be outdated. |
| nico_dpus_up_count | gauge | The total number of DPUs in the system that are up. Up means we have received a health report less than 5 minutes ago. |
| nico_endpoint_exploration_duration_milliseconds | histogram | The time it took to explore an endpoint |
| nico_endpoint_exploration_expected_machines_missing_overall_count | gauge | The total number of machines that were expected but not identified |
| nico_endpoint_exploration_expected_power_shelves_missing_overall_count | gauge | The total number of power shelves that were expected but not identified |
| nico_endpoint_exploration_identified_managed_hosts_overall_count | gauge | The total number of managed hosts identified by expectation |
| nico_endpoint_exploration_machines_explored_overall_count | gauge | The total number of machines explored by machine type |
| nico_endpoint_exploration_success_count | gauge | The amount of endpoint explorations that have been successful |
| nico_endpoint_explorations_count | gauge | The amount of endpoint explorations that have been attempted |
| nico_gpus_in_use_count | gauge | The total number of GPUs that are actively used by tenants in instances in the NICo deployment |
| nico_gpus_total_count | gauge | The total number of GPUs available in the NICo deployment |
| nico_gpus_usable_count | gauge | The remaining number of GPUs in the NICo deployment which are available for immediate instance creation |
| nico_hosts_by_sku_count | gauge | The amount of hosts by SKU and device type ('unknown' for hosts without SKU) |
| nico_hosts_health_overrides_count | gauge | The amount of health overrides that are configured in the site |
| nico_hosts_health_status_count | gauge | The total number of Managed Hosts in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts |
| nico_hosts_in_use_count | gauge | The total number of hosts that are actively used by tenants as instances in the NICo deployment |
| nico_hosts_usable_count | gauge | The remaining number of hosts in the NICo deployment which are available for immediate instance creation |
| nico_hosts_with_bios_password_set | gauge | The total number of Hosts in the system that have their BIOS password set. |
| nico_ib_partitions_enqueuer_iteration_latency_milliseconds | histogram | The overall time it took to enqueue state handling tasks for all nico_ib_partitions in the system |
| nico_ib_partitions_iteration_latency_milliseconds | histogram | The elapsed time in the last state processor iteration to handle objects of type nico_ib_partitions |
| nico_ib_partitions_object_tasks_enqueued_total | counter | The amount of object handling tasks that have been freshly enqueued for objects of type nico_ib_partitions |
| nico_ib_partitions_total | gauge | The total number of nico_ib_partitions in the system |
| nico_machine_reboot_duration_seconds | histogram | Time taken for machine/host to reboot in seconds |
| nico_machine_updates_started_count | gauge | The number of machines in the system that are in the process of updating. |
| nico_machine_validation_completed | gauge | Count of machine validation that have completed successfully |
| nico_machine_validation_failed | gauge | Count of machine validation that have failed |
| nico_machine_validation_in_progress | gauge | Count of machine validation that are in progress |
| nico_machine_validation_tests | gauge | The details of machine validation tests |
| nico_machines_enqueuer_iteration_latency_milliseconds | histogram | The overall time it took to enqueue state handling tasks for all nico_machines in the system |
| nico_machines_handler_latency_in_state_milliseconds | histogram | The amount of time it took to invoke the state handler for objects of type nico_machines in a certain state |
| nico_machines_in_maintenance_count | gauge | The total number of machines in the system that are in maintenance. |
| nico_machines_iteration_latency_milliseconds | histogram | The elapsed time in the last state processor iteration to handle objects of type nico_machines |
| nico_machines_object_tasks_completed_total | counter | The amount of object handling tasks that have been completed for objects of type nico_machines |
| nico_machines_object_tasks_dispatched_total | counter | The amount of object handling tasks that have been dequeued and dispatched for processing for objects of type nico_machines |
| nico_machines_object_tasks_enqueued_total | counter | The amount of object handling tasks that have been freshly enqueued for objects of type nico_machines |
| nico_machines_object_tasks_requeued_total | counter | The amount of object handling tasks that have been requeued for objects of type nico_machines |
| nico_machines_per_state | gauge | The number of nico_machines in the system with a given state |
| nico_machines_per_state_above_sla | gauge | The number of nico_machines in the system which had been longer in a state than allowed per SLA |
| nico_machines_state_entered_total | counter | The amount of times that objects of type nico_machines have entered a certain state |
| nico_machines_state_exited_total | counter | The amount of times that objects of type nico_machines have exited a certain state |
| nico_machines_time_in_state_seconds | histogram | The amount of time objects of type nico_machines have spent in a certain state |
| nico_machines_total | gauge | The total number of nico_machines in the system |
| nico_machines_with_state_handling_errors_per_state | gauge | The number of nico_machines in the system with a given state that failed state handling |
| nico_measured_boot_bundles_total | gauge | The total number of measured boot bundles. |
| nico_measured_boot_machines_per_bundle_state_total | gauge | The total number of machines per a given measured boot bundle state. |
| nico_measured_boot_machines_per_machine_state_total | gauge | The total number of machines per a given measured boot machine state. |
| nico_measured_boot_machines_total | gauge | The total number of machines reporting measurements. |
| nico_measured_boot_profiles_total | gauge | The total number of measured boot profiles. |
| nico_network_segments_enqueuer_iteration_latency_milliseconds | histogram | The overall time it took to enqueue state handling tasks for all nico_network_segments in the system |
| nico_network_segments_handler_latency_in_state_milliseconds | histogram | The amount of time it took to invoke the state handler for objects of type nico_network_segments in a certain state |
| nico_network_segments_iteration_latency_milliseconds | histogram | The elapsed time in the last state processor iteration to handle objects of type nico_network_segments |
| nico_network_segments_object_tasks_completed_total | counter | The amount of object handling tasks that have been completed for objects of type nico_network_segments |
| nico_network_segments_object_tasks_dispatched_total | counter | The amount of object handling tasks that have been dequeued and dispatched for processing for objects of type nico_network_segments |
| nico_network_segments_object_tasks_enqueued_total | counter | The amount of object handling tasks that have been freshly enqueued for objects of type nico_network_segments |
| nico_network_segments_object_tasks_requeued_total | counter | The amount of object handling tasks that have been requeued for objects of type nico_network_segments |
| nico_network_segments_per_state | gauge | The number of nico_network_segments in the system with a given state |
| nico_network_segments_per_state_above_sla | gauge | The number of nico_network_segments in the system which had been longer in a state than allowed per SLA |
| nico_network_segments_state_entered_total | counter | The amount of times that objects of type nico_network_segments have entered a certain state |
| nico_network_segments_state_exited_total | counter | The amount of times that objects of type nico_network_segments have exited a certain state |
| nico_network_segments_time_in_state_seconds | histogram | The amount of time objects of type nico_network_segments have spent in a certain state |
| nico_network_segments_total | gauge | The total number of nico_network_segments in the system |
| nico_network_segments_with_state_handling_errors_per_state | gauge | The number of nico_network_segments in the system with a given state that failed state handling |
| nico_nvlink_partition_monitor_nmxc_changes_applied_total | counter | Number of changes requested to NMX-C |
| nico_pending_dpu_nic_firmware_update_count | gauge | The number of machines in the system that need a firmware update. |
| nico_pending_host_firmware_update_count | gauge | The number of host machines in the system that need a firmware update. |
| nico_power_shelves_enqueuer_iteration_latency_milliseconds | histogram | The overall time it took to enqueue state handling tasks for all nico_power_shelves in the system |
| nico_power_shelves_iteration_latency_milliseconds | histogram | The elapsed time in the last state processor iteration to handle objects of type nico_power_shelves |
| nico_power_shelves_object_tasks_enqueued_total | counter | The amount of object handling tasks that have been freshly enqueued for objects of type nico_power_shelves |
| nico_power_shelves_total | gauge | The total number of nico_power_shelves in the system |
| nico_preingestion_total | gauge | The amount of known machines currently being evaluated prior to ingestion |
| nico_preingestion_waiting_download | gauge | The amount of machines that are waiting for firmware downloads on other machines to complete before doing their own |
| nico_preingestion_waiting_installation | gauge | The amount of machines which have had firmware uploaded to them and are currently in the process of installing that firmware |
| nico_racks_enqueuer_iteration_latency_milliseconds | histogram | The overall time it took to enqueue state handling tasks for all nico_racks in the system |
| nico_racks_iteration_latency_milliseconds | histogram | The elapsed time in the last state processor iteration to handle objects of type nico_racks |
| nico_racks_object_tasks_enqueued_total | counter | The amount of object handling tasks that have been freshly enqueued for objects of type nico_racks |
| nico_racks_total | gauge | The total number of nico_racks in the system |
| nico_reboot_attempts_in_booting_with_discovery_image | histogram | The amount of machines rebooted again in BootingWithDiscoveryImage since there is no response after a certain time from host. |
| nico_reserved_ips_count | gauge | The total number of reserved ips in the site |
| nico_resourcepool_free_count | gauge | Count of values in the pool currently available for allocation |
| nico_resourcepool_used_count | gauge | Count of values in the pool currently allocated |
| nico_running_dpu_updates_count | gauge | The number of machines in the system that are running a firmware update. |
| nico_site_exploration_expected_machines_sku_count | gauge | The total count of expected machines by SKU ID and device type |
| nico_site_exploration_identified_managed_hosts_count | gauge | The amount of Host+DPU pairs that have been identified in the last SiteExplorer run |
| nico_site_explorer_bmc_reset_count | gauge | The amount of BMC resets initiated in the last SiteExplorer run |
| nico_site_explorer_create_machines | gauge | Whether site-explorer machine creation is enabled (1) or disabled (0) |
| nico_site_explorer_create_machines_latency_milliseconds | histogram | The time it took to perform create_machines inside site-explorer |
| nico_site_explorer_created_machines_count | gauge | The amount of Machine pairs that had been created by Site Explorer after being identified |
| nico_site_explorer_created_power_shelves_count | gauge | The amount of Power Shelves that had been created by Site Explorer after being identified |
| nico_site_explorer_enabled | gauge | Whether site-explorer is enabled (1) or paused (0) |
| nico_site_explorer_iteration_latency_milliseconds | histogram | The time it took to perform one site explorer iteration |
| nico_switches_enqueuer_iteration_latency_milliseconds | histogram | The overall time it took to enqueue state handling tasks for all nico_switches in the system |
| nico_switches_iteration_latency_milliseconds | histogram | The elapsed time in the last state processor iteration to handle objects of type nico_switches |
| nico_switches_object_tasks_enqueued_total | counter | The amount of object handling tasks that have been freshly enqueued for objects of type nico_switches |
| nico_switches_total | gauge | The total number of nico_switches in the system |
| nico_total_ips_count | gauge | The total number of ips in the site |
| nico_unavailable_dpu_nic_firmware_update_count | gauge | The number of machines in the system that need a firmware update but are unavailable for update. |
| carbide_active_host_firmware_update_count | gauge | The number of host machines in the system currently working on updating their firmware. |
| carbide_api_db_queries_total | counter | The amount of database queries that occurred inside a span |
| carbide_api_db_span_query_time_milliseconds | histogram | Total time the request spent inside a span on database transactions |
| carbide_api_grpc_server_duration_milliseconds | histogram | Processing time for a request on the carbide API server |
| carbide_api_ready | gauge | Whether the NICo API is running |
| carbide_api_tls_connection_attempted_total | counter | The amount of tls connections that were attempted |
| carbide_api_tls_connection_success_total | counter | The amount of tls connections that were successful |
| carbide_api_tracing_spans_open | gauge | Number of open logging/tracing spans |
| carbide_api_vault_request_duration_milliseconds | histogram | the duration of outbound vault requests, in milliseconds |
| carbide_api_vault_requests_attempted_total | counter | The amount of vault requests that were attempted |
| carbide_api_vault_requests_failed_total | counter | The amount of vault requests that failed |
| carbide_api_vault_requests_succeeded_total | counter | The amount of vault requests that were successful |
| carbide_api_vault_token_time_until_refresh_seconds | gauge | The amount of time, in seconds, until the vault token is required to be refreshed |
| carbide_api_version | gauge | Version (git sha, build date, etc) of this service |
| carbide_available_ips_count | gauge | The total number of available ips in the site |
| carbide_concurrent_machine_updates_available | gauge | The number of machines in the system that we will update concurrently. |
| carbide_db_pool_idle_conns | gauge | The amount of idle connections in the carbide database pool |
| carbide_db_pool_total_conns | gauge | The amount of total (active + idle) connections in the carbide database pool |
| carbide_dpu_agent_version_count | gauge | The amount of DPU agents which have reported a certain version. |
| carbide_dpu_firmware_version_count | gauge | The amount of DPUs which have reported a certain firmware version. |
| carbide_dpus_healthy_count | gauge | The total number of DPUs in the system that have reported healthy in the last report. Healthy does not imply up - the report from the DPU might be outdated. |
| carbide_dpus_up_count | gauge | The total number of DPUs in the system that are up. Up means we have received a health report less than 5 minutes ago. |
| carbide_endpoint_exploration_duration_milliseconds | histogram | The time it took to explore an endpoint |
| carbide_endpoint_exploration_expected_machines_missing_overall_count | gauge | The total number of machines that were expected but not identified |
| carbide_endpoint_exploration_expected_power_shelves_missing_overall_count | gauge | The total number of power shelves that were expected but not identified |
| carbide_endpoint_exploration_identified_managed_hosts_overall_count | gauge | The total number of managed hosts identified by expectation |
| carbide_endpoint_exploration_machines_explored_overall_count | gauge | The total number of machines explored by machine type |
| carbide_endpoint_exploration_success_count | gauge | The amount of endpoint explorations that have been successful |
| carbide_endpoint_explorations_count | gauge | The amount of endpoint explorations that have been attempted |
| carbide_gpus_in_use_count | gauge | The total number of GPUs that are actively used by tenants in instances in the NICo deployment |
| carbide_gpus_total_count | gauge | The total number of GPUs available in the NICo deployment |
| carbide_gpus_usable_count | gauge | The remaining number of GPUs in the NICo deployment which are available for immediate instance creation |
| carbide_hosts_by_sku_count | gauge | The amount of hosts by SKU and device type ('unknown' for hosts without SKU) |
| carbide_hosts_health_overrides_count | gauge | The amount of health overrides that are configured in the site |
| carbide_hosts_health_status_count | gauge | The total number of Managed Hosts in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts |
| carbide_hosts_in_use_count | gauge | The total number of hosts that are actively used by tenants as instances in the NICo deployment |
| carbide_hosts_usable_count | gauge | The remaining number of hosts in the NICo deployment which are available for immediate instance creation |
| carbide_hosts_with_bios_password_set | gauge | The total number of Hosts in the system that have their BIOS password set. |
| carbide_ib_partitions_enqueuer_iteration_latency_milliseconds | histogram | The overall time it took to enqueue state handling tasks for all carbide_ib_partitions in the system |
| carbide_ib_partitions_iteration_latency_milliseconds | histogram | The elapsed time in the last state processor iteration to handle objects of type carbide_ib_partitions |
| carbide_ib_partitions_object_tasks_enqueued_total | counter | The amount of object handling tasks that have been freshly enqueued for objects of type carbide_ib_partitions |
| carbide_ib_partitions_total | gauge | The total number of carbide_ib_partitions in the system |
| carbide_machine_reboot_duration_seconds | histogram | Time taken for machine/host to reboot in seconds |
| carbide_machine_updates_started_count | gauge | The number of machines in the system that are in the process of updating. |
| carbide_machine_validation_completed | gauge | Count of machine validation that have completed successfully |
| carbide_machine_validation_failed | gauge | Count of machine validation that have failed |
| carbide_machine_validation_in_progress | gauge | Count of machine validation that are in progress |
| carbide_machine_validation_tests | gauge | The details of machine validation tests |
| carbide_machines_enqueuer_iteration_latency_milliseconds | histogram | The overall time it took to enqueue state handling tasks for all carbide_machines in the system |
| carbide_machines_handler_latency_in_state_milliseconds | histogram | The amount of time it took to invoke the state handler for objects of type carbide_machines in a certain state |
| carbide_machines_in_maintenance_count | gauge | The total number of machines in the system that are in maintenance. |
| carbide_machines_iteration_latency_milliseconds | histogram | The elapsed time in the last state processor iteration to handle objects of type carbide_machines |
| carbide_machines_object_tasks_completed_total | counter | The amount of object handling tasks that have been completed for objects of type carbide_machines |
| carbide_machines_object_tasks_dispatched_total | counter | The amount of object handling tasks that have been dequeued and dispatched for processing for objects of type carbide_machines |
| carbide_machines_object_tasks_enqueued_total | counter | The amount of object handling tasks that have been freshly enqueued for objects of type carbide_machines |
| carbide_machines_object_tasks_requeued_total | counter | The amount of object handling tasks that have been requeued for objects of type carbide_machines |
| carbide_machines_per_state | gauge | The number of carbide_machines in the system with a given state |
| carbide_machines_per_state_above_sla | gauge | The number of carbide_machines in the system which had been longer in a state than allowed per SLA |
| carbide_machines_state_entered_total | counter | The amount of times that objects of type carbide_machines have entered a certain state |
| carbide_machines_state_exited_total | counter | The amount of times that objects of type carbide_machines have exited a certain state |
| carbide_machines_time_in_state_seconds | histogram | The amount of time objects of type carbide_machines have spent in a certain state |
| carbide_machines_total | gauge | The total number of carbide_machines in the system |
| carbide_machines_with_state_handling_errors_per_state | gauge | The number of carbide_machines in the system with a given state that failed state handling |
| carbide_measured_boot_bundles_total | gauge | The total number of measured boot bundles. |
| carbide_measured_boot_machines_per_bundle_state_total | gauge | The total number of machines per a given measured boot bundle state. |
| carbide_measured_boot_machines_per_machine_state_total | gauge | The total number of machines per a given measured boot machine state. |
| carbide_measured_boot_machines_total | gauge | The total number of machines reporting measurements. |
| carbide_measured_boot_profiles_total | gauge | The total number of measured boot profiles. |
| carbide_network_segments_enqueuer_iteration_latency_milliseconds | histogram | The overall time it took to enqueue state handling tasks for all carbide_network_segments in the system |
| carbide_network_segments_handler_latency_in_state_milliseconds | histogram | The amount of time it took to invoke the state handler for objects of type carbide_network_segments in a certain state |
| carbide_network_segments_iteration_latency_milliseconds | histogram | The elapsed time in the last state processor iteration to handle objects of type carbide_network_segments |
| carbide_network_segments_object_tasks_completed_total | counter | The amount of object handling tasks that have been completed for objects of type carbide_network_segments |
| carbide_network_segments_object_tasks_dispatched_total | counter | The amount of object handling tasks that have been dequeued and dispatched for processing for objects of type carbide_network_segments |
| carbide_network_segments_object_tasks_enqueued_total | counter | The amount of object handling tasks that have been freshly enqueued for objects of type carbide_network_segments |
| carbide_network_segments_object_tasks_requeued_total | counter | The amount of object handling tasks that have been requeued for objects of type carbide_network_segments |
| carbide_network_segments_per_state | gauge | The number of carbide_network_segments in the system with a given state |
| carbide_network_segments_per_state_above_sla | gauge | The number of carbide_network_segments in the system which had been longer in a state than allowed per SLA |
| carbide_network_segments_state_entered_total | counter | The amount of times that objects of type carbide_network_segments have entered a certain state |
| carbide_network_segments_state_exited_total | counter | The amount of times that objects of type carbide_network_segments have exited a certain state |
| carbide_network_segments_time_in_state_seconds | histogram | The amount of time objects of type carbide_network_segments have spent in a certain state |
| carbide_network_segments_total | gauge | The total number of carbide_network_segments in the system |
| carbide_network_segments_with_state_handling_errors_per_state | gauge | The number of carbide_network_segments in the system with a given state that failed state handling |
| carbide_nvlink_partition_monitor_nmxc_changes_applied_total | counter | Number of changes requested to NMX-C |
| carbide_pending_dpu_nic_firmware_update_count | gauge | The number of machines in the system that need a firmware update. |
| carbide_pending_host_firmware_update_count | gauge | The number of host machines in the system that need a firmware update. |
| carbide_power_shelves_enqueuer_iteration_latency_milliseconds | histogram | The overall time it took to enqueue state handling tasks for all carbide_power_shelves in the system |
| carbide_power_shelves_health_overrides_count | gauge | The amount of health overrides that are configured in the site |
| carbide_power_shelves_health_status_count | gauge | The total number of Power Shelves in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts |
| carbide_power_shelves_iteration_latency_milliseconds | histogram | The elapsed time in the last state processor iteration to handle objects of type carbide_power_shelves |
| carbide_power_shelves_object_tasks_enqueued_total | counter | The amount of object handling tasks that have been freshly enqueued for objects of type carbide_power_shelves |
| carbide_power_shelves_total | gauge | The total number of carbide_power_shelves in the system |
| carbide_preingestion_total | gauge | The amount of known machines currently being evaluated prior to ingestion |
| carbide_preingestion_waiting_download | gauge | The amount of machines that are waiting for firmware downloads on other machines to complete before doing their own |
| carbide_preingestion_waiting_installation | gauge | The amount of machines which have had firmware uploaded to them and are currently in the process of installing that firmware |
| carbide_racks_enqueuer_iteration_latency_milliseconds | histogram | The overall time it took to enqueue state handling tasks for all carbide_racks in the system |
| carbide_racks_health_overrides_count | gauge | The amount of health overrides that are configured in the site |
| carbide_racks_health_status_count | gauge | The total number of Racks in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts |
| carbide_racks_iteration_latency_milliseconds | histogram | The elapsed time in the last state processor iteration to handle objects of type carbide_racks |
| carbide_racks_object_tasks_enqueued_total | counter | The amount of object handling tasks that have been freshly enqueued for objects of type carbide_racks |
| carbide_racks_total | gauge | The total number of carbide_racks in the system |
| carbide_reboot_attempts_in_booting_with_discovery_image | histogram | The amount of machines rebooted again in BootingWithDiscoveryImage since there is no response after a certain time from host. |
| carbide_reserved_ips_count | gauge | The total number of reserved ips in the site |
| carbide_resourcepool_free_count | gauge | Count of values in the pool currently available for allocation |
| carbide_resourcepool_used_count | gauge | Count of values in the pool currently allocated |
| carbide_running_dpu_updates_count | gauge | The number of machines in the system that are running a firmware update. |
| carbide_site_exploration_expected_machines_sku_count | gauge | The total count of expected machines by SKU ID and device type |
| carbide_site_exploration_identified_managed_hosts_count | gauge | The amount of Host+DPU pairs that have been identified in the last SiteExplorer run |
| carbide_site_explorer_bmc_reset_count | gauge | The amount of BMC resets initiated in the last SiteExplorer run |
| carbide_site_explorer_create_machines | gauge | Whether site-explorer machine creation is enabled (1) or disabled (0) |
| carbide_site_explorer_create_machines_latency_milliseconds | histogram | The time it took to perform create_machines inside site-explorer |
| carbide_site_explorer_created_machines_count | gauge | The amount of Machine pairs that had been created by Site Explorer after being identified |
| carbide_site_explorer_created_power_shelves_count | gauge | The amount of Power Shelves that had been created by Site Explorer after being identified |
| carbide_site_explorer_enabled | gauge | Whether site-explorer is enabled (1) or paused (0) |
| carbide_site_explorer_iteration_latency_milliseconds | histogram | The time it took to perform one site explorer iteration |
| carbide_switches_enqueuer_iteration_latency_milliseconds | histogram | The overall time it took to enqueue state handling tasks for all carbide_switches in the system |
| carbide_switches_health_overrides_count | gauge | The amount of health overrides that are configured in the site |
| carbide_switches_health_status_count | gauge | The total number of Switches in the system that have reported either a healthy or not healthy status - based on the presence of health probe alerts |
| carbide_switches_iteration_latency_milliseconds | histogram | The elapsed time in the last state processor iteration to handle objects of type carbide_switches |
| carbide_switches_object_tasks_enqueued_total | counter | The amount of object handling tasks that have been freshly enqueued for objects of type carbide_switches |
| carbide_switches_total | gauge | The total number of carbide_switches in the system |
| carbide_total_ips_count | gauge | The total number of ips in the site |
| carbide_unavailable_dpu_nic_firmware_update_count | gauge | The number of machines in the system that need a firmware update but are unavailable for update. |