add new drive I/O waiting/tokens metric (#18836)

Bonus: also add virtual memory used as part of the system resource metrics.
This commit is contained in:
Harshavardhana 2024-01-19 14:51:36 -08:00 committed by GitHub
parent ac81f0248c
commit e11d851aee
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 77 additions and 16 deletions

View file

@ -203,6 +203,8 @@ func getDisksInfo(disks []StorageAPI, endpoints []Endpoint, metrics bool) (disks
APICalls: make(map[string]uint64, len(info.Metrics.APICalls)),
TotalErrorsAvailability: info.Metrics.TotalErrorsAvailability,
TotalErrorsTimeout: info.Metrics.TotalErrorsTimeout,
TotalTokens: info.Metrics.TotalTokens,
TotalWaiting: info.Metrics.TotalWaiting,
}
for k, v := range info.Metrics.LastMinute {
if v.N > 0 {

View file

@ -1,4 +1,4 @@
// Copyright (c) 2015-2023 MinIO, Inc.
// Copyright (c) 2015-2024 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
@ -256,6 +256,7 @@ const (
startTime = "starttime_seconds"
upTime = "uptime_seconds"
memory = "resident_memory_bytes"
vmemory = "virtual_memory_bytes"
cpu = "cpu_total_seconds"
expiryPendingTasks MetricName = "expiry_pending_tasks"
@ -519,7 +520,7 @@ func getNodeDriveTimeoutErrorsMD() MetricDescription {
Namespace: nodeMetricNamespace,
Subsystem: driveSubsystem,
Name: "errors_timeout",
Help: "Total number of timeout errors since server start",
Help: "Total number of drive timeout errors since server start",
Type: counterMetric,
}
}
@ -529,7 +530,27 @@ func getNodeDriveAvailablityErrorsMD() MetricDescription {
Namespace: nodeMetricNamespace,
Subsystem: driveSubsystem,
Name: "errors_availability",
Help: "Total number of I/O errors, permission denied and timeouts since server start",
Help: "Total number of drive I/O errors, permission denied and timeouts since server start",
Type: counterMetric,
}
}
// getNodeDriveWaitingIOMD builds the description of the "io_waiting" metric,
// registered under the node metric namespace and the drive subsystem.
//
// NOTE(review): the metric is declared as a counter, yet the help text reads
// like an instantaneous number of waiting operations; confirm whether a gauge
// would be the more accurate type.
func getNodeDriveWaitingIOMD() MetricDescription {
	var md MetricDescription
	md.Namespace = nodeMetricNamespace
	md.Subsystem = driveSubsystem
	md.Name = "io_waiting"
	md.Help = "Total number I/O operations waiting on drive"
	md.Type = counterMetric
	return md
}
// getNodeDriveTokensIOMD builds the description of the "io_tokens" metric,
// registered under the node metric namespace and the drive subsystem.
//
// NOTE(review): the metric is declared as a counter, yet the help text
// describes a configured concurrency limit; confirm whether a gauge would be
// the more accurate type.
func getNodeDriveTokensIOMD() MetricDescription {
	var md MetricDescription
	md.Namespace = nodeMetricNamespace
	md.Subsystem = driveSubsystem
	md.Name = "io_tokens"
	md.Help = "Total number concurrent I/O operations configured on drive"
	md.Type = counterMetric
	return md
}
@ -1532,6 +1553,16 @@ func getMinIOProcessResidentMemory() MetricDescription {
}
}
// getMinIOProcessVirtualMemory returns the description of the process-level
// virtual memory gauge, registered under the node metric namespace and the
// process subsystem.
//
// The metric name must be vmemory ("virtual_memory_bytes"). Using the memory
// constant ("resident_memory_bytes") would publish this value under the same
// name as the resident-memory metric.
func getMinIOProcessVirtualMemory() MetricDescription {
	return MetricDescription{
		Namespace: nodeMetricNamespace,
		Subsystem: processSubsystem,
		Name:      vmemory,
		Help:      "Virtual memory size in bytes",
		Type:      gaugeMetric,
	}
}
func getMinIOProcessCPUTime() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
@ -1654,6 +1685,14 @@ func getMinioProcMetrics() *MetricsGroup {
})
}
if stat.VirtualMemory() > 0 {
metrics = append(metrics,
Metric{
Description: getMinIOProcessVirtualMemory(),
Value: float64(stat.VirtualMemory()),
})
}
if stat.CPUTime() > 0 {
metrics = append(metrics,
Metric{
@ -2900,6 +2939,9 @@ func getClusterUsageMetrics(opts MetricsGroupOpts) *MetricsGroup {
}
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
objLayer := newObjectLayerFn()
if objLayer == nil {
return
}
metrics = make([]Metric, 0, 50)
dataUsageInfo, err := loadDataUsageFromBackend(ctx, objLayer)
@ -3260,6 +3302,18 @@ func getLocalStorageMetrics(opts MetricsGroupOpts) *MetricsGroup {
VariableLabels: map[string]string{"drive": disk.DrivePath},
})
metrics = append(metrics, Metric{
Description: getNodeDriveWaitingIOMD(),
Value: float64(disk.Metrics.TotalWaiting),
VariableLabels: map[string]string{"drive": disk.DrivePath},
})
metrics = append(metrics, Metric{
Description: getNodeDriveTokensIOMD(),
Value: float64(disk.Metrics.TotalTokens),
VariableLabels: map[string]string{"drive": disk.DrivePath},
})
for apiName, latency := range disk.Metrics.LastMinute {
metrics = append(metrics, Metric{
Description: getNodeDriveAPILatencyMD(),

View file

@ -170,16 +170,20 @@ For deployments with [bucket](https://min.io/docs/minio/linux/administration/buc
## Drive Metrics
| Name | Description |
|:---------------------------------|:--------------------------------------------------------------------|
| `minio_node_drive_free_bytes` | Total storage available on a drive. |
| `minio_node_drive_free_inodes` | Total free inodes. |
| `minio_node_drive_latency_us` | Average last minute latency in µs for drive API storage operations. |
| `minio_node_drive_offline_total` | Total drives offline in this node. |
| `minio_node_drive_online_total` | Total drives online in this node. |
| `minio_node_drive_total` | Total drives in this node. |
| `minio_node_drive_total_bytes` | Total storage on a drive. |
| `minio_node_drive_used_bytes` | Total storage used on a drive. |
| Name | Description |
|:---------------------------------------|:------------------------------------------------------------------------------------|
| `minio_node_drive_free_bytes` | Total storage available on a drive. |
| `minio_node_drive_free_inodes` | Total free inodes. |
| `minio_node_drive_latency_us` | Average last minute latency in µs for drive API storage operations. |
| `minio_node_drive_offline_total` | Total drives offline in this node. |
| `minio_node_drive_online_total` | Total drives online in this node. |
| `minio_node_drive_total` | Total drives in this node. |
| `minio_node_drive_total_bytes` | Total storage on a drive. |
| `minio_node_drive_used_bytes` | Total storage used on a drive. |
| `minio_node_drive_errors_timeout` | Total number of drive timeout errors since server start. |
| `minio_node_drive_errors_availability` | Total number of drive I/O errors, permission denied and timeouts since server start. |
| `minio_node_drive_io_waiting` | Total number of I/O operations waiting on drive. |
| `minio_node_drive_io_tokens` | Total number of concurrent I/O operations configured on drive. |
## Identity and Access Management (IAM) Metrics
@ -228,6 +232,7 @@ For deployments with [bucket](https://min.io/docs/minio/linux/administration/buc
| `minio_node_io_write_bytes` | Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes. |
| `minio_node_process_cpu_total_seconds` | Total user and system CPU time spent in seconds. |
| `minio_node_process_resident_memory_bytes` | Resident memory size in bytes. |
| `minio_node_process_virtual_memory_bytes` | Virtual memory size in bytes. |
| `minio_node_process_starttime_seconds` | Start time for MinIO process per node, time in seconds since Unix epoch. |
| `minio_node_process_uptime_seconds` | Uptime for MinIO process per node in seconds. |

2
go.mod
View file

@ -51,7 +51,7 @@ require (
github.com/minio/dperf v0.5.3
github.com/minio/highwayhash v1.0.2
github.com/minio/kes-go v0.2.0
github.com/minio/madmin-go/v3 v3.0.38
github.com/minio/madmin-go/v3 v3.0.40-0.20240119195114-66fab65f959f
github.com/minio/minio-go/v7 v7.0.66
github.com/minio/mux v1.9.0
github.com/minio/pkg/v2 v2.0.8

4
go.sum
View file

@ -443,8 +443,8 @@ github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA
github.com/minio/highwayhash v1.0.2/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLTk+kldvVxY=
github.com/minio/kes-go v0.2.0 h1:HA33arq9s3MErbsj3PAXFVfFo4U4yw7lTKQ5kWFrpCA=
github.com/minio/kes-go v0.2.0/go.mod h1:VorHLaIYis9/MxAHAtXN4d8PUMNKhIxTIlvFt0hBOEo=
github.com/minio/madmin-go/v3 v3.0.38 h1:hgyQg43IkTq40ymFWoJwZyoqjYoT2GkiPlc1e7Bu+dY=
github.com/minio/madmin-go/v3 v3.0.38/go.mod h1:4QN2NftLSV7MdlT50dkrenOMmNVHluxTvlqJou3hte8=
github.com/minio/madmin-go/v3 v3.0.40-0.20240119195114-66fab65f959f h1:clgtVs6KUJTtKb4Xghq35gyJM/m10IwEmgfb4Do6BuY=
github.com/minio/madmin-go/v3 v3.0.40-0.20240119195114-66fab65f959f/go.mod h1:4QN2NftLSV7MdlT50dkrenOMmNVHluxTvlqJou3hte8=
github.com/minio/mc v0.0.0-20240111054932-d4305a5bf95e h1:vKnv5aBTcAAnDGYeJW/SPieXCerp/7MIYxuEUYt7iOE=
github.com/minio/mc v0.0.0-20240111054932-d4305a5bf95e/go.mod h1:wFVJTmLJniMFDkcvPP0h/KvCxK+MiA2rc6q7KUefN28=
github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=