minio/cmd/metrics-v3.go
Shireesh Anjal 4caa3422bd
Add process metrics in metrics-v3 (#19612)
endpoint: /minio/metrics/v3/system/process
metrics:
- locks_read_total
- locks_write_total
- cpu_total_seconds
- go_routine_total
- io_rchar_bytes
- io_read_bytes
- io_wchar_bytes
- io_write_bytes
- start_time_seconds
- uptime_seconds
- file_descriptor_limit_total
- file_descriptor_open_total
- syscall_read_total
- syscall_write_total
- resident_memory_bytes
- virtual_memory_bytes
- virtual_memory_max_bytes

Since the standard process collector implements only a subset of these
metrics, remove it and implement our own custom process collector that
captures all the process metrics we need.
2024-04-26 09:07:23 -07:00

363 lines
10 KiB
Go

// Copyright (c) 2015-2024 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package cmd
import (
"slices"
"strings"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
)
// Collector paths.
//
// These are paths under the top-level /minio/metrics/v3 metrics endpoint. Each
// of these paths returns a set of V3 metrics.
//
// newMetricGroups adds the server label to the metrics of every group whose
// path does not start with clusterBasePath.
const (
// API metrics. apiBucketCollectorPath is backed by a bucket metrics group.
apiRequestsCollectorPath collectorPath = "/api/requests"
apiBucketCollectorPath collectorPath = "/api/bucket"
// System metrics.
systemNetworkInternodeCollectorPath collectorPath = "/system/network/internode"
systemDriveCollectorPath collectorPath = "/system/drive"
systemMemoryCollectorPath collectorPath = "/system/memory"
systemCPUCollectorPath collectorPath = "/system/cpu"
systemProcessCollectorPath collectorPath = "/system/process"
// Served by the standard Prometheus Go collector, not by a MetricsGroup.
systemGoCollectorPath collectorPath = "/system/go"
// Cluster metrics (all under clusterBasePath).
// clusterUsageBucketsCollectorPath is backed by a bucket metrics group.
clusterHealthCollectorPath collectorPath = "/cluster/health"
clusterUsageObjectsCollectorPath collectorPath = "/cluster/usage/objects"
clusterUsageBucketsCollectorPath collectorPath = "/cluster/usage/buckets"
clusterErasureSetCollectorPath collectorPath = "/cluster/erasure-set"
clusterAuditCollectorPath collectorPath = "/cluster/audit"
clusterNotificationCollectorPath collectorPath = "/cluster/notification"
)
const (
// clusterBasePath is the path prefix shared by the cluster collector paths.
// newMetricGroups checks a group's path against this prefix to decide
// whether the group gets the extra server label (cluster groups do not).
clusterBasePath = "/cluster"
)
// metricsV3Collection bundles the V3 metrics groups and the Prometheus
// gatherers created for them by newMetricGroups, keyed by collector path.
type metricsV3Collection struct {
// mgMap holds the metrics groups that are not bucket metrics groups.
mgMap map[collectorPath]*MetricsGroup
// bucketMGMap holds the bucket metrics groups.
bucketMGMap map[collectorPath]*MetricsGroup
// Gatherers keyed by collector path. newMetricGroups stores one gatherer
// for every registered collector (all metrics groups plus the Go collector).
// NOTE(review): the original comment described this as "gatherers for
// non-bucket MetricsGroup's" - presumably only non-bucket paths are looked
// up here by the request handler; confirm against the caller.
mgGatherers map[collectorPath]prometheus.Gatherer
// collectorPaths lists every registered collector path in sorted order.
collectorPaths []collectorPath
}
// newMetricGroups creates all V3 metrics groups, registers a collector for
// each collector path with r, and returns the resulting collection.
//
// Every metrics group shares a single metrics cache. Groups whose path is not
// under clusterBasePath additionally get the serverName label set to the local
// node name. Each collector (every metrics group plus the standard Go
// collector) is placed in its own sub-registry, which is both registered with
// r and kept as the gatherer for that path.
//
// Registration uses MustRegister, so a registration failure panics.
func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
	// Create all metric groups.
	apiRequestsMG := NewMetricsGroup(apiRequestsCollectorPath,
		[]MetricDescriptor{
			apiRejectedAuthTotalMD,
			apiRejectedHeaderTotalMD,
			apiRejectedTimestampTotalMD,
			apiRejectedInvalidTotalMD,

			apiRequestsWaitingTotalMD,
			apiRequestsIncomingTotalMD,

			apiRequestsInFlightTotalMD,
			apiRequestsTotalMD,
			apiRequestsErrorsTotalMD,
			apiRequests5xxErrorsTotalMD,
			apiRequests4xxErrorsTotalMD,
			apiRequestsCanceledTotalMD,

			apiRequestsTTFBSecondsDistributionMD,

			apiTrafficSentBytesMD,
			apiTrafficRecvBytesMD,
		},
		JoinLoaders(loadAPIRequestsHTTPMetrics, loadAPIRequestsTTFBMetrics,
			loadAPIRequestsNetworkMetrics),
	)

	apiBucketMG := NewBucketMetricsGroup(apiBucketCollectorPath,
		[]MetricDescriptor{
			apiBucketTrafficRecvBytesMD,
			apiBucketTrafficSentBytesMD,

			apiBucketRequestsInFlightMD,
			apiBucketRequestsTotalMD,
			apiBucketRequestsCanceledMD,
			apiBucketRequests4xxErrorsMD,
			apiBucketRequests5xxErrorsMD,

			apiBucketRequestsTTFBSecondsDistributionMD,
		},
		JoinBucketLoaders(loadAPIBucketHTTPMetrics, loadAPIBucketTTFBMetrics),
	)

	systemNetworkInternodeMG := NewMetricsGroup(systemNetworkInternodeCollectorPath,
		[]MetricDescriptor{
			internodeErrorsTotalMD,
			internodeDialedErrorsTotalMD,
			internodeDialAvgTimeNanosMD,
			internodeSentBytesTotalMD,
			internodeRecvBytesTotalMD,
		},
		loadNetworkInternodeMetrics,
	)

	systemMemoryMG := NewMetricsGroup(systemMemoryCollectorPath,
		[]MetricDescriptor{
			memTotalMD,
			memUsedMD,
			memFreeMD,
			memAvailableMD,
			memBuffersMD,
			memCacheMD,
			memSharedMD,
			memUsedPercMD,
		},
		loadMemoryMetrics,
	)

	systemCPUMG := NewMetricsGroup(systemCPUCollectorPath,
		[]MetricDescriptor{
			sysCPUAvgIdleMD,
			sysCPUAvgIOWaitMD,
			sysCPULoadMD,
			sysCPULoadPercMD,
			sysCPUNiceMD,
			sysCPUStealMD,
			sysCPUSystemMD,
			sysCPUUserMD,
		},
		loadCPUMetrics,
	)

	systemProcessMG := NewMetricsGroup(systemProcessCollectorPath,
		[]MetricDescriptor{
			processLocksReadTotalMD,
			processLocksWriteTotalMD,
			processCPUTotalSecondsMD,
			processGoRoutineTotalMD,
			processIORCharBytesMD,
			processIOReadBytesMD,
			processIOWCharBytesMD,
			processIOWriteBytesMD,
			processStarttimeSecondsMD,
			processUptimeSecondsMD,
			processFileDescriptorLimitTotalMD,
			processFileDescriptorOpenTotalMD,
			processSyscallReadTotalMD,
			processSyscallWriteTotalMD,
			processResidentMemoryBytesMD,
			processVirtualMemoryBytesMD,
			processVirtualMemoryMaxBytesMD,
		},
		loadProcessMetrics,
	)

	systemDriveMG := NewMetricsGroup(systemDriveCollectorPath,
		[]MetricDescriptor{
			driveUsedBytesMD,
			driveFreeBytesMD,
			driveTotalBytesMD,
			driveUsedInodesMD,
			driveFreeInodesMD,
			driveTotalInodesMD,
			driveTimeoutErrorsMD,
			driveIOErrorsMD,
			driveAvailabilityErrorsMD,
			driveWaitingIOMD,
			driveAPILatencyMD,
			driveHealingMD,
			driveOnlineMD,

			driveOfflineCountMD,
			driveOnlineCountMD,
			driveCountMD,

			// iostat related
			driveReadsPerSecMD,
			driveReadsKBPerSecMD,
			driveReadsAwaitMD,
			driveWritesPerSecMD,
			driveWritesKBPerSecMD,
			driveWritesAwaitMD,
			drivePercUtilMD,
		},
		loadDriveMetrics,
	)

	clusterHealthMG := NewMetricsGroup(clusterHealthCollectorPath,
		[]MetricDescriptor{
			healthDrivesOfflineCountMD,
			healthDrivesOnlineCountMD,
			healthDrivesCountMD,

			healthNodesOfflineCountMD,
			healthNodesOnlineCountMD,

			healthCapacityRawTotalBytesMD,
			healthCapacityRawFreeBytesMD,
			healthCapacityUsableTotalBytesMD,
			healthCapacityUsableFreeBytesMD,
		},
		JoinLoaders(loadClusterHealthDriveMetrics,
			loadClusterHealthNodeMetrics,
			loadClusterHealthCapacityMetrics),
	)

	clusterUsageObjectsMG := NewMetricsGroup(clusterUsageObjectsCollectorPath,
		[]MetricDescriptor{
			usageSinceLastUpdateSecondsMD,
			usageTotalBytesMD,
			usageObjectsCountMD,
			usageVersionsCountMD,
			usageDeleteMarkersCountMD,
			usageBucketsCountMD,
			usageObjectsDistributionMD,
			usageVersionsDistributionMD,
		},
		loadClusterUsageObjectMetrics,
	)

	clusterUsageBucketsMG := NewBucketMetricsGroup(clusterUsageBucketsCollectorPath,
		[]MetricDescriptor{
			usageSinceLastUpdateSecondsMD,
			usageBucketTotalBytesMD,
			usageBucketObjectsTotalMD,
			usageBucketVersionsCountMD,
			usageBucketDeleteMarkersCountMD,
			usageBucketQuotaTotalBytesMD,
			usageBucketObjectSizeDistributionMD,
			usageBucketObjectVersionCountDistributionMD,
		},
		loadClusterUsageBucketMetrics,
	)

	clusterErasureSetMG := NewMetricsGroup(clusterErasureSetCollectorPath,
		[]MetricDescriptor{
			erasureSetOverallWriteQuorumMD,
			erasureSetOverallHealthMD,
			erasureSetReadQuorumMD,
			erasureSetWriteQuorumMD,
			erasureSetOnlineDrivesCountMD,
			erasureSetHealingDrivesCountMD,
			erasureSetHealthMD,
		},
		loadClusterErasureSetMetrics,
	)

	clusterAuditMG := NewMetricsGroup(clusterAuditCollectorPath,
		[]MetricDescriptor{
			auditFailedMessagesMD,
			auditTargetQueueLengthMD,
			auditTotalMessagesMD,
		},
		loadClusterAuditMetrics,
	)

	clusterNotificationMG := NewMetricsGroup(clusterNotificationCollectorPath,
		[]MetricDescriptor{
			notificationCurrentSendInProgressMD,
			notificationEventsErrorsTotalMD,
			notificationEventsSentTotalMD,
			notificationEventsSkippedTotalMD,
		},
		loadClusterNotificationMetrics,
	)

	allMetricGroups := []*MetricsGroup{
		apiRequestsMG,
		apiBucketMG,

		systemNetworkInternodeMG,
		systemDriveMG,
		systemMemoryMG,
		systemCPUMG,
		systemProcessMG,

		clusterHealthMG,
		clusterUsageObjectsMG,
		clusterUsageBucketsMG,
		clusterErasureSetMG,
		clusterAuditMG,
		clusterNotificationMG,
	}

	// Bucket metrics are special, they always include the bucket label. These
	// metrics require a list of buckets to be passed to the loader, and the
	// list of buckets is not known until the request is made. So we keep a
	// separate map for bucket metrics and handle them specially.

	// Add the serverName label to all non-cluster metrics. (The poolIndex
	// label is currently disabled; see the commented-out argument below.)
	//
	// Also create metric group maps and set the cache.
	metricsCache := newMetricsCache()
	mgMap := make(map[collectorPath]*MetricsGroup)
	bucketMGMap := make(map[collectorPath]*MetricsGroup)
	for _, mg := range allMetricGroups {
		if !strings.HasPrefix(string(mg.CollectorPath), clusterBasePath) {
			mg.AddExtraLabels(
				serverName, globalLocalNodeName,
				// poolIndex, strconv.Itoa(globalLocalPoolIdx),
			)
		}
		mg.SetCache(metricsCache)
		if mg.IsBucketMetricsGroup() {
			bucketMGMap[mg.CollectorPath] = mg
		} else {
			mgMap[mg.CollectorPath] = mg
		}
	}

	// Prepare to register the collectors. Other than `MetricsGroup` collectors,
	// we also have standard collectors like `GoCollector`.

	// Create all non-`MetricsGroup` collectors here. The map is deliberately
	// not named `collectors`, so that it does not shadow the imported
	// `collectors` package for the rest of this function.
	pathCollectors := map[collectorPath]prometheus.Collector{
		systemGoCollectorPath: collectors.NewGoCollector(),
	}

	// Add all `MetricsGroup` collectors to the map.
	for _, mg := range allMetricGroups {
		pathCollectors[mg.CollectorPath] = mg
	}

	// Helper function to register a collector and return a gatherer for it.
	// The collectors go into a fresh sub-registry, which is itself registered
	// with the parent registry r and returned as the gatherer.
	mustRegister := func(c ...prometheus.Collector) prometheus.Gatherer {
		subRegistry := prometheus.NewRegistry()
		for _, col := range c {
			subRegistry.MustRegister(col)
		}
		r.MustRegister(subRegistry)
		return subRegistry
	}

	// Register all collectors and create gatherers for them.
	gatherers := make(map[collectorPath]prometheus.Gatherer, len(pathCollectors))
	collectorPaths := make([]collectorPath, 0, len(pathCollectors))
	for path, c := range pathCollectors {
		gatherers[path] = mustRegister(c)
		collectorPaths = append(collectorPaths, path)
	}
	// Map iteration order is random; sort so the path list is deterministic.
	slices.Sort(collectorPaths)

	return &metricsV3Collection{
		mgMap:          mgMap,
		bucketMGMap:    bucketMGMap,
		mgGatherers:    gatherers,
		collectorPaths: collectorPaths,
	}
}