From 7981509cc85c35d2df7e7e42fed9a067e429535a Mon Sep 17 00:00:00 2001 From: Shireesh Anjal <355479+anjalshireesh@users.noreply.github.com> Date: Thu, 23 May 2024 13:11:18 +0530 Subject: [PATCH] Add cluster and bucket replication metrics in metrics-v3 (#19546) endpoint: /minio/metrics/v3/replication metrics: - average_active_workers - average_queued_bytes - average_queued_count - average_data_transfer_rate - current_active_workers - current_data_transfer_rate - last_minute_queued_bytes - last_minute_queued_count - max_active_workers - max_queued_bytes - max_queued_count - max_data_transfer_rate - recent_backlog_count endpoint: /minio/metrics/v3/bucket/replication metrics: - last_hour_failed_bytes - last_hour_failed_count - last_minute_failed_bytes - last_minute_failed_count - latency_ms - proxied_delete_tagging_requests_total - proxied_get_requests_failures - proxied_get_requests_total - proxied_get_tagging_requests_failures - proxied_get_tagging_requests_total - proxied_head_requests_failures - proxied_head_requests_total - proxied_put_tagging_requests_failures - proxied_put_tagging_requests_total - sent_bytes - sent_count - total_failed_bytes - total_failed_count - proxied_delete_tagging_requests_failures --- cmd/metrics-v3-api.go | 24 ++--- cmd/metrics-v3-bucket-replication.go | 155 +++++++++++++++++++++++++++ cmd/metrics-v3-replication.go | 96 +++++++++++++++++ cmd/metrics-v3-types.go | 12 ++- cmd/metrics-v3.go | 72 ++++++++++--- docs/metrics/v3.md | 81 ++++++++++---- 6 files changed, 395 insertions(+), 45 deletions(-) create mode 100644 cmd/metrics-v3-bucket-replication.go create mode 100644 cmd/metrics-v3-replication.go diff --git a/cmd/metrics-v3-api.go b/cmd/metrics-v3-api.go index b12fe3a5f..548d72172 100644 --- a/cmd/metrics-v3-api.go +++ b/cmd/metrics-v3-api.go @@ -144,33 +144,33 @@ func loadAPIRequestsNetworkMetrics(ctx context.Context, m MetricValues, _ *metri // Metric Descriptions for bucket level S3 metrics. 
var ( - apiBucketTrafficSentBytesMD = NewCounterMD(apiTrafficSentBytes, + bucketAPITrafficSentBytesMD = NewCounterMD(apiTrafficSentBytes, "Total number of bytes received for a bucket", "bucket", "type") - apiBucketTrafficRecvBytesMD = NewCounterMD(apiTrafficRecvBytes, + bucketAPITrafficRecvBytesMD = NewCounterMD(apiTrafficRecvBytes, "Total number of bytes sent for a bucket", "bucket", "type") - apiBucketRequestsInFlightMD = NewGaugeMD(apiRequestsInFlightTotal, + bucketAPIRequestsInFlightMD = NewGaugeMD(apiRequestsInFlightTotal, "Total number of requests currently in flight for a bucket", "bucket", "name", "type") - apiBucketRequestsTotalMD = NewCounterMD(apiRequestsTotal, + bucketAPIRequestsTotalMD = NewCounterMD(apiRequestsTotal, "Total number of requests for a bucket", "bucket", "name", "type") - apiBucketRequestsCanceledMD = NewCounterMD(apiRequestsCanceledTotal, + bucketAPIRequestsCanceledMD = NewCounterMD(apiRequestsCanceledTotal, "Total number of requests canceled by the client for a bucket", "bucket", "name", "type") - apiBucketRequests4xxErrorsMD = NewCounterMD(apiRequests4xxErrorsTotal, + bucketAPIRequests4xxErrorsMD = NewCounterMD(apiRequests4xxErrorsTotal, "Total number of requests with 4xx errors for a bucket", "bucket", "name", "type") - apiBucketRequests5xxErrorsMD = NewCounterMD(apiRequests5xxErrorsTotal, + bucketAPIRequests5xxErrorsMD = NewCounterMD(apiRequests5xxErrorsTotal, "Total number of requests with 5xx errors for a bucket", "bucket", "name", "type") - apiBucketRequestsTTFBSecondsDistributionMD = NewCounterMD(apiRequestsTTFBSecondsDistribution, + bucketAPIRequestsTTFBSecondsDistributionMD = NewCounterMD(apiRequestsTTFBSecondsDistribution, "Distribution of time to first byte across API calls for a bucket", "bucket", "name", "le", "type") ) -// loadAPIBucketHTTPMetrics - loads bucket level S3 HTTP metrics. +// loadBucketAPIHTTPMetrics - loads bucket level S3 HTTP metrics. // // This is a `MetricsLoaderFn`. 
// // This includes bucket level S3 HTTP metrics and S3 network in/out metrics. -func loadAPIBucketHTTPMetrics(ctx context.Context, m MetricValues, _ *metricsCache, buckets []string) error { +func loadBucketAPIHTTPMetrics(ctx context.Context, m MetricValues, _ *metricsCache, buckets []string) error { if len(buckets) == 0 { return nil } @@ -209,10 +209,10 @@ func loadAPIBucketHTTPMetrics(ctx context.Context, m MetricValues, _ *metricsCac return nil } -// loadAPIBucketTTFBMetrics - loads bucket S3 TTFB metrics. +// loadBucketAPITTFBMetrics - loads bucket S3 TTFB metrics. // // This is a `MetricsLoaderFn`. -func loadAPIBucketTTFBMetrics(ctx context.Context, m MetricValues, _ *metricsCache, buckets []string) error { +func loadBucketAPITTFBMetrics(ctx context.Context, m MetricValues, _ *metricsCache, buckets []string) error { renameLabels := map[string]string{"api": "name"} m.SetHistogram(apiRequestsTTFBSecondsDistribution, bucketHTTPRequestsDuration, renameLabels, buckets, "type", "s3") diff --git a/cmd/metrics-v3-bucket-replication.go b/cmd/metrics-v3-bucket-replication.go new file mode 100644 index 000000000..64f65e832 --- /dev/null +++ b/cmd/metrics-v3-bucket-replication.go @@ -0,0 +1,155 @@ +// Copyright (c) 2015-2024 MinIO, Inc. +// +// This file is part of MinIO Object Storage stack +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+ +package cmd + +import ( + "context" +) + +const ( + bucketReplLastHrFailedBytes = "last_hour_failed_bytes" + bucketReplLastHrFailedCount = "last_hour_failed_count" + bucketReplLastMinFailedBytes = "last_minute_failed_bytes" + bucketReplLastMinFailedCount = "last_minute_failed_count" + bucketReplLatencyMs = "latency_ms" + bucketReplProxiedDeleteTaggingRequestsTotal = "proxied_delete_tagging_requests_total" + bucketReplProxiedGetRequestsFailures = "proxied_get_requests_failures" + bucketReplProxiedGetRequestsTotal = "proxied_get_requests_total" + bucketReplProxiedGetTaggingRequestsFailures = "proxied_get_tagging_requests_failures" + bucketReplProxiedGetTaggingRequestsTotal = "proxied_get_tagging_requests_total" + bucketReplProxiedHeadRequestsFailures = "proxied_head_requests_failures" + bucketReplProxiedHeadRequestsTotal = "proxied_head_requests_total" + bucketReplProxiedPutTaggingRequestsFailures = "proxied_put_tagging_requests_failures" + bucketReplProxiedPutTaggingRequestsTotal = "proxied_put_tagging_requests_total" + bucketReplSentBytes = "sent_bytes" + bucketReplSentCount = "sent_count" + bucketReplTotalFailedBytes = "total_failed_bytes" + bucketReplTotalFailedCount = "total_failed_count" + bucketReplProxiedDeleteTaggingRequestsFailures = "proxied_delete_tagging_requests_failures" + bucketL = "bucket" + operationL = "operation" + targetArnL = "targetArn" +) + +var ( + bucketReplLastHrFailedBytesMD = NewGaugeMD(bucketReplLastHrFailedBytes, + "Total number of bytes failed at least once to replicate in the last hour on a bucket", + bucketL) + bucketReplLastHrFailedCountMD = NewGaugeMD(bucketReplLastHrFailedCount, + "Total number of objects which failed replication in the last hour on a bucket", + bucketL) + bucketReplLastMinFailedBytesMD = NewGaugeMD(bucketReplLastMinFailedBytes, + "Total number of bytes failed at least once to replicate in the last full minute on a bucket", + bucketL) + bucketReplLastMinFailedCountMD = NewGaugeMD(bucketReplLastMinFailedCount, + 
"Total number of objects which failed replication in the last full minute on a bucket", + bucketL) + bucketReplLatencyMsMD = NewGaugeMD(bucketReplLatencyMs, + "Replication latency on a bucket in milliseconds", + bucketL, operationL, rangeL, targetArnL) + bucketReplProxiedDeleteTaggingRequestsTotalMD = NewCounterMD(bucketReplProxiedDeleteTaggingRequestsTotal, + "Number of DELETE tagging requests proxied to replication target", + bucketL) + bucketReplProxiedGetRequestsFailuresMD = NewCounterMD(bucketReplProxiedGetRequestsFailures, + "Number of failures in GET requests proxied to replication target", + bucketL) + bucketReplProxiedGetRequestsTotalMD = NewCounterMD(bucketReplProxiedGetRequestsTotal, + "Number of GET requests proxied to replication target", + bucketL) + bucketReplProxiedGetTaggingRequestsFailuresMD = NewCounterMD(bucketReplProxiedGetTaggingRequestsFailures, + "Number of failures in GET tagging requests proxied to replication target", + bucketL) + bucketReplProxiedGetTaggingRequestsTotalMD = NewCounterMD(bucketReplProxiedGetTaggingRequestsTotal, + "Number of GET tagging requests proxied to replication target", + bucketL) + bucketReplProxiedHeadRequestsFailuresMD = NewCounterMD(bucketReplProxiedHeadRequestsFailures, + "Number of failures in HEAD requests proxied to replication target", + bucketL) + bucketReplProxiedHeadRequestsTotalMD = NewCounterMD(bucketReplProxiedHeadRequestsTotal, + "Number of HEAD requests proxied to replication target", + bucketL) + bucketReplProxiedPutTaggingRequestsFailuresMD = NewCounterMD(bucketReplProxiedPutTaggingRequestsFailures, + "Number of failures in PUT tagging requests proxied to replication target", + bucketL) + bucketReplProxiedPutTaggingRequestsTotalMD = NewCounterMD(bucketReplProxiedPutTaggingRequestsTotal, + "Number of PUT tagging requests proxied to replication target", + bucketL) + bucketReplSentBytesMD = NewCounterMD(bucketReplSentBytes, + "Total number of bytes replicated to the target", + bucketL) + 
bucketReplSentCountMD = NewCounterMD(bucketReplSentCount, + "Total number of objects replicated to the target", + bucketL) + bucketReplTotalFailedBytesMD = NewCounterMD(bucketReplTotalFailedBytes, + "Total number of bytes failed at least once to replicate since server start", + bucketL) + bucketReplTotalFailedCountMD = NewCounterMD(bucketReplTotalFailedCount, + "Total number of objects which failed replication since server start", + bucketL) + bucketReplProxiedDeleteTaggingRequestsFailuresMD = NewCounterMD(bucketReplProxiedDeleteTaggingRequestsFailures, + "Number of failures in DELETE tagging requests proxied to replication target", + bucketL) +) + +// loadBucketReplicationMetrics - `BucketMetricsLoaderFn` for bucket replication metrics +// such as latency and sent bytes. +func loadBucketReplicationMetrics(ctx context.Context, m MetricValues, c *metricsCache, buckets []string) error { + if globalSiteReplicationSys.isEnabled() { + return nil + } + + dataUsageInfo, err := c.dataUsageInfo.Get() + if err != nil { + metricsLogIf(ctx, err) + return nil + } + + bucketReplStats := globalReplicationStats.getAllLatest(dataUsageInfo.BucketsUsage) + for _, bucket := range buckets { + labels := []string{bucketL, bucket} + if s, ok := bucketReplStats[bucket]; ok { + stats := s.ReplicationStats + if stats.hasReplicationUsage() { + for arn, stat := range stats.Stats { + m.Set(bucketReplLastHrFailedBytes, float64(stat.Failed.LastHour.Bytes), labels...) + m.Set(bucketReplLastHrFailedCount, float64(stat.Failed.LastHour.Count), labels...) + m.Set(bucketReplLastMinFailedBytes, float64(stat.Failed.LastMinute.Bytes), labels...) + m.Set(bucketReplLastMinFailedCount, float64(stat.Failed.LastMinute.Count), labels...) + m.Set(bucketReplProxiedDeleteTaggingRequestsTotal, float64(s.ProxyStats.RmvTagTotal), labels...) + m.Set(bucketReplProxiedGetRequestsFailures, float64(s.ProxyStats.GetFailedTotal), labels...) 
+ m.Set(bucketReplProxiedGetRequestsTotal, float64(s.ProxyStats.GetTotal), labels...) + m.Set(bucketReplProxiedGetTaggingRequestsFailures, float64(s.ProxyStats.GetTagFailedTotal), labels...) + m.Set(bucketReplProxiedGetTaggingRequestsTotal, float64(s.ProxyStats.GetTagTotal), labels...) + m.Set(bucketReplProxiedHeadRequestsFailures, float64(s.ProxyStats.HeadFailedTotal), labels...) + m.Set(bucketReplProxiedHeadRequestsTotal, float64(s.ProxyStats.HeadTotal), labels...) + m.Set(bucketReplProxiedPutTaggingRequestsFailures, float64(s.ProxyStats.PutTagFailedTotal), labels...) + m.Set(bucketReplProxiedPutTaggingRequestsTotal, float64(s.ProxyStats.PutTagTotal), labels...) + m.Set(bucketReplSentCount, float64(stat.ReplicatedCount), labels...) + m.Set(bucketReplTotalFailedBytes, float64(stat.Failed.Totals.Bytes), labels...) + m.Set(bucketReplTotalFailedCount, float64(stat.Failed.Totals.Count), labels...) + m.Set(bucketReplProxiedDeleteTaggingRequestsFailures, float64(s.ProxyStats.RmvTagFailedTotal), labels...) + m.Set(bucketReplSentBytes, float64(stat.ReplicatedSize), labels...) + + SetHistogramValues(m, bucketReplLatencyMs, stat.Latency.getUploadLatency(), bucketL, bucket, operationL, "upload", targetArnL, arn) + } + } + } + } + + return nil +} diff --git a/cmd/metrics-v3-replication.go b/cmd/metrics-v3-replication.go new file mode 100644 index 000000000..1961c3304 --- /dev/null +++ b/cmd/metrics-v3-replication.go @@ -0,0 +1,96 @@ +// Copyright (c) 2015-2024 MinIO, Inc. +// +// This file is part of MinIO Object Storage stack +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package cmd + +import ( + "context" +) + +const ( + replicationAverageActiveWorkers = "average_active_workers" + replicationAverageQueuedBytes = "average_queued_bytes" + replicationAverageQueuedCount = "average_queued_count" + replicationAverageDataTransferRate = "average_data_transfer_rate" + replicationCurrentActiveWorkers = "current_active_workers" + replicationCurrentDataTransferRate = "current_data_transfer_rate" + replicationLastMinuteQueuedBytes = "last_minute_queued_bytes" + replicationLastMinuteQueuedCount = "last_minute_queued_count" + replicationMaxActiveWorkers = "max_active_workers" + replicationMaxQueuedBytes = "max_queued_bytes" + replicationMaxQueuedCount = "max_queued_count" + replicationMaxDataTransferRate = "max_data_transfer_rate" +) + +var ( + replicationAverageActiveWorkersMD = NewGaugeMD(replicationAverageActiveWorkers, + "Average number of active replication workers") + replicationAverageQueuedBytesMD = NewGaugeMD(replicationAverageQueuedBytes, + "Average number of bytes queued for replication since server start") + replicationAverageQueuedCountMD = NewGaugeMD(replicationAverageQueuedCount, + "Average number of objects queued for replication since server start") + replicationAverageDataTransferRateMD = NewGaugeMD(replicationAverageDataTransferRate, + "Average replication data transfer rate in bytes/sec") + replicationCurrentActiveWorkersMD = NewGaugeMD(replicationCurrentActiveWorkers, + "Total number of active replication workers") + replicationCurrentDataTransferRateMD = NewGaugeMD(replicationCurrentDataTransferRate, + "Current replication data transfer rate in 
bytes/sec") + replicationLastMinuteQueuedBytesMD = NewGaugeMD(replicationLastMinuteQueuedBytes, + "Number of bytes queued for replication in the last full minute") + replicationLastMinuteQueuedCountMD = NewGaugeMD(replicationLastMinuteQueuedCount, + "Number of objects queued for replication in the last full minute") + replicationMaxActiveWorkersMD = NewGaugeMD(replicationMaxActiveWorkers, + "Maximum number of active replication workers seen since server start") + replicationMaxQueuedBytesMD = NewGaugeMD(replicationMaxQueuedBytes, + "Maximum number of bytes queued for replication since server start") + replicationMaxQueuedCountMD = NewGaugeMD(replicationMaxQueuedCount, + "Maximum number of objects queued for replication since server start") + replicationMaxDataTransferRateMD = NewGaugeMD(replicationMaxDataTransferRate, + "Maximum replication data transfer rate in bytes/sec seen since server start") +) + +// loadClusterReplicationMetrics - `MetricsLoaderFn` for cluster replication metrics +// such as transfer rate and objects queued. 
+func loadClusterReplicationMetrics(ctx context.Context, m MetricValues, c *metricsCache) error { + if globalReplicationStats == nil { + return nil + } + + qs := globalReplicationStats.getNodeQueueStatsSummary() + + qt := qs.QStats + m.Set(replicationAverageQueuedBytes, float64(qt.Avg.Bytes)) + m.Set(replicationAverageQueuedCount, float64(qt.Avg.Count)) + m.Set(replicationMaxQueuedBytes, float64(qt.Max.Bytes)) + m.Set(replicationMaxQueuedCount, float64(qt.Max.Count)) + m.Set(replicationLastMinuteQueuedBytes, float64(qt.Curr.Bytes)) + m.Set(replicationLastMinuteQueuedCount, float64(qt.Curr.Count)) + + qa := qs.ActiveWorkers + m.Set(replicationAverageActiveWorkers, float64(qa.Avg)) + m.Set(replicationCurrentActiveWorkers, float64(qa.Curr)) + m.Set(replicationMaxActiveWorkers, float64(qa.Max)) + + if len(qs.XferStats) > 0 { + tots := qs.XferStats[Total] + m.Set(replicationAverageDataTransferRate, tots.Avg) + m.Set(replicationCurrentDataTransferRate, tots.Curr) + m.Set(replicationMaxDataTransferRate, tots.Peak) + } + + return nil +} diff --git a/cmd/metrics-v3-types.go b/cmd/metrics-v3-types.go index 4fd5e265b..07bc1d616 100644 --- a/cmd/metrics-v3-types.go +++ b/cmd/metrics-v3-types.go @@ -72,6 +72,8 @@ const ( GaugeMT // HistogramMT - represents a histogram metric. HistogramMT + // rangeL - represents a range label. + rangeL = "range" ) func (mt MetricType) String() string { @@ -225,7 +227,7 @@ func (m *MetricValues) Set(name MetricName, value float64, labels ...string) { } if len(labels)/2 != len(validLabels) { - panic(fmt.Sprintf("not all labels were given values")) + panic("not all labels were given values") } v, ok := m.values[name] @@ -284,6 +286,14 @@ func (m *MetricValues) SetHistogram(name MetricName, hist *prometheus.HistogramV } } +// SetHistogramValues - sets values for the given MetricName using the provided map of +// range to value. 
+func SetHistogramValues[V uint64 | int64 | float64](m MetricValues, name MetricName, values map[string]V, labels ...string) { + for rng, val := range values { + m.Set(name, float64(val), append(labels, rangeL, rng)...) + } +} + // MetricsLoaderFn - represents a function to load metrics from the // metricsCache. // diff --git a/cmd/metrics-v3.go b/cmd/metrics-v3.go index 79b432b9b..9dc36e4fc 100644 --- a/cmd/metrics-v3.go +++ b/cmd/metrics-v3.go @@ -35,7 +35,9 @@ import ( // for the bucket "mybucket" would be /minio/metrics/v3/bucket/api/mybucket const ( apiRequestsCollectorPath collectorPath = "/api/requests" - apiBucketCollectorPath collectorPath = "/bucket/api" + + bucketAPICollectorPath collectorPath = "/bucket/api" + bucketReplicationCollectorPath collectorPath = "/bucket/replication" systemNetworkInternodeCollectorPath collectorPath = "/system/network/internode" systemDriveCollectorPath collectorPath = "/system/drive" @@ -54,6 +56,7 @@ const ( auditCollectorPath collectorPath = "/audit" loggerWebhookCollectorPath collectorPath = "/logger/webhook" + replicationCollectorPath collectorPath = "/replication" ) const ( @@ -97,20 +100,45 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection { loadAPIRequestsNetworkMetrics), ) - apiBucketMG := NewBucketMetricsGroup(apiBucketCollectorPath, + bucketAPIMG := NewBucketMetricsGroup(bucketAPICollectorPath, []MetricDescriptor{ - apiBucketTrafficRecvBytesMD, - apiBucketTrafficSentBytesMD, + bucketAPITrafficRecvBytesMD, + bucketAPITrafficSentBytesMD, - apiBucketRequestsInFlightMD, - apiBucketRequestsTotalMD, - apiBucketRequestsCanceledMD, - apiBucketRequests4xxErrorsMD, - apiBucketRequests5xxErrorsMD, + bucketAPIRequestsInFlightMD, + bucketAPIRequestsTotalMD, + bucketAPIRequestsCanceledMD, + bucketAPIRequests4xxErrorsMD, + bucketAPIRequests5xxErrorsMD, - apiBucketRequestsTTFBSecondsDistributionMD, + bucketAPIRequestsTTFBSecondsDistributionMD, }, - JoinBucketLoaders(loadAPIBucketHTTPMetrics, 
loadAPIBucketTTFBMetrics), + JoinBucketLoaders(loadBucketAPIHTTPMetrics, loadBucketAPITTFBMetrics), + ) + + bucketReplicationMG := NewBucketMetricsGroup(bucketReplicationCollectorPath, + []MetricDescriptor{ + bucketReplLastHrFailedBytesMD, + bucketReplLastHrFailedCountMD, + bucketReplLastMinFailedBytesMD, + bucketReplLastMinFailedCountMD, + bucketReplLatencyMsMD, + bucketReplProxiedDeleteTaggingRequestsTotalMD, + bucketReplProxiedGetRequestsFailuresMD, + bucketReplProxiedGetRequestsTotalMD, + bucketReplProxiedGetTaggingRequestsFailuresMD, + bucketReplProxiedGetTaggingRequestsTotalMD, + bucketReplProxiedHeadRequestsFailuresMD, + bucketReplProxiedHeadRequestsTotalMD, + bucketReplProxiedPutTaggingRequestsFailuresMD, + bucketReplProxiedPutTaggingRequestsTotalMD, + bucketReplSentBytesMD, + bucketReplSentCountMD, + bucketReplTotalFailedBytesMD, + bucketReplTotalFailedCountMD, + bucketReplProxiedDeleteTaggingRequestsFailuresMD, + }, + loadBucketReplicationMetrics, ) systemNetworkInternodeMG := NewMetricsGroup(systemNetworkInternodeCollectorPath, @@ -296,6 +324,24 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection { loadClusterIAMMetrics, ) + clusterReplicationMG := NewMetricsGroup(replicationCollectorPath, + []MetricDescriptor{ + replicationAverageActiveWorkersMD, + replicationAverageQueuedBytesMD, + replicationAverageQueuedCountMD, + replicationAverageDataTransferRateMD, + replicationCurrentActiveWorkersMD, + replicationCurrentDataTransferRateMD, + replicationLastMinuteQueuedBytesMD, + replicationLastMinuteQueuedCountMD, + replicationMaxActiveWorkersMD, + replicationMaxQueuedBytesMD, + replicationMaxQueuedCountMD, + replicationMaxDataTransferRateMD, + }, + loadClusterReplicationMetrics, + ) + loggerWebhookMG := NewMetricsGroup(loggerWebhookCollectorPath, []MetricDescriptor{ webhookFailedMessagesMD, @@ -316,7 +362,8 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection { allMetricGroups := []*MetricsGroup{ apiRequestsMG, - apiBucketMG, + 
bucketAPIMG, + bucketReplicationMG, systemNetworkInternodeMG, systemDriveMG, @@ -330,6 +377,7 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection { clusterErasureSetMG, clusterNotificationMG, clusterIAMMG, + clusterReplicationMG, auditMG, loggerWebhookMG, diff --git a/docs/metrics/v3.md b/docs/metrics/v3.md index d09348d0c..ffbeea9d8 100644 --- a/docs/metrics/v3.md +++ b/docs/metrics/v3.md @@ -31,7 +31,7 @@ These are metrics about requests served by the (current) node. | Path | Description | |-----------------|--------------------------------------------------| | `/api/requests` | Metrics over all requests | -| `/api/bucket` | Metrics over all requests split by bucket labels | +| `/bucket/api` | Metrics over all requests for a given bucket | | | | ### Audit metrics @@ -122,6 +122,30 @@ The standard metrics group for GoCollector is not shown below. | `minio_bucket_api_5xx_errors_total` | `counter` | Total number of requests with 5xx errors for a bucket | `bucket,name,type,server,pool_index` | | `minio_bucket_api_ttfb_seconds_distribution` | `counter` | Distribution of time to first byte across API calls for a bucket | `bucket,name,le,type,server,pool_index` | +### `/bucket/replication` + +| Name | Type | Help | Labels | +|---------------------------------------------------------------------|-----------|---------------------------------------------------------------------------------------------|-------------------------------------------| +| `minio_bucket_replication_last_hour_failed_bytes` | `gauge` | Total number of bytes failed at least once to replicate in the last hour on a bucket | `bucket,server` | +| `minio_bucket_replication_last_hour_failed_count` | `gauge` | Total number of objects which failed replication in the last hour on a bucket | `bucket,server` | +| `minio_bucket_replication_last_minute_failed_bytes` | `gauge` | Total number of bytes failed at least once to replicate in the last full minute on a bucket | `bucket,server` | +| 
`minio_bucket_replication_last_minute_failed_count` | `gauge` | Total number of objects which failed replication in the last full minute on a bucket | `bucket,server` | +| `minio_bucket_replication_latency_ms` | `gauge` | Replication latency on a bucket in milliseconds | `bucket,operation,range,targetArn,server` | +| `minio_bucket_replication_proxied_delete_tagging_requests_total` | `counter` | Number of DELETE tagging requests proxied to replication target | `bucket,server` | +| `minio_bucket_replication_proxied_get_requests_failures` | `counter` | Number of failures in GET requests proxied to replication target | `bucket,server` | +| `minio_bucket_replication_proxied_get_requests_total` | `counter` | Number of GET requests proxied to replication target | `bucket,server` | +| `minio_bucket_replication_proxied_get_tagging_requests_failures` | `counter` | Number of failures in GET tagging requests proxied to replication target | `bucket,server` | +| `minio_bucket_replication_proxied_get_tagging_requests_total` | `counter` | Number of GET tagging requests proxied to replication target | `bucket,server` | +| `minio_bucket_replication_proxied_head_requests_failures` | `counter` | Number of failures in HEAD requests proxied to replication target | `bucket,server` | +| `minio_bucket_replication_proxied_head_requests_total` | `counter` | Number of HEAD requests proxied to replication target | `bucket,server` | +| `minio_bucket_replication_proxied_put_tagging_requests_failures` | `counter` | Number of failures in PUT tagging requests proxied to replication target | `bucket,server` | +| `minio_bucket_replication_proxied_put_tagging_requests_total` | `counter` | Number of PUT tagging requests proxied to replication target | `bucket,server` | +| `minio_bucket_replication_sent_bytes` | `counter` | Total number of bytes replicated to the target | `bucket,server` | +| `minio_bucket_replication_sent_count` | `counter` | Total number of objects replicated to the target | 
`bucket,server` | +| `minio_bucket_replication_total_failed_bytes` | `counter` | Total number of bytes failed at least once to replicate since server start | `bucket,server` | +| `minio_bucket_replication_total_failed_count` | `counter` | Total number of objects which failed replication since server start | `bucket,server` | +| `minio_bucket_replication_proxied_delete_tagging_requests_failures` | `counter` | Number of failures in DELETE tagging requests proxied to replication target | `bucket,server` | + ### `/audit` | Name | Type | Help | Labels | @@ -195,25 +219,25 @@ The standard metrics group for GoCollector is not shown below. ### `/system/process` -| Name | Type | Help | Labels | -|-------------------------------|-----------|----------------------------------------------------------------------------------------------------------------|----------| -| `locks_read_total` | `gauge` | Number of current READ locks on this peer | `server` | -| `locks_write_total` | `gauge` | Number of current WRITE locks on this peer | `server` | -| `cpu_total_seconds` | `counter` | Total user and system CPU time spent in seconds | `server` | -| `go_routine_total` | `gauge` | Total number of go routines running | `server` | -| `io_rchar_bytes` | `counter` | Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar | `server` | -| `io_read_bytes` | `counter` | Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes | `server` | -| `io_wchar_bytes` | `counter` | Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar | `server` | -| `io_write_bytes` | `counter` | Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes | `server` | -| `start_time_seconds` | `gauge` | Start time for MinIO process in seconds since Unix epoc | `server` | -| `uptime_seconds` | `gauge` | Uptime for MinIO process in seconds | 
`server` | -| `file_descriptor_limit_total` | `gauge` | Limit on total number of open file descriptors for the MinIO Server process | `server` | -| `file_descriptor_open_total` | `gauge` | Total number of open file descriptors by the MinIO Server process | `server` | -| `syscall_read_total` | `counter` | Total read SysCalls to the kernel. /proc/[pid]/io syscr | `server` | -| `syscall_write_total` | `counter` | Total write SysCalls to the kernel. /proc/[pid]/io syscw | `server` | -| `resident_memory_bytes` | `gauge` | Resident memory size in bytes | `server` | -| `virtual_memory_bytes` | `gauge` | Virtual memory size in bytes | `server` | -| `virtual_memory_max_bytes` | `gauge` | Maximum virtual memory size in bytes | `server` | +| Name | Type | Help | Labels | +|----------------------------------------------------|-----------|----------------------------------------------------------------------------------------------------------------|----------| +| `minio_system_process_locks_read_total` | `gauge` | Number of current READ locks on this peer | `server` | +| `minio_system_process_locks_write_total` | `gauge` | Number of current WRITE locks on this peer | `server` | +| `minio_system_process_cpu_total_seconds` | `counter` | Total user and system CPU time spent in seconds | `server` | +| `minio_system_process_go_routine_total` | `gauge` | Total number of go routines running | `server` | +| `minio_system_process_io_rchar_bytes` | `counter` | Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar | `server` | +| `minio_system_process_io_read_bytes` | `counter` | Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes | `server` | +| `minio_system_process_io_wchar_bytes` | `counter` | Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar | `server` | +| `minio_system_process_io_write_bytes` | `counter` | Total bytes written by 
the process to the underlying storage system, /proc/[pid]/io write_bytes | `server` | +| `minio_system_process_start_time_seconds` | `gauge` | Start time for MinIO process in seconds since Unix epoch | `server` | +| `minio_system_process_uptime_seconds` | `gauge` | Uptime for MinIO process in seconds | `server` | +| `minio_system_process_file_descriptor_limit_total` | `gauge` | Limit on total number of open file descriptors for the MinIO Server process | `server` | +| `minio_system_process_file_descriptor_open_total` | `gauge` | Total number of open file descriptors by the MinIO Server process | `server` | +| `minio_system_process_syscall_read_total` | `counter` | Total read SysCalls to the kernel. /proc/[pid]/io syscr | `server` | +| `minio_system_process_syscall_write_total` | `counter` | Total write SysCalls to the kernel. /proc/[pid]/io syscw | `server` | +| `minio_system_process_resident_memory_bytes` | `gauge` | Resident memory size in bytes | `server` | +| `minio_system_process_virtual_memory_bytes` | `gauge` | Virtual memory size in bytes | `server` | +| `minio_system_process_virtual_memory_max_bytes` | `gauge` | Maximum virtual memory size in bytes | `server` | ### `/cluster/health` @@ -302,3 +326,20 @@ The standard metrics group for GoCollector is not shown below. 
| `minio_logger_webhook_failed_messages` | `counter` | Number of messages that failed to send | `server,name,endpoint` | | `minio_logger_webhook_queue_length` | `gauge` | Webhook queue length | `server,name,endpoint` | | `minio_logger_webhook_total_message` | `counter` | Total number of messages sent to this target | `server,name,endpoint` | + +### `/replication` + +| Name | Type | Help | Labels | +|---------------------------------------------------|---------|-----------------------------------------------------------------------------|----------| +| `minio_replication_average_active_workers` | `gauge` | Average number of active replication workers | `server` | +| `minio_replication_average_queued_bytes` | `gauge` | Average number of bytes queued for replication since server start | `server` | +| `minio_replication_average_queued_count` | `gauge` | Average number of objects queued for replication since server start | `server` | +| `minio_replication_average_data_transfer_rate` | `gauge` | Average replication data transfer rate in bytes/sec | `server` | +| `minio_replication_current_active_workers` | `gauge` | Total number of active replication workers | `server` | +| `minio_replication_current_data_transfer_rate` | `gauge` | Current replication data transfer rate in bytes/sec | `server` | +| `minio_replication_last_minute_queued_bytes` | `gauge` | Number of bytes queued for replication in the last full minute | `server` | +| `minio_replication_last_minute_queued_count` | `gauge` | Number of objects queued for replication in the last full minute | `server` | +| `minio_replication_max_active_workers` | `gauge` | Maximum number of active replication workers seen since server start | `server` | +| `minio_replication_max_queued_bytes` | `gauge` | Maximum number of bytes queued for replication since server start | `server` | +| `minio_replication_max_queued_count` | `gauge` | Maximum number of objects queued for replication since server start | `server` | +| 
`minio_replication_max_data_transfer_rate` | `gauge` | Maximum replication data transfer rate in bytes/sec seen since server start | `server` |