fix replication last hour metric (#20199)

Also add the missing recent_backlog_count metric to the v3 metrics.
This commit is contained in:
Poorna 2024-08-01 17:55:27 -07:00 committed by GitHub
parent 50a5ad48fc
commit 74c047cb03
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 14 additions and 4 deletions

View file

@ -20,6 +20,7 @@ package cmd
import (
"fmt"
"math"
"sync/atomic"
"time"
"github.com/minio/madmin-go/v3"
@ -127,8 +128,7 @@ func (l *ReplicationLastHour) getTotal() AccElem {
// forwardTo time t, clearing any entries in between.
func (l *ReplicationLastHour) forwardTo(t int64) {
tMin := t / 60
if l.LastMin >= tMin {
if l.LastMin >= t {
return
}
if t-l.LastMin >= 60 {
@ -314,6 +314,9 @@ func (r *ReplicationStats) getNodeQueueStats(bucket string) (qs ReplQNodeStats)
qs.XferStats = make(map[RMetricName]XferStats)
qs.QStats = r.qCache.getBucketStats(bucket)
qs.TgtXferStats = make(map[string]map[RMetricName]XferStats)
qs.MRFStats = ReplicationMRFStats{
LastFailedCount: atomic.LoadUint64(&r.mrfStats.LastFailedCount),
}
r.RLock()
defer r.RUnlock()
@ -402,7 +405,9 @@ func (r *ReplicationStats) getNodeQueueStatsSummary() (qs ReplQNodeStats) {
qs.ActiveWorkers = globalReplicationStats.ActiveWorkers()
qs.XferStats = make(map[RMetricName]XferStats)
qs.QStats = r.qCache.getSiteStats()
qs.MRFStats = ReplicationMRFStats{
LastFailedCount: atomic.LoadUint64(&r.mrfStats.LastFailedCount),
}
r.RLock()
defer r.RUnlock()
tx := newXferStats()

View file

@ -34,6 +34,7 @@ const (
replicationMaxQueuedBytes = "max_queued_bytes"
replicationMaxQueuedCount = "max_queued_count"
replicationMaxDataTransferRate = "max_data_transfer_rate"
replicationRecentBacklogCount = "recent_backlog_count"
)
var (
@ -61,6 +62,8 @@ var (
"Maximum number of objects queued for replication since server start")
replicationMaxDataTransferRateMD = NewGaugeMD(replicationMaxDataTransferRate,
"Maximum replication data transfer rate in bytes/sec seen since server start")
replicationRecentBacklogCountMD = NewGaugeMD(replicationRecentBacklogCount,
"Total number of objects seen in replication backlog in the last 5 minutes")
)
// loadClusterReplicationMetrics - `MetricsLoaderFn` for cluster replication metrics
@ -91,6 +94,7 @@ func loadClusterReplicationMetrics(ctx context.Context, m MetricValues, c *metri
m.Set(replicationCurrentDataTransferRate, tots.Curr)
m.Set(replicationMaxDataTransferRate, tots.Peak)
}
m.Set(replicationRecentBacklogCount, float64(qs.MRFStats.LastFailedCount))
return nil
}

View file

@ -341,6 +341,7 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
replicationMaxQueuedBytesMD,
replicationMaxQueuedCountMD,
replicationMaxDataTransferRateMD,
replicationRecentBacklogCountMD,
},
loadClusterReplicationMetrics,
)

View file

@ -275,7 +275,7 @@ Metrics about MinIO site and bucket replication.
| `minio_replication_max_queued_bytes` | Maximum number of bytes queued for replication since server start. <br><br>Type: gauge | `server` |
| `minio_replication_max_queued_count` | Maximum number of objects queued for replication since server start. <br><br>Type: gauge | `server` |
| `minio_replication_max_data_transfer_rate` | Maximum replication data transfer rate in bytes/sec since server start. <br><br>Type: gauge | `server` |
| `minio_replication_recent_backlog_count` | Total number of objects seen in replication backlog in the last 5 minutes. <br><br>Type: gauge | `server` |
#### `/bucket/replication`
| Name | Description | Labels |