From 135874ebdc07ec84b616c76ec0455e6888b520ee Mon Sep 17 00:00:00 2001
From: Anis Eleuch
Date: Fri, 26 Apr 2024 07:32:14 +0100
Subject: [PATCH] heal: Avoid marking a bucket as done when remote drives are
 offline (#19587)

---
 cmd/erasure-healing.go     |  4 ++--
 cmd/erasure-server-pool.go |  2 +-
 cmd/erasure.go             | 12 ++++++------
 cmd/global-heal.go         | 28 +++++++++++++++-------------
 4 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/cmd/erasure-healing.go b/cmd/erasure-healing.go
index 381b970c3..037245261 100644
--- a/cmd/erasure-healing.go
+++ b/cmd/erasure-healing.go
@@ -44,8 +44,8 @@ const (
 	healingMetricCheckAbandonedParts
 )
 
-func (er erasureObjects) listAndHeal(bucket, prefix string, scanMode madmin.HealScanMode, healEntry func(string, metaCacheEntry, madmin.HealScanMode) error) error {
-	ctx, cancel := context.WithCancel(context.Background())
+func (er erasureObjects) listAndHeal(ctx context.Context, bucket, prefix string, scanMode madmin.HealScanMode, healEntry func(string, metaCacheEntry, madmin.HealScanMode) error) error {
+	ctx, cancel := context.WithCancel(ctx)
 	defer cancel()
 
 	disks, _ := er.getOnlineDisksWithHealing(false)
diff --git a/cmd/erasure-server-pool.go b/cmd/erasure-server-pool.go
index 8a306659c..598149bd5 100644
--- a/cmd/erasure-server-pool.go
+++ b/cmd/erasure-server-pool.go
@@ -2270,7 +2270,7 @@ func (z *erasureServerPools) HealObjects(ctx context.Context, bucket, prefix str
 		go func(idx int, set *erasureObjects) {
 			defer wg.Done()
 
-			errs[idx] = set.listAndHeal(bucket, prefix, opts.ScanMode, healEntry)
+			errs[idx] = set.listAndHeal(ctx, bucket, prefix, opts.ScanMode, healEntry)
 		}(idx, set)
 	}
 	wg.Wait()
diff --git a/cmd/erasure.go b/cmd/erasure.go
index 34971ab42..72e3a2e0c 100644
--- a/cmd/erasure.go
+++ b/cmd/erasure.go
@@ -272,11 +272,11 @@ func (er erasureObjects) LocalStorageInfo(ctx context.Context, metrics bool) Sto
 }
 
 // getOnlineDisksWithHealingAndInfo - returns online disks and overall healing status.
-// Disks are randomly ordered, but in the following groups:
+// Disks are ordered in the following groups:
 // - Non-scanning disks
 // - Non-healing disks
 // - Healing disks (if inclHealing is true)
-func (er erasureObjects) getOnlineDisksWithHealingAndInfo(inclHealing bool) (newDisks []StorageAPI, newInfos []DiskInfo, healing bool) {
+func (er erasureObjects) getOnlineDisksWithHealingAndInfo(inclHealing bool) (newDisks []StorageAPI, newInfos []DiskInfo, healing int) {
 	var wg sync.WaitGroup
 	disks := er.getDisks()
 	infos := make([]DiskInfo, len(disks))
@@ -315,7 +315,7 @@
 			continue
 		}
 		if info.Healing {
-			healing = true
+			healing++
 			if inclHealing {
 				healingDisks = append(healingDisks, disks[i])
 				healingInfos = append(healingInfos, infos[i])
@@ -343,9 +343,9 @@
 	return newDisks, newInfos, healing
 }
 
-func (er erasureObjects) getOnlineDisksWithHealing(inclHealing bool) (newDisks []StorageAPI, healing bool) {
-	newDisks, _, healing = er.getOnlineDisksWithHealingAndInfo(inclHealing)
-	return
+func (er erasureObjects) getOnlineDisksWithHealing(inclHealing bool) ([]StorageAPI, bool) {
+	newDisks, _, healing := er.getOnlineDisksWithHealingAndInfo(inclHealing)
+	return newDisks, healing > 0
 }
 
 // Clean-up previously deleted objects. from .minio.sys/tmp/.trash/
diff --git a/cmd/global-heal.go b/cmd/global-heal.go
index f27b0f27a..a95eb0e1f 100644
--- a/cmd/global-heal.go
+++ b/cmd/global-heal.go
@@ -259,12 +259,17 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 				bucket, humanize.Ordinal(er.setIndex+1))
 		}
 
-		disks, _ := er.getOnlineDisksWithHealing(false)
-		if len(disks) == 0 {
-			// No object healing necessary
-			tracker.bucketDone(bucket)
-			healingLogIf(ctx, tracker.update(ctx))
-			continue
+		disks, _, healing := er.getOnlineDisksWithHealingAndInfo(true)
+		if len(disks) == healing {
+			// All drives in this erasure set were reformatted for some reasons, abort healing and mark it as successful
+			healingLogIf(ctx, errors.New("all drives are in healing state, aborting.."))
+			return nil
+		}
+
+		disks = disks[:len(disks)-healing] // healing drives are always at the end of the list
+
+		if len(disks) < er.setDriveCount/2 {
+			return fmt.Errorf("not enough drives (found=%d, healing=%d, total=%d) are available to heal `%s`", len(disks), healing, er.setDriveCount, tracker.disk.String())
 		}
 
 		rand.Shuffle(len(disks), func(i, j int) {
@@ -465,27 +470,24 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 			waitForLowHTTPReq()
 		}
 
-		actualBucket, prefix := path2BucketObject(bucket)
-
 		// How to resolve partial results.
 		resolver := metadataResolutionParams{
 			dirQuorum: 1,
 			objQuorum: 1,
-			bucket:    actualBucket,
+			bucket:    bucket,
 		}
 
 		err = listPathRaw(ctx, listPathRawOptions{
 			disks:          disks,
 			fallbackDisks:  fallbackDisks,
-			bucket:         actualBucket,
-			path:           prefix,
+			bucket:         bucket,
 			recursive:      true,
 			forwardTo:      forwardTo,
 			minDisks:       1,
 			reportNotFound: false,
 			agreed: func(entry metaCacheEntry) {
 				jt.Take()
-				go healEntry(actualBucket, entry)
+				go healEntry(bucket, entry)
 			},
 			partial: func(entries metaCacheEntries, _ []error) {
 				entry, ok := entries.resolve(&resolver)
@@ -495,7 +497,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 					entry, _ = entries.firstFound()
 				}
 				jt.Take()
-				go healEntry(actualBucket, *entry)
+				go healEntry(bucket, *entry)
 			},
 			finished: nil,
 		})
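
Reviewer note (not part of the applied patch): below is a minimal, standalone Go sketch of the availability check this change adds to healErasureSet, useful for reasoning about the edge cases. The names diskState and usableDisks are hypothetical and exist only for illustration; the real code operates on StorageAPI disks returned by getOnlineDisksWithHealingAndInfo.

package main

import (
	"errors"
	"fmt"
)

// diskState is a simplified, hypothetical stand-in for what
// getOnlineDisksWithHealingAndInfo returns after this patch: the online
// disks ordered with healing disks grouped at the end, plus the healing count.
type diskState struct {
	online  []string // online disk endpoints, healing ones last
	healing int      // how many of the online disks are still healing
}

// usableDisks mirrors the checks added to healErasureSet: abort when every
// online drive is healing, trim the healing drives from the tail of the
// slice, and require at least half of the erasure set before healing a bucket.
func usableDisks(s diskState, setDriveCount int) ([]string, error) {
	if len(s.online) == s.healing {
		return nil, errors.New("all drives are in healing state, aborting")
	}
	disks := s.online[:len(s.online)-s.healing] // healing drives are always at the end
	if len(disks) < setDriveCount/2 {
		return nil, fmt.Errorf("not enough drives (found=%d, healing=%d, total=%d) are available to heal",
			len(disks), s.healing, setDriveCount)
	}
	return disks, nil
}

func main() {
	// Example: an 8-drive erasure set with 4 remote drives offline and one
	// of the remaining online drives still healing. Only 3 usable drives
	// remain, which is below setDriveCount/2, so healing of the bucket is
	// deferred instead of the bucket being marked as done.
	s := diskState{
		online:  []string{"d1", "d2", "d3", "d4-healing"},
		healing: 1,
	}
	disks, err := usableDisks(s, 8)
	fmt.Println(disks, err)
}

The trim works because getOnlineDisksWithHealingAndInfo now guarantees that healing disks come last in the returned slice, so dropping the final healing entries leaves only fully online, non-healing drives; the setDriveCount/2 check then defers bucket healing instead of marking the bucket as done when too many remote drives are offline.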