From 9a012a53efbaca385026179c43cce8db9a5bbff8 Mon Sep 17 00:00:00 2001
From: Harshavardhana
Date: Tue, 27 Feb 2024 23:02:14 -0800
Subject: [PATCH] initialize the disk healer early on (#19143)

This PR fixes a bug that has likely been present for a long time, with
no visible workaround: in any deployment, if an entire erasure set is
deleted, there is no way for the cluster to recover.
---
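In short: the healing and expiry globals are now initialized at package level
(at declaration time), and the disk auto-healer, MRF healer, and background
expiry workers are started inside newErasureServerPools() instead of later in
serverMain(), so they are already running by the time the object layer is
published via setObjectLayer(). The following is a minimal, self-contained
sketch of that ordering pattern; every name in it is hypothetical and for
illustration only, not an actual MinIO API.

package main

import (
	"context"
	"fmt"
	"time"
)

type healOp struct{ bucket, object string }

// Created at declaration time (like globalMRFState/globalExpiryState in this
// patch), so producers can enqueue work before any consumer has been started.
var healQueue = make(chan healOp, 128)

// Package-level cancellable context, in the spirit of GlobalContext.
var globalCtx, cancelGlobal = context.WithCancel(context.Background())

// addPartialOp can be called very early and never blocks; if the queue is
// full the operation is dropped, mirroring best-effort MRF behaviour.
func addPartialOp(op healOp) {
	select {
	case healQueue <- op:
	default:
	}
}

// healRoutine is started only once the storage pools exist and receives them
// explicitly, instead of capturing them during a separate init step.
func healRoutine(ctx context.Context, pools string) {
	for {
		select {
		case <-ctx.Done():
			return
		case op := <-healQueue:
			fmt.Printf("healing %s/%s using %s\n", op.bucket, op.object, pools)
		}
	}
}

func main() {
	// Work can be queued before the pools are ready...
	addPartialOp(healOp{bucket: "bucket", object: "object"})

	// ...and the consumer is started later, once the pools are constructed.
	go healRoutine(globalCtx, "erasure-pools")

	time.Sleep(100 * time.Millisecond)
	cancelGlobal()
}

The design choice this illustrates: a buffered queue created at declaration
time lets early callers enqueue work safely, while the consumer goroutine is
started only once its dependencies (here, the erasure pools) exist.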
 buildscripts/verify-healing.sh      |  3 ++-
 cmd/background-heal-ops.go          |  1 -
 cmd/background-newdisks-heal-ops.go | 22 ++-----------------
 cmd/bucket-lifecycle.go             |  5 ++---
 cmd/common-main.go                  |  2 --
 cmd/data-scanner.go                 |  8 +++----
 cmd/data-scanner_test.go            |  8 +++----
 cmd/erasure-server-pool.go          | 23 ++++++++++++++++++-
 cmd/global-heal.go                  |  8 +++----
 cmd/globals.go                      | 10 +++++----
 cmd/mrf.go                          | 34 +++--------------------------
 cmd/server-main.go                  | 30 -------------------------
 cmd/service.go                      | 11 ++--------
 cmd/storage-rest-server.go          | 14 +++++++-----
 cmd/xl-storage.go                   |  2 +-
 15 files changed, 59 insertions(+), 122 deletions(-)

diff --git a/buildscripts/verify-healing.sh b/buildscripts/verify-healing.sh
index 6ede11d99..4a7be9e48 100755
--- a/buildscripts/verify-healing.sh
+++ b/buildscripts/verify-healing.sh
@@ -83,7 +83,7 @@ function start_minio_3_node() {
 }
 
 function check_online() {
-	if grep -q 'Unable to initialize sub-systems' ${WORK_DIR}/dist-minio-*.log; then
+	if ! grep -q 'Status:' ${WORK_DIR}/dist-minio-*.log; then
 		echo "1"
 	fi
 }
@@ -109,6 +109,7 @@ function perform_test() {
 
 	rm -rf ${WORK_DIR}/${1}/*/
 
+	set -x
 	start_minio_3_node 120 $2
 
 	rv=$(check_online)
diff --git a/cmd/background-heal-ops.go b/cmd/background-heal-ops.go
index 5568d0b5d..f4affaed9 100644
--- a/cmd/background-heal-ops.go
+++ b/cmd/background-heal-ops.go
@@ -102,7 +102,6 @@ func waitForLowHTTPReq() {
 
 func initBackgroundHealing(ctx context.Context, objAPI ObjectLayer) {
 	// Run the background healer
-	globalBackgroundHealRoutine = newHealRoutine()
 	for i := 0; i < globalBackgroundHealRoutine.workers; i++ {
 		go globalBackgroundHealRoutine.AddWorker(ctx, objAPI)
 	}
diff --git a/cmd/background-newdisks-heal-ops.go b/cmd/background-newdisks-heal-ops.go
index fba8c1799..a12641e90 100644
--- a/cmd/background-newdisks-heal-ops.go
+++ b/cmd/background-newdisks-heal-ops.go
@@ -376,26 +376,8 @@ func getLocalDisksToHeal() (disksToHeal Endpoints) {
 var newDiskHealingTimeout = newDynamicTimeout(30*time.Second, 10*time.Second)
 
 func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint) error {
-	disk, format, _, err := connectEndpoint(endpoint)
-	if err != nil {
-		return fmt.Errorf("Error: %w, %s", err, endpoint)
-	}
-	defer disk.Close()
-	poolIdx := globalEndpoints.GetLocalPoolIdx(disk.Endpoint())
-	if poolIdx < 0 {
-		return fmt.Errorf("unexpected pool index (%d) found for %s", poolIdx, disk.Endpoint())
-	}
-
-	// Calculate the set index where the current endpoint belongs
-	z.serverPools[poolIdx].erasureDisksMu.RLock()
-	setIdx, _, err := findDiskIndex(z.serverPools[poolIdx].format, format)
-	z.serverPools[poolIdx].erasureDisksMu.RUnlock()
-	if err != nil {
-		return err
-	}
-	if setIdx < 0 {
-		return fmt.Errorf("unexpected set index (%d) found for %s", setIdx, disk.Endpoint())
-	}
+	poolIdx, setIdx := endpoint.PoolIdx, endpoint.SetIdx
+	disk := getStorageViaEndpoint(endpoint)
 
 	// Prevent parallel erasure set healing
 	locker := z.NewNSLock(minioMetaBucket, fmt.Sprintf("new-drive-healing/%d/%d", poolIdx, setIdx))
diff --git a/cmd/bucket-lifecycle.go b/cmd/bucket-lifecycle.go
index f5270d380..2f4a50258 100644
--- a/cmd/bucket-lifecycle.go
+++ b/cmd/bucket-lifecycle.go
@@ -145,7 +145,7 @@ func (es *expiryState) enqueueByNewerNoncurrent(bucket string, versions []Object
 	}
 }
 
-var globalExpiryState *expiryState
+var globalExpiryState = newExpiryState()
 
 func newExpiryState() *expiryState {
 	return &expiryState{
@@ -155,8 +155,6 @@ func newExpiryState() *expiryState {
 }
 
 func initBackgroundExpiry(ctx context.Context, objectAPI ObjectLayer) {
-	globalExpiryState = newExpiryState()
-
 	workerSize, _ := strconv.Atoi(env.Get("_MINIO_ILM_EXPIRY_WORKERS", strconv.Itoa((runtime.GOMAXPROCS(0)+1)/2)))
 	if workerSize == 0 {
 		workerSize = 4
@@ -185,6 +183,7 @@ func initBackgroundExpiry(ctx context.Context, objectAPI ObjectLayer) {
 		}
 		ewk.Wait()
 	}()
+
 	go func() {
 		for t := range globalExpiryState.byNewerNoncurrentCh {
 			nwk.Take()
diff --git a/cmd/common-main.go b/cmd/common-main.go
index 32abee71b..7cda7b023 100644
--- a/cmd/common-main.go
+++ b/cmd/common-main.go
@@ -88,8 +88,6 @@ func init() {
 	logger.Init(GOPATH, GOROOT)
 	logger.RegisterError(config.FmtError)
 
-	initGlobalContext()
-
 	globalBatchJobsMetrics = batchJobMetrics{metrics: make(map[string]*batchJobInfo)}
 	go globalBatchJobsMetrics.purgeJobMetrics()
 
diff --git a/cmd/data-scanner.go b/cmd/data-scanner.go
index 86ed7e883..8ea6613e0 100644
--- a/cmd/data-scanner.go
+++ b/cmd/data-scanner.go
@@ -1020,7 +1020,7 @@ func (i *scannerItem) applyTierObjSweep(ctx context.Context, o ObjectLayer, oi O
 }
 
 // applyNewerNoncurrentVersionLimit removes noncurrent versions older than the most recent NewerNoncurrentVersions configured.
 // Note: This function doesn't update sizeSummary since it always removes versions that it doesn't return.
-func (i *scannerItem) applyNewerNoncurrentVersionLimit(ctx context.Context, _ ObjectLayer, fivs []FileInfo) ([]ObjectInfo, error) {
+func (i *scannerItem) applyNewerNoncurrentVersionLimit(ctx context.Context, _ ObjectLayer, fivs []FileInfo, expState *expiryState) ([]ObjectInfo, error) {
 	done := globalScannerMetrics.time(scannerMetricApplyNonCurrent)
 	defer done()
 
@@ -1087,14 +1087,14 @@ func (i *scannerItem) applyNewerNoncurrentVersionLimit(ctx context.Context, _ Ob
 		})
 	}
 
-	globalExpiryState.enqueueByNewerNoncurrent(i.bucket, toDel, event)
+	expState.enqueueByNewerNoncurrent(i.bucket, toDel, event)
 	return objectInfos, nil
 }
 
 // applyVersionActions will apply lifecycle checks on all versions of a scanned item. Returns versions that remain
 // after applying lifecycle checks configured.
-func (i *scannerItem) applyVersionActions(ctx context.Context, o ObjectLayer, fivs []FileInfo) ([]ObjectInfo, error) {
-	objInfos, err := i.applyNewerNoncurrentVersionLimit(ctx, o, fivs)
+func (i *scannerItem) applyVersionActions(ctx context.Context, o ObjectLayer, fivs []FileInfo, expState *expiryState) ([]ObjectInfo, error) {
+	objInfos, err := i.applyNewerNoncurrentVersionLimit(ctx, o, fivs, expState)
 	if err != nil {
 		return nil, err
 	}
diff --git a/cmd/data-scanner_test.go b/cmd/data-scanner_test.go
index 92b444313..464feb805 100644
--- a/cmd/data-scanner_test.go
+++ b/cmd/data-scanner_test.go
@@ -39,13 +39,13 @@ func TestApplyNewerNoncurrentVersionsLimit(t *testing.T) {
 	globalBucketMetadataSys = NewBucketMetadataSys()
 	globalBucketObjectLockSys = &BucketObjectLockSys{}
 	globalBucketVersioningSys = &BucketVersioningSys{}
-	globalExpiryState = newExpiryState()
+	expiryState := newExpiryState()
 	var wg sync.WaitGroup
 	wg.Add(1)
 	expired := make([]ObjectToDelete, 0, 5)
 	go func() {
 		defer wg.Done()
-		for t := range globalExpiryState.byNewerNoncurrentCh {
+		for t := range expiryState.byNewerNoncurrentCh {
 			expired = append(expired, t.versions...)
 		}
 	}()
@@ -116,7 +116,7 @@ func TestApplyNewerNoncurrentVersionsLimit(t *testing.T) {
 	for i, fi := range fivs[:2] {
 		wants[i] = fi.ToObjectInfo(bucket, obj, versioned)
 	}
-	gots, err := item.applyNewerNoncurrentVersionLimit(context.TODO(), objAPI, fivs)
+	gots, err := item.applyNewerNoncurrentVersionLimit(context.TODO(), objAPI, fivs, expiryState)
 	if err != nil {
 		t.Fatalf("Failed with err: %v", err)
 	}
@@ -125,7 +125,7 @@ func TestApplyNewerNoncurrentVersionsLimit(t *testing.T) {
 	}
 
 	// Close expiry state's channel to inspect object versions enqueued for expiration
-	close(globalExpiryState.byNewerNoncurrentCh)
+	close(expiryState.byNewerNoncurrentCh)
 	wg.Wait()
 	for _, obj := range expired {
 		switch obj.ObjectV.VersionID {
diff --git a/cmd/erasure-server-pool.go b/cmd/erasure-server-pool.go
index 0e39b7854..2884a9e45 100644
--- a/cmd/erasure-server-pool.go
+++ b/cmd/erasure-server-pool.go
@@ -175,8 +175,29 @@ func newErasureServerPools(ctx context.Context, endpointServerPools EndpointServ
 	z.poolMeta = newPoolMeta(z, poolMeta{})
 	z.poolMeta.dontSave = true
 
+	bootstrapTrace("newSharedLock", func() {
+		globalLeaderLock = newSharedLock(GlobalContext, z, "leader.lock")
+	})
+
+	// Enable background operations on
+	//
+	// - Disk auto healing
+	// - MRF (most recently failed) healing
+	// - Background expiration routine for lifecycle policies
+	bootstrapTrace("initAutoHeal", func() {
+		initAutoHeal(GlobalContext, z)
+	})
+
+	bootstrapTrace("initHealMRF", func() {
+		go globalMRFState.healRoutine(z)
+	})
+
+	bootstrapTrace("initBackgroundExpiry", func() {
+		initBackgroundExpiry(GlobalContext, z)
+	})
+
 	// initialize the object layer.
-	setObjectLayer(z)
+	defer setObjectLayer(z)
 
 	r := rand.New(rand.NewSource(time.Now().UnixNano()))
 	attempt := 1
diff --git a/cmd/global-heal.go b/cmd/global-heal.go
index 47e7ab2e5..7d311244f 100644
--- a/cmd/global-heal.go
+++ b/cmd/global-heal.go
@@ -224,7 +224,9 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 
 		disks, _ := er.getOnlineDisksWithHealing(false)
 		if len(disks) == 0 {
-			logger.LogIf(ctx, fmt.Errorf("no online disks found to heal the bucket `%s`", bucket))
+			// No object healing necessary
+			tracker.bucketDone(bucket)
+			logger.LogIf(ctx, tracker.update(ctx))
 			continue
 		}
 
@@ -472,9 +474,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 
 func healBucket(bucket string, scan madmin.HealScanMode) error {
 	// Get background heal sequence to send elements to heal
-	globalHealStateLK.Lock()
 	bgSeq, ok := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
-	globalHealStateLK.Unlock()
 	if ok {
 		return bgSeq.queueHealTask(healSource{bucket: bucket}, madmin.HealItemBucket)
 	}
@@ -484,9 +484,7 @@ func healBucket(bucket string, scan madmin.HealScanMode) error {
 // healObject sends the given object/version to the background healing workers
 func healObject(bucket, object, versionID string, scan madmin.HealScanMode) error {
 	// Get background heal sequence to send elements to heal
-	globalHealStateLK.Lock()
 	bgSeq, ok := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
-	globalHealStateLK.Unlock()
 	if ok {
 		return bgSeq.queueHealTask(healSource{
 			bucket: bucket,
diff --git a/cmd/globals.go b/cmd/globals.go
index 24734e7e6..bcd1e6b57 100644
--- a/cmd/globals.go
+++ b/cmd/globals.go
@@ -379,13 +379,15 @@ var (
 		return *ptr
 	}
 
-	globalAllHealState *allHealState
+	globalAllHealState = newHealState(GlobalContext, true)
 
 	// The always present healing routine ready to heal objects
-	globalBackgroundHealRoutine *healRoutine
-	globalBackgroundHealState *allHealState
+	globalBackgroundHealRoutine = newHealRoutine()
+	globalBackgroundHealState = newHealState(GlobalContext, false)
 
-	globalMRFState mrfState
+	globalMRFState = mrfState{
+		opCh: make(chan partialOperation, mrfOpsQueueSize),
+	}
 
 	// If writes to FS backend should be O_SYNC.
 	globalFSOSync bool
diff --git a/cmd/mrf.go b/cmd/mrf.go
index 7eccda884..bd82863e5 100644
--- a/cmd/mrf.go
+++ b/cmd/mrf.go
@@ -19,7 +19,6 @@ package cmd
 
 import (
 	"context"
-	"sync"
 	"time"
 
 	"github.com/minio/madmin-go/v3"
@@ -44,37 +43,15 @@ type partialOperation struct {
 // mrfState sncapsulates all the information
 // related to the global background MRF.
 type mrfState struct {
-	ctx   context.Context
-	pools *erasureServerPools
-
-	mu   sync.RWMutex
 	opCh chan partialOperation
 }
 
-// Initialize healing MRF subsystem
-func (m *mrfState) init(ctx context.Context, objAPI ObjectLayer) {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-
-	m.ctx = ctx
-	m.opCh = make(chan partialOperation, mrfOpsQueueSize)
-
-	var ok bool
-	m.pools, ok = objAPI.(*erasureServerPools)
-	if ok {
-		go m.healRoutine()
-	}
-}
-
 // Add a partial S3 operation (put/delete) when one or more disks are offline.
 func (m *mrfState) addPartialOp(op partialOperation) {
 	if m == nil {
 		return
 	}
-	m.mu.RLock()
-	defer m.mu.RUnlock()
-
 	select {
 	case m.opCh <- op:
 	default:
 	}
@@ -86,10 +63,10 @@ var healSleeper = newDynamicSleeper(5, time.Second, false)
 // healRoutine listens to new disks reconnection events and
 // issues healing requests for queued objects belonging to the
 // corresponding erasure set
-func (m *mrfState) healRoutine() {
+func (m *mrfState) healRoutine(z *erasureServerPools) {
 	for {
 		select {
-		case <-m.ctx.Done():
+		case <-GlobalContext.Done():
 			return
 		case u, ok := <-m.opCh:
 			if !ok {
@@ -115,7 +92,7 @@ func (m *mrfState) healRoutine() {
 				healBucket(u.bucket, scan)
 			} else {
 				if u.allVersions {
-					m.pools.serverPools[u.poolIndex].sets[u.setIndex].listAndHeal(u.bucket, u.object, u.scanMode, healObjectVersionsDisparity)
+					z.serverPools[u.poolIndex].sets[u.setIndex].listAndHeal(u.bucket, u.object, u.scanMode, healObjectVersionsDisparity)
 				} else {
 					healObject(u.bucket, u.object, u.versionID, scan)
 				}
@@ -125,8 +102,3 @@ func (m *mrfState) healRoutine() {
 		}
 	}
 }
-
-// Initialize healing MRF
-func initHealMRF(ctx context.Context, obj ObjectLayer) {
-	globalMRFState.init(ctx, obj)
-}
diff --git a/cmd/server-main.go b/cmd/server-main.go
index 9001dc02b..01e21cb0d 100644
--- a/cmd/server-main.go
+++ b/cmd/server-main.go
@@ -30,7 +30,6 @@ import (
 	"os/signal"
 	"runtime"
 	"strings"
-	"sync"
 	"syscall"
 	"time"
 
@@ -372,15 +371,7 @@ func serverHandleCmdArgs(ctxt serverCtxt) {
 	globalConnWriteDeadline = ctxt.ConnWriteDeadline
 }
 
-var globalHealStateLK sync.RWMutex
-
 func initAllSubsystems(ctx context.Context) {
-	globalHealStateLK.Lock()
-	// New global heal state
-	globalAllHealState = newHealState(ctx, true)
-	globalBackgroundHealState = newHealState(ctx, false)
-	globalHealStateLK.Unlock()
-
 	// Initialize notification peer targets
 	globalNotificationSys = NewNotificationSys(globalEndpoints)
 
@@ -814,27 +805,6 @@ func serverMain(ctx *cli.Context) {
 		globalNodeNamesHex[hex.EncodeToString(nodeNameSum[:])] = struct{}{}
 	}
 
-	bootstrapTrace("newSharedLock", func() {
-		globalLeaderLock = newSharedLock(GlobalContext, newObject, "leader.lock")
-	})
-
-	// Enable background operations on
-	//
-	// - Disk auto healing
-	// - MRF (most recently failed) healing
-	// - Background expiration routine for lifecycle policies
-	bootstrapTrace("initAutoHeal", func() {
-		initAutoHeal(GlobalContext, newObject)
-	})
-
-	bootstrapTrace("initHealMRF", func() {
-		initHealMRF(GlobalContext, newObject)
-	})
-
-	bootstrapTrace("initBackgroundExpiry", func() {
-		initBackgroundExpiry(GlobalContext, newObject)
-	})
-
 	var err error
 	bootstrapTrace("initServerConfig", func() {
 		if err = initServerConfig(GlobalContext, newObject); err != nil {
diff --git a/cmd/service.go b/cmd/service.go
index ebe7394b5..5561cfc9f 100644
--- a/cmd/service.go
+++ b/cmd/service.go
@@ -40,18 +40,11 @@ const (
 )
 
 // Global service signal channel.
-var globalServiceSignalCh chan serviceSignal
+var globalServiceSignalCh = make(chan serviceSignal)
 
 // GlobalContext context that is canceled when server is requested to shut down.
-var GlobalContext context.Context
-
 // cancelGlobalContext can be used to indicate server shutdown.
-var cancelGlobalContext context.CancelFunc
-
-func initGlobalContext() {
-	GlobalContext, cancelGlobalContext = context.WithCancel(context.Background())
-	globalServiceSignalCh = make(chan serviceSignal)
-}
+var GlobalContext, cancelGlobalContext = context.WithCancel(context.Background())
 
 // restartProcess starts a new process passing it the active fd's. It
 // doesn't fork, but starts a new process using the same environment and
diff --git a/cmd/storage-rest-server.go b/cmd/storage-rest-server.go
index 7f57074fe..cd4d3da5d 100644
--- a/cmd/storage-rest-server.go
+++ b/cmd/storage-rest-server.go
@@ -54,7 +54,7 @@ var errDiskStale = errors.New("drive stale")
 
 // To abstract a disk over network.
 type storageRESTServer struct {
-	poolIndex, setIndex, diskIndex int
+	endpoint Endpoint
 }
 
 var (
@@ -74,10 +74,14 @@ var (
 	storageListDirRPC = grid.NewStream[*grid.MSS, grid.NoPayload, *ListDirResult](grid.HandlerListDir, grid.NewMSS, nil, func() *ListDirResult { return &ListDirResult{} }).WithOutCapacity(1)
 )
 
-func (s *storageRESTServer) getStorage() StorageAPI {
+func getStorageViaEndpoint(endpoint Endpoint) StorageAPI {
 	globalLocalDrivesMu.RLock()
 	defer globalLocalDrivesMu.RUnlock()
-	return globalLocalSetDrives[s.poolIndex][s.setIndex][s.diskIndex]
+	return globalLocalSetDrives[endpoint.PoolIdx][endpoint.SetIdx][endpoint.DiskIdx]
+}
+
+func (s *storageRESTServer) getStorage() StorageAPI {
+	return getStorageViaEndpoint(s.endpoint)
 }
 
 func (s *storageRESTServer) writeErrorResponse(w http.ResponseWriter, err error) {
@@ -1287,9 +1291,7 @@ func registerStorageRESTHandlers(router *mux.Router, endpointServerPools Endpoin
 		}
 
 		server := &storageRESTServer{
-			poolIndex: endpoint.PoolIdx,
-			setIndex:  endpoint.SetIdx,
-			diskIndex: endpoint.DiskIdx,
+			endpoint: endpoint,
 		}
 
 		subrouter := router.PathPrefix(path.Join(storageRESTPrefix, endpoint.Path)).Subrouter()
diff --git a/cmd/xl-storage.go b/cmd/xl-storage.go
index b74de4328..c84437e38 100644
--- a/cmd/xl-storage.go
+++ b/cmd/xl-storage.go
@@ -581,7 +581,7 @@ func (s *xlStorage) NSScanner(ctx context.Context, cache dataUsageCache, updates
 		}
 
 		done := globalScannerMetrics.time(scannerMetricApplyAll)
-		objInfos, err := item.applyVersionActions(ctx, objAPI, fivs.Versions)
+		objInfos, err := item.applyVersionActions(ctx, objAPI, fivs.Versions, globalExpiryState)
 		done()
 		if err != nil {