From 4790868878911aa8a0aed35887a9110ecf1f3822 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Tue, 9 Jun 2020 19:19:03 -0700 Subject: [PATCH] allow background IAM load to speed up startup (#9796) Also fix healthcheck handler to run success only if object layer has initialized fully for S3 API access call. --- cmd/admin-handlers_test.go | 2 +- cmd/gateway-main.go | 2 +- cmd/healthcheck-handler.go | 2 +- cmd/iam-object-store.go | 8 ---- cmd/iam.go | 82 ++++++++++++++++++++++++++++++++++---- cmd/peer-rest-client.go | 5 ++- cmd/server-main.go | 58 +++++++++++---------------- cmd/test-utils_test.go | 10 ++--- cmd/xl-v1-multipart.go | 5 ++- cmd/xl-v1-object.go | 5 ++- go.mod | 2 +- go.sum | 7 +++- 12 files changed, 124 insertions(+), 64 deletions(-) diff --git a/cmd/admin-handlers_test.go b/cmd/admin-handlers_test.go index 615b3ddd5..5955a50e1 100644 --- a/cmd/admin-handlers_test.go +++ b/cmd/admin-handlers_test.go @@ -70,7 +70,7 @@ func prepareAdminXLTestBed(ctx context.Context) (*adminXLTestBed, error) { newAllSubsystems() - initAllSubsystems(objLayer) + initAllSubsystems(ctx, objLayer) // Setup admin mgmt REST API handlers. adminRouter := mux.NewRouter() diff --git a/cmd/gateway-main.go b/cmd/gateway-main.go index a8f5d075b..067b6269a 100644 --- a/cmd/gateway-main.go +++ b/cmd/gateway-main.go @@ -274,7 +274,7 @@ func StartGateway(ctx *cli.Context, gw Gateway) { if enableIAMOps { // Initialize IAM sys. - logger.FatalIf(globalIAMSys.Init(GlobalContext, newObject), "Unable to initialize IAM system") + startBackgroundIAMLoad(GlobalContext) } if globalCacheConfig.Enabled { diff --git a/cmd/healthcheck-handler.go b/cmd/healthcheck-handler.go index 098ef802a..dd224885c 100644 --- a/cmd/healthcheck-handler.go +++ b/cmd/healthcheck-handler.go @@ -37,7 +37,7 @@ func ReadinessCheckHandler(w http.ResponseWriter, r *http.Request) { ctx, cancel := context.WithTimeout(ctx, globalAPIConfig.getReadyDeadline()) defer cancel() - if !objLayer.IsReady(ctx) { + if !objLayer.IsReady(ctx) && newObjectLayerFn() == nil { writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone) return } diff --git a/cmd/iam-object-store.go b/cmd/iam-object-store.go index 85a4b619d..4ec3e8d32 100644 --- a/cmd/iam-object-store.go +++ b/cmd/iam-object-store.go @@ -588,14 +588,6 @@ func listIAMConfigItems(ctx context.Context, objAPI ObjectLayer, pathPrefix stri return } - // Attempt a slow down load only when server is - // active and initialized. - if !globalSafeMode { - // Slow down listing and loading for config items to - // reduce load on the server - waitForLowHTTPReq(int32(globalEndpoints.NEndpoints())) - } - marker = lo.NextMarker lister := dirList(lo) if !dirs { diff --git a/cmd/iam.go b/cmd/iam.go index 1f28845eb..e52d3bc30 100644 --- a/cmd/iam.go +++ b/cmd/iam.go @@ -21,8 +21,10 @@ import ( "context" "encoding/base64" "encoding/json" + "errors" "fmt" "strings" + "time" "github.com/minio/minio-go/v6/pkg/set" "github.com/minio/minio/cmd/config" @@ -30,6 +32,7 @@ import ( "github.com/minio/minio/pkg/auth" iampolicy "github.com/minio/minio/pkg/iam/policy" "github.com/minio/minio/pkg/madmin" + "github.com/minio/minio/pkg/retry" ) // UsersSysType - defines the type of users and groups system that is @@ -403,10 +406,18 @@ func (sys *IAMSys) doIAMConfigMigration(ctx context.Context) error { return sys.store.migrateBackendFormat(ctx) } +// Loads IAM users and policies in background, any un-handled +// error means this code can potentially crash the server +// in such a situation manual intervention is necessary. +func startBackgroundIAMLoad(ctx context.Context) { + go globalIAMSys.Init(ctx, newObjectLayerWithoutSafeModeFn()) +} + // Init - initializes config system from iam.json -func (sys *IAMSys) Init(ctx context.Context, objAPI ObjectLayer) error { +func (sys *IAMSys) Init(ctx context.Context, objAPI ObjectLayer) { if objAPI == nil { - return errServerNotInitialized + logger.LogIf(ctx, errServerNotInitialized) + return } if globalEtcdClient == nil { @@ -419,18 +430,75 @@ func (sys *IAMSys) Init(ctx context.Context, objAPI ObjectLayer) error { sys.EnableLDAPSys() } - // Migrate IAM configuration - if err := sys.doIAMConfigMigration(ctx); err != nil { - return err + retryCtx, cancel := context.WithCancel(ctx) + + // Indicate to our routine to exit cleanly upon return. + defer cancel() + + // Hold the lock for migration only. + txnLk := objAPI.NewNSLock(retryCtx, minioMetaBucket, minioConfigPrefix+"/iam.lock") + + // Initializing IAM sub-system needs a retry mechanism for + // the following reasons: + // - Read quorum is lost just after the initialization + // of the object layer. + // - Write quorum not met when upgrading configuration + // version is needed, migration is needed etc. + rquorum := InsufficientReadQuorum{} + wquorum := InsufficientWriteQuorum{} + + for range retry.NewTimerWithJitter(retryCtx, time.Second, 5*time.Second, retry.MaxJitter) { + // let one of the server acquire the lock, if not let them timeout. + // which shall be retried again by this loop. + if err := txnLk.GetLock(newDynamicTimeout(1*time.Second, 5*time.Second)); err != nil { + logger.Info("Waiting for all MinIO IAM sub-system to be initialized.. trying to acquire lock") + continue + } + + if globalEtcdClient != nil { + // **** WARNING **** + // Migrating to encrypted backend on etcd should happen before initialization of + // IAM sub-system, make sure that we do not move the above codeblock elsewhere. + if err := migrateIAMConfigsEtcdToEncrypted(ctx, globalEtcdClient); err != nil { + txnLk.Unlock() + logger.LogIf(ctx, fmt.Errorf("Unable to handle encrypted backend for iam and policies: %w", err)) + return + } + } + + // These messages only meant primarily for distributed setup, so only log during distributed setup. + if globalIsDistXL { + logger.Info("Waiting for all MinIO IAM sub-system to be initialized.. lock acquired") + } + + // Migrate IAM configuration + if err := sys.doIAMConfigMigration(ctx); err != nil { + txnLk.Unlock() + if errors.Is(err, errDiskNotFound) || + errors.Is(err, errConfigNotFound) || + errors.Is(err, context.Canceled) || + errors.Is(err, context.DeadlineExceeded) || + errors.As(err, &rquorum) || + errors.As(err, &wquorum) || + isErrBucketNotFound(err) { + logger.Info("Waiting for all MinIO IAM sub-system to be initialized.. possible cause (%v)", err) + continue + } + logger.LogIf(ctx, fmt.Errorf("Unable to migration IAM users and policies: %w", err)) + return + } + + // Successfully migrated + txnLk.Unlock() + break } - err := sys.store.loadAll(ctx, sys) + logger.LogIf(ctx, sys.store.loadAll(ctx, sys)) // Invalidate the old cred after finishing IAM initialization globalOldCred = auth.Credentials{} go sys.store.watch(ctx, sys) - return err } // DeletePolicy - deletes a canned policy from backend or etcd. diff --git a/cmd/peer-rest-client.go b/cmd/peer-rest-client.go index cd9074356..2e556cae9 100644 --- a/cmd/peer-rest-client.go +++ b/cmd/peer-rest-client.go @@ -677,14 +677,15 @@ func (client *peerRESTClient) BackgroundHealStatus() (madmin.BgHealState, error) } // GetLocalDiskIDs - get a peer's local disks' IDs. -func (client *peerRESTClient) GetLocalDiskIDs(ctx context.Context) []string { +func (client *peerRESTClient) GetLocalDiskIDs(ctx context.Context) (diskIDs []string) { respBody, err := client.callWithContext(ctx, peerRESTMethodGetLocalDiskIDs, nil, nil, -1) if err != nil { + logger.LogIf(ctx, err) return nil } defer http.DrainBody(respBody) - var diskIDs []string if err = gob.NewDecoder(respBody).Decode(&diskIDs); err != nil { + logger.LogIf(ctx, err) return nil } return diskIDs diff --git a/cmd/server-main.go b/cmd/server-main.go index 549881758..3c984b009 100644 --- a/cmd/server-main.go +++ b/cmd/server-main.go @@ -169,15 +169,19 @@ func newAllSubsystems() { globalBucketQuotaSys = NewBucketQuotaSys() } -func initSafeMode() (err error) { - newObject := newObjectLayerWithoutSafeModeFn() +func initSafeMode(ctx context.Context, newObject ObjectLayer) (err error) { + // Create cancel context to control 'newRetryTimer' go routine. + retryCtx, cancel := context.WithCancel(ctx) + + // Indicate to our routine to exit cleanly upon return. + defer cancel() // Make sure to hold lock for entire migration to avoid // such that only one server should migrate the entire config // at a given time, this big transaction lock ensures this // appropriately. This is also true for rotation of encrypted // content. - txnLk := newObject.NewNSLock(GlobalContext, minioMetaBucket, minioConfigPrefix+"/transaction.lock") + txnLk := newObject.NewNSLock(retryCtx, minioMetaBucket, minioConfigPrefix+"/transaction.lock") defer func(txnLk RWLocker) { txnLk.Unlock() @@ -203,11 +207,6 @@ func initSafeMode() (err error) { // **** WARNING **** // Migrating to encrypted backend should happen before initialization of any // sub-systems, make sure that we do not move the above codeblock elsewhere. - // Create cancel context to control 'newRetryTimer' go routine. - retryCtx, cancel := context.WithCancel(GlobalContext) - - // Indicate to our routine to exit cleanly upon return. - defer cancel() // Initializing sub-systems needs a retry mechanism for // the following reasons: @@ -220,7 +219,7 @@ func initSafeMode() (err error) { for range retry.NewTimer(retryCtx) { // let one of the server acquire the lock, if not let them timeout. // which shall be retried again by this loop. - if err = txnLk.GetLock(newDynamicTimeout(5*time.Second, 30*time.Second)); err != nil { + if err = txnLk.GetLock(newDynamicTimeout(1*time.Second, 10*time.Second)); err != nil { logger.Info("Waiting for all MinIO sub-systems to be initialized.. trying to acquire lock") continue } @@ -236,7 +235,7 @@ func initSafeMode() (err error) { if err = handleEncryptedConfigBackend(newObject, true); err == nil { // Upon success migrating the config, initialize all sub-systems // if all sub-systems initialized successfully return right away - if err = initAllSubsystems(newObject); err == nil { + if err = initAllSubsystems(retryCtx, newObject); err == nil { // All successful return. if globalIsDistXL { // These messages only meant primarily for distributed setup, so only log during distributed setup. @@ -272,7 +271,7 @@ func initSafeMode() (err error) { return errors.New("Initializing sub-systems stopped gracefully") } -func initAllSubsystems(newObject ObjectLayer) (err error) { +func initAllSubsystems(ctx context.Context, newObject ObjectLayer) (err error) { // %w is used by all error returns here to make sure // we wrap the underlying error, make sure when you // are modifying this code that you do so, if and when @@ -281,7 +280,7 @@ func initAllSubsystems(newObject ObjectLayer) (err error) { var buckets []BucketInfo if globalIsDistXL || globalIsXL { // List buckets to heal, and be re-used for loading configs. - buckets, err = newObject.ListBucketsHeal(GlobalContext) + buckets, err = newObject.ListBucketsHeal(ctx) if err != nil { return fmt.Errorf("Unable to list buckets to heal: %w", err) } @@ -290,21 +289,21 @@ func initAllSubsystems(newObject ObjectLayer) (err error) { wquorum := &InsufficientWriteQuorum{} rquorum := &InsufficientReadQuorum{} for _, bucket := range buckets { - if err = newObject.MakeBucketWithLocation(GlobalContext, bucket.Name, "", false); err != nil { + if err = newObject.MakeBucketWithLocation(ctx, bucket.Name, "", false); err != nil { if errors.As(err, &wquorum) || errors.As(err, &rquorum) { - // Retrun the error upwards for the caller to retry. + // Return the error upwards for the caller to retry. return fmt.Errorf("Unable to heal bucket: %w", err) } if _, ok := err.(BucketExists); !ok { // ignore any other error and log for investigation. - logger.LogIf(GlobalContext, err) + logger.LogIf(ctx, err) continue } // Bucket already exists, nothing that needs to be done. } } } else { - buckets, err = newObject.ListBuckets(GlobalContext) + buckets, err = newObject.ListBuckets(ctx) if err != nil { return fmt.Errorf("Unable to list buckets: %w", err) } @@ -315,21 +314,14 @@ func initAllSubsystems(newObject ObjectLayer) (err error) { return fmt.Errorf("Unable to initialize config system: %w", err) } - if globalEtcdClient != nil { - // **** WARNING **** - // Migrating to encrypted backend on etcd should happen before initialization of - // IAM sub-systems, make sure that we do not move the above codeblock elsewhere. - if err = migrateIAMConfigsEtcdToEncrypted(GlobalContext, globalEtcdClient); err != nil { - return fmt.Errorf("Unable to handle encrypted backend for iam and policies: %w", err) - } - } - - if err = globalIAMSys.Init(GlobalContext, newObject); err != nil { - return fmt.Errorf("Unable to initialize IAM system: %w", err) + // Populate existing buckets to the etcd backend + if globalDNSConfig != nil { + // Background this operation. + go initFederatorBackend(buckets, newObject) } // Initialize bucket metadata sub-system. - if err = globalBucketMetadataSys.Init(GlobalContext, buckets, newObject); err != nil { + if err := globalBucketMetadataSys.Init(ctx, buckets, newObject); err != nil { return fmt.Errorf("Unable to initialize bucket metadata sub-system: %w", err) } @@ -338,11 +330,6 @@ func initAllSubsystems(newObject ObjectLayer) (err error) { return fmt.Errorf("Unable to initialize notification system: %w", err) } - // Populate existing buckets to the etcd backend - if globalDNSConfig != nil { - initFederatorBackend(buckets, newObject) - } - return nil } @@ -523,7 +510,10 @@ func serverMain(ctx *cli.Context) { go startBackgroundOps(GlobalContext, newObject) - logger.FatalIf(initSafeMode(), "Unable to initialize server switching into safe-mode") + logger.FatalIf(initSafeMode(GlobalContext, newObject), "Unable to initialize server switching into safe-mode") + + // Initialize users credentials and policies in background. + go startBackgroundIAMLoad(GlobalContext) if globalCacheConfig.Enabled { // initialize the new disk cache objects. diff --git a/cmd/test-utils_test.go b/cmd/test-utils_test.go index e44bb0900..55213ab2e 100644 --- a/cmd/test-utils_test.go +++ b/cmd/test-utils_test.go @@ -331,7 +331,7 @@ func UnstartedTestServer(t TestErrHandler, instanceType string) TestServer { newAllSubsystems() - initAllSubsystems(objLayer) + initAllSubsystems(ctx, objLayer) return testServer } @@ -1556,7 +1556,7 @@ func newTestObjectLayer(ctx context.Context, endpointZones EndpointZones) (newOb newAllSubsystems() - initAllSubsystems(z) + initAllSubsystems(ctx, z) return z, nil } @@ -1602,7 +1602,7 @@ func removeDiskN(disks []string, n int) { func initAPIHandlerTest(obj ObjectLayer, endpoints []string) (string, http.Handler, error) { newAllSubsystems() - initAllSubsystems(obj) + initAllSubsystems(context.Background(), obj) // get random bucket name. bucketName := getRandomBucketName() @@ -1878,7 +1878,7 @@ func ExecObjectLayerTest(t TestErrHandler, objTest objTestType) { t.Fatal("Unexpected error", err) } - initAllSubsystems(objLayer) + initAllSubsystems(ctx, objLayer) // Executing the object layer tests for single node setup. objTest(objLayer, FSTestStr, t) @@ -1890,7 +1890,7 @@ func ExecObjectLayerTest(t TestErrHandler, objTest objTestType) { t.Fatalf("Initialization of object layer failed for XL setup: %s", err) } - initAllSubsystems(objLayer) + initAllSubsystems(ctx, objLayer) defer removeRoots(append(fsDirs, fsDir)) // Executing the object layer tests for XL. diff --git a/cmd/xl-v1-multipart.go b/cmd/xl-v1-multipart.go index e60f6d2ce..81e953852 100644 --- a/cmd/xl-v1-multipart.go +++ b/cmd/xl-v1-multipart.go @@ -160,7 +160,10 @@ func (xl xlObjects) newMultipartUpload(ctx context.Context, bucket string, objec // we now know the number of blocks this object needs for data and parity. // establish the writeQuorum using this data - writeQuorum := dataBlocks + 1 + writeQuorum := dataBlocks + if dataBlocks == parityBlocks { + writeQuorum = dataBlocks + 1 + } if meta["content-type"] == "" { contentType := mimedb.TypeByExtension(path.Ext(object)) diff --git a/cmd/xl-v1-object.go b/cmd/xl-v1-object.go index c18292e6a..e89413a5e 100644 --- a/cmd/xl-v1-object.go +++ b/cmd/xl-v1-object.go @@ -500,7 +500,10 @@ func (xl xlObjects) putObject(ctx context.Context, bucket string, object string, // we now know the number of blocks this object needs for data and parity. // writeQuorum is dataBlocks + 1 - writeQuorum := dataDrives + 1 + writeQuorum := dataDrives + if dataDrives == parityDrives { + writeQuorum = dataDrives + 1 + } // Delete temporary object in the event of failure. // If PutObject succeeded there would be no temporary diff --git a/go.mod b/go.mod index 53797a3e1..20e46d694 100644 --- a/go.mod +++ b/go.mod @@ -67,7 +67,7 @@ require ( github.com/miekg/dns v1.1.8 github.com/minio/cli v1.22.0 github.com/minio/highwayhash v1.0.0 - github.com/minio/minio-go/v6 v6.0.56 + github.com/minio/minio-go/v6 v6.0.57 github.com/minio/parquet-go v0.0.0-20200414234858-838cfa8aae61 github.com/minio/sha256-simd v0.1.1 github.com/minio/simdjson-go v0.1.5-0.20200303142138-b17fe061ea37 diff --git a/go.sum b/go.sum index 476104ff9..c3580830f 100644 --- a/go.sum +++ b/go.sum @@ -232,6 +232,7 @@ github.com/klauspost/compress v1.10.3 h1:OP96hzwJVBIHYU52pVTI6CczrxPvrGfgqF9N5eT github.com/klauspost/compress v1.10.3/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= github.com/klauspost/cpuid v1.2.2 h1:1xAgYebNnsb9LKCdLOvFWtAxGU/33mjJtyOVbmUa0Us= github.com/klauspost/cpuid v1.2.2/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= +github.com/klauspost/cpuid v1.2.3/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/cpuid v1.2.4 h1:EBfaK0SWSwk+fgk6efYFWdzl8MwRWoOO1gkmiaTXPW4= github.com/klauspost/cpuid v1.2.4/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/pgzip v1.2.1 h1:oIPZROsWuPHpOdMVWLuJZXwgjhrW8r1yEX8UqMyeNHM= @@ -273,9 +274,11 @@ github.com/minio/cli v1.22.0 h1:VTQm7lmXm3quxO917X3p+el1l0Ca5X3S4PM2ruUYO68= github.com/minio/cli v1.22.0/go.mod h1:bYxnK0uS629N3Bq+AOZZ+6lwF77Sodk4+UL9vNuXhOY= github.com/minio/highwayhash v1.0.0 h1:iMSDhgUILCr0TNm8LWlSjF8N0ZIj2qbO8WHp6Q/J2BA= github.com/minio/highwayhash v1.0.0/go.mod h1:xQboMTeM9nY9v/LlAOxFctujiv5+Aq2hR5dxBpaMbdc= +github.com/minio/md5-simd v1.1.0 h1:QPfiOqlZH+Cj9teu0t9b1nTBfPbyTl16Of5MeuShdK4= +github.com/minio/md5-simd v1.1.0/go.mod h1:XpBqgZULrMYD3R+M28PcmP0CkI7PEMzB3U77ZrKZ0Gw= github.com/minio/minio-go/v6 v6.0.53/go.mod h1:DIvC/IApeHX8q1BAMVCXSXwpmrmM+I+iBvhvztQorfI= -github.com/minio/minio-go/v6 v6.0.56 h1:H4+v6UFV1V7VkEf1HjL15W9OvTL1Gy8EbMmjQZHqEbg= -github.com/minio/minio-go/v6 v6.0.56/go.mod h1:KQMM+/44DSlSGSQWSfRrAZ12FVMmpWNuX37i2AX0jfI= +github.com/minio/minio-go/v6 v6.0.57 h1:ixPkbKkyD7IhnluRgQpGSpHdpvNVaW6OD5R9IAO/9Tw= +github.com/minio/minio-go/v6 v6.0.57/go.mod h1:5+R/nM9Pwrh0vqF+HbYYDQ84wdUFPyXHkrdT4AIkifM= github.com/minio/parquet-go v0.0.0-20200414234858-838cfa8aae61 h1:pUSI/WKPdd77gcuoJkSzhJ4wdS8OMDOsOu99MtpXEQA= github.com/minio/parquet-go v0.0.0-20200414234858-838cfa8aae61/go.mod h1:4trzEJ7N1nBTd5Tt7OCZT5SEin+WiAXpdJ/WgPkESA8= github.com/minio/sha256-simd v0.1.1 h1:5QHSlgo3nt5yKOJrC7W8w7X+NFl8cMPZm96iu8kKUJU=