From 347b29d0590cbdf41688aecf6d0312786d2908d0 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Tue, 19 Nov 2019 17:42:27 -0800 Subject: [PATCH] Implement bucket expansion (#8509) --- cmd/admin-handlers_test.go | 4 +- cmd/admin-server-info.go | 157 ++-- cmd/background-heal-ops.go | 2 +- cmd/background-newdisks-heal-ops.go | 76 +- cmd/bucket-handlers.go | 22 +- cmd/bucket-handlers_test.go | 6 +- cmd/config-encrypted.go | 14 - cmd/config.go | 13 - cmd/consolelogger.go | 4 +- cmd/endpoint-ellipses.go | 102 ++- cmd/endpoint-ellipses_test.go | 44 +- cmd/endpoint.go | 242 +++-- cmd/endpoint_test.go | 170 ++-- cmd/format-xl.go | 8 +- cmd/format-xl_test.go | 4 +- cmd/global-heal.go | 42 +- cmd/globals.go | 8 +- cmd/healthcheck-handler.go | 20 +- cmd/local-locker.go | 9 + cmd/lock-rest-server.go | 34 +- cmd/metrics.go | 12 +- cmd/namespace-lock.go | 10 +- cmd/net.go | 7 +- cmd/notification.go | 34 +- cmd/object-api-datatypes.go | 10 + cmd/object-api-errors.go | 6 + cmd/object-api-input-checks.go | 9 - cmd/object-api-multipart_test.go | 16 +- cmd/peer-rest-client.go | 6 +- cmd/prepare-storage.go | 10 +- cmd/routers.go | 10 +- cmd/server-main.go | 48 +- cmd/server-main_test.go | 11 +- cmd/storage-rest-server.go | 98 +- cmd/storage-rest_test.go | 4 +- cmd/test-utils_test.go | 117 +-- cmd/tree-walk_test.go | 14 +- cmd/web-handlers_test.go | 11 +- cmd/xl-sets.go | 96 +- cmd/xl-sets_test.go | 5 +- cmd/xl-v1-common_test.go | 5 +- cmd/xl-v1-healing-common.go | 2 +- cmd/xl-v1-healing-common_test.go | 13 +- cmd/xl-v1-healing.go | 10 +- cmd/xl-v1-healing_test.go | 45 +- cmd/xl-v1-metadata_test.go | 20 +- cmd/xl-v1-multipart.go | 71 +- cmd/xl-v1-multipart_test.go | 3 +- cmd/xl-v1-object.go | 67 +- cmd/xl-v1-object_test.go | 82 +- cmd/xl-v1-utils_test.go | 16 +- cmd/xl-v1.go | 5 +- cmd/xl-zones.go | 1299 +++++++++++++++++++++++++++ docs/distributed/DESIGN.md | 11 +- docs/distributed/README.md | 21 +- docs/minio-limits.md | 6 +- go.sum | 6 + pkg/dsync/drwmutex.go | 59 +- pkg/dsync/dsync.go | 38 +- pkg/dsync/dsync_private_test.go | 58 -- pkg/dsync/dsync_test.go | 11 +- pkg/dsync/rpc-client-impl_test.go | 8 + pkg/dsync/rpc-client-interface.go | 3 + 63 files changed, 2208 insertions(+), 1166 deletions(-) create mode 100644 cmd/xl-zones.go delete mode 100644 pkg/dsync/dsync_private_test.go diff --git a/cmd/admin-handlers_test.go b/cmd/admin-handlers_test.go index 5ca55a62d..126c35244 100644 --- a/cmd/admin-handlers_test.go +++ b/cmd/admin-handlers_test.go @@ -61,7 +61,7 @@ func prepareAdminXLTestBed() (*adminXLTestBed, error) { // Initialize boot time globalBootTime = UTCNow() - globalEndpoints = mustGetNewEndpointList(xlDirs...) + globalEndpoints = mustGetZoneEndpoints(xlDirs...) // Set globalIsXL to indicate that the setup uses an erasure // code backend. @@ -113,7 +113,7 @@ func initTestXLObjLayer() (ObjectLayer, []string, error) { if err != nil { return nil, nil, err } - endpoints := mustGetNewEndpointList(xlDirs...) + endpoints := mustGetNewEndpoints(xlDirs...) 
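+	// globalEndpoints is now the zone-aware EndpointZones type, which is why the
+	// admin test bed above uses mustGetZoneEndpoints; waitForFormatXL below still
+	// works on a flat Endpoints list, which mustGetNewEndpoints returns.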
format, err := waitForFormatXL(true, endpoints, 1, 16) if err != nil { removeRoots(xlDirs) diff --git a/cmd/admin-server-info.go b/cmd/admin-server-info.go index 3c0f2d3c4..59d041f6f 100644 --- a/cmd/admin-server-info.go +++ b/cmd/admin-server-info.go @@ -30,27 +30,28 @@ import ( cpuhw "github.com/shirou/gopsutil/cpu" ) -// getLocalMemUsage - returns ServerMemUsageInfo for only the -// local endpoints from given list of endpoints -func getLocalMemUsage(endpoints EndpointList, r *http.Request) ServerMemUsageInfo { +// getLocalMemUsage - returns ServerMemUsageInfo for all zones, endpoints. +func getLocalMemUsage(endpointZones EndpointZones, r *http.Request) ServerMemUsageInfo { var memUsages []mem.Usage var historicUsages []mem.Usage seenHosts := set.NewStringSet() - for _, endpoint := range endpoints { - if seenHosts.Contains(endpoint.Host) { - continue - } - seenHosts.Add(endpoint.Host) + for _, ep := range endpointZones { + for _, endpoint := range ep.Endpoints { + if seenHosts.Contains(endpoint.Host) { + continue + } + seenHosts.Add(endpoint.Host) - // Only proceed for local endpoints - if endpoint.IsLocal { - memUsages = append(memUsages, mem.GetUsage()) - historicUsages = append(historicUsages, mem.GetHistoricUsage()) + // Only proceed for local endpoints + if endpoint.IsLocal { + memUsages = append(memUsages, mem.GetUsage()) + historicUsages = append(historicUsages, mem.GetHistoricUsage()) + } } } addr := r.Host if globalIsDistXL { - addr = GetLocalPeer(endpoints) + addr = GetLocalPeer(endpointZones) } return ServerMemUsageInfo{ Addr: addr, @@ -59,27 +60,28 @@ func getLocalMemUsage(endpoints EndpointList, r *http.Request) ServerMemUsageInf } } -// getLocalCPULoad - returns ServerCPULoadInfo for only the -// local endpoints from given list of endpoints -func getLocalCPULoad(endpoints EndpointList, r *http.Request) ServerCPULoadInfo { +// getLocalCPULoad - returns ServerCPULoadInfo for all zones, endpoints. +func getLocalCPULoad(endpointZones EndpointZones, r *http.Request) ServerCPULoadInfo { var cpuLoads []cpu.Load var historicLoads []cpu.Load seenHosts := set.NewStringSet() - for _, endpoint := range endpoints { - if seenHosts.Contains(endpoint.Host) { - continue - } - seenHosts.Add(endpoint.Host) + for _, ep := range endpointZones { + for _, endpoint := range ep.Endpoints { + if seenHosts.Contains(endpoint.Host) { + continue + } + seenHosts.Add(endpoint.Host) - // Only proceed for local endpoints - if endpoint.IsLocal { - cpuLoads = append(cpuLoads, cpu.GetLoad()) - historicLoads = append(historicLoads, cpu.GetHistoricLoad()) + // Only proceed for local endpoints + if endpoint.IsLocal { + cpuLoads = append(cpuLoads, cpu.GetLoad()) + historicLoads = append(historicLoads, cpu.GetHistoricLoad()) + } } } addr := r.Host if globalIsDistXL { - addr = GetLocalPeer(endpoints) + addr = GetLocalPeer(endpointZones) } return ServerCPULoadInfo{ Addr: addr, @@ -88,26 +90,27 @@ func getLocalCPULoad(endpoints EndpointList, r *http.Request) ServerCPULoadInfo } } -// getLocalDrivesPerf - returns ServerDrivesPerfInfo for only the -// local endpoints from given list of endpoints -func getLocalDrivesPerf(endpoints EndpointList, size int64, r *http.Request) madmin.ServerDrivesPerfInfo { +// getLocalDrivesPerf - returns ServerDrivesPerfInfo for all zones, endpoints. 
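+// It walks every zone but only measures drives behind endpoints local to this
+// node; a drive whose path fails os.Stat is reported with the stat error
+// instead of being skipped.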
+func getLocalDrivesPerf(endpointZones EndpointZones, size int64, r *http.Request) madmin.ServerDrivesPerfInfo { var dps []disk.Performance - for _, endpoint := range endpoints { - // Only proceed for local endpoints - if endpoint.IsLocal { - if _, err := os.Stat(endpoint.Path); err != nil { - // Since this drive is not available, add relevant details and proceed - dps = append(dps, disk.Performance{Path: endpoint.Path, Error: err.Error()}) - continue + for _, ep := range endpointZones { + for _, endpoint := range ep.Endpoints { + // Only proceed for local endpoints + if endpoint.IsLocal { + if _, err := os.Stat(endpoint.Path); err != nil { + // Since this drive is not available, add relevant details and proceed + dps = append(dps, disk.Performance{Path: endpoint.Path, Error: err.Error()}) + continue + } + dp := disk.GetPerformance(pathJoin(endpoint.Path, minioMetaTmpBucket, mustGetUUID()), size) + dp.Path = endpoint.Path + dps = append(dps, dp) } - dp := disk.GetPerformance(pathJoin(endpoint.Path, minioMetaTmpBucket, mustGetUUID()), size) - dp.Path = endpoint.Path - dps = append(dps, dp) } } addr := r.Host if globalIsDistXL { - addr = GetLocalPeer(endpoints) + addr = GetLocalPeer(endpointZones) } return madmin.ServerDrivesPerfInfo{ Addr: addr, @@ -116,31 +119,32 @@ func getLocalDrivesPerf(endpoints EndpointList, size int64, r *http.Request) mad } } -// getLocalCPUInfo - returns ServerCPUHardwareInfo only for the -// local endpoints from given list of endpoints -func getLocalCPUInfo(endpoints EndpointList, r *http.Request) madmin.ServerCPUHardwareInfo { +// getLocalCPUInfo - returns ServerCPUHardwareInfo for all zones, endpoints. +func getLocalCPUInfo(endpointZones EndpointZones, r *http.Request) madmin.ServerCPUHardwareInfo { var cpuHardwares []cpuhw.InfoStat seenHosts := set.NewStringSet() - for _, endpoint := range endpoints { - if seenHosts.Contains(endpoint.Host) { - continue - } - // Add to the list of visited hosts - seenHosts.Add(endpoint.Host) - // Only proceed for local endpoints - if endpoint.IsLocal { - cpuHardware, err := cpuhw.Info() - if err != nil { - return madmin.ServerCPUHardwareInfo{ - Error: err.Error(), - } + for _, ep := range endpointZones { + for _, endpoint := range ep.Endpoints { + if seenHosts.Contains(endpoint.Host) { + continue + } + // Add to the list of visited hosts + seenHosts.Add(endpoint.Host) + // Only proceed for local endpoints + if endpoint.IsLocal { + cpuHardware, err := cpuhw.Info() + if err != nil { + return madmin.ServerCPUHardwareInfo{ + Error: err.Error(), + } + } + cpuHardwares = append(cpuHardwares, cpuHardware...) } - cpuHardwares = append(cpuHardwares, cpuHardware...) } } addr := r.Host if globalIsDistXL { - addr = GetLocalPeer(endpoints) + addr = GetLocalPeer(endpointZones) } return madmin.ServerCPUHardwareInfo{ @@ -149,31 +153,32 @@ func getLocalCPUInfo(endpoints EndpointList, r *http.Request) madmin.ServerCPUHa } } -// getLocalNetworkInfo - returns ServerNetworkHardwareInfo only for the -// local endpoints from given list of endpoints -func getLocalNetworkInfo(endpoints EndpointList, r *http.Request) madmin.ServerNetworkHardwareInfo { +// getLocalNetworkInfo - returns ServerNetworkHardwareInfo for all zones, endpoints. 
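+// Hosts are tracked in seenHosts so interfaces are collected only once per
+// node, even when the same host contributes endpoints to several zones.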
+func getLocalNetworkInfo(endpointZones EndpointZones, r *http.Request) madmin.ServerNetworkHardwareInfo { var networkHardwares []net.Interface seenHosts := set.NewStringSet() - for _, endpoint := range endpoints { - if seenHosts.Contains(endpoint.Host) { - continue - } - // Add to the list of visited hosts - seenHosts.Add(endpoint.Host) - // Only proceed for local endpoints - if endpoint.IsLocal { - networkHardware, err := net.Interfaces() - if err != nil { - return madmin.ServerNetworkHardwareInfo{ - Error: err.Error(), - } + for _, ep := range endpointZones { + for _, endpoint := range ep.Endpoints { + if seenHosts.Contains(endpoint.Host) { + continue + } + // Add to the list of visited hosts + seenHosts.Add(endpoint.Host) + // Only proceed for local endpoints + if endpoint.IsLocal { + networkHardware, err := net.Interfaces() + if err != nil { + return madmin.ServerNetworkHardwareInfo{ + Error: err.Error(), + } + } + networkHardwares = append(networkHardwares, networkHardware...) } - networkHardwares = append(networkHardwares, networkHardware...) } } addr := r.Host if globalIsDistXL { - addr = GetLocalPeer(endpoints) + addr = GetLocalPeer(endpointZones) } return madmin.ServerNetworkHardwareInfo{ diff --git a/cmd/background-heal-ops.go b/cmd/background-heal-ops.go index d1f1d862a..c450beeb3 100644 --- a/cmd/background-heal-ops.go +++ b/cmd/background-heal-ops.go @@ -65,7 +65,7 @@ func (h *healRoutine) run() { // Wait at max 10 minute for an inprogress request before proceeding to heal waitCount := 600 // Any requests in progress, delay the heal. - for (globalHTTPServer.GetRequestCount() >= int32(globalXLSetCount*globalXLSetDriveCount)) && + for (globalHTTPServer.GetRequestCount() >= int32(globalEndpoints.Nodes())) && waitCount > 0 { waitCount-- time.Sleep(1 * time.Second) diff --git a/cmd/background-newdisks-heal-ops.go b/cmd/background-newdisks-heal-ops.go index a856a2624..b81fd8d12 100644 --- a/cmd/background-newdisks-heal-ops.go +++ b/cmd/background-newdisks-heal-ops.go @@ -44,7 +44,7 @@ func monitorLocalDisksAndHeal() { break } - sets, ok := objAPI.(*xlSets) + z, ok := objAPI.(*xlZones) if !ok { return } @@ -66,21 +66,24 @@ func monitorLocalDisksAndHeal() { for { time.Sleep(defaultMonitorNewDiskInterval) - localDisksToHeal := []Endpoint{} - for _, endpoint := range globalEndpoints { - if !endpoint.IsLocal { + localDisksInZoneHeal := make([]Endpoints, len(z.zones)) + for i, ep := range globalEndpoints { + localDisksToHeal := Endpoints{} + for _, endpoint := range ep.Endpoints { + if !endpoint.IsLocal { + continue + } + // Try to connect to the current endpoint + // and reformat if the current disk is not formatted + _, _, err := connectEndpoint(endpoint) + if err == errUnformattedDisk { + localDisksToHeal = append(localDisksToHeal, endpoint) + } + } + if len(localDisksToHeal) == 0 { continue } - // Try to connect to the current endpoint - // and reformat if the current disk is not formatted - _, _, err := connectEndpoint(endpoint) - if err == errUnformattedDisk { - localDisksToHeal = append(localDisksToHeal, endpoint) - } - } - - if len(localDisksToHeal) == 0 { - continue + localDisksInZoneHeal[i] = localDisksToHeal } // Reformat disks @@ -88,31 +91,36 @@ func monitorLocalDisksAndHeal() { // Ensure that reformatting disks is finished bgSeq.sourceCh <- nopHeal + var erasureSetInZoneToHeal = make([][]int, len(localDisksInZoneHeal)) // Compute the list of erasure set to heal - var erasureSetToHeal []int - for _, endpoint := range localDisksToHeal { - // Load the new format of this passed 
endpoint - _, format, err := connectEndpoint(endpoint) - if err != nil { - logger.LogIf(ctx, err) - continue - } - // Calculate the set index where the current endpoint belongs - setIndex, _, err := findDiskIndex(sets.format, format) - if err != nil { - logger.LogIf(ctx, err) - continue - } + for i, localDisksToHeal := range localDisksInZoneHeal { + var erasureSetToHeal []int + for _, endpoint := range localDisksToHeal { + // Load the new format of this passed endpoint + _, format, err := connectEndpoint(endpoint) + if err != nil { + logger.LogIf(ctx, err) + continue + } + // Calculate the set index where the current endpoint belongs + setIndex, _, err := findDiskIndex(z.zones[i].format, format) + if err != nil { + logger.LogIf(ctx, err) + continue + } - erasureSetToHeal = append(erasureSetToHeal, setIndex) + erasureSetToHeal = append(erasureSetToHeal, setIndex) + } + erasureSetInZoneToHeal[i] = erasureSetToHeal } // Heal all erasure sets that need - for _, setIndex := range erasureSetToHeal { - xlObj := sets.sets[setIndex] - err := healErasureSet(ctx, setIndex, xlObj) - if err != nil { - logger.LogIf(ctx, err) + for i, erasureSetToHeal := range erasureSetInZoneToHeal { + for _, setIndex := range erasureSetToHeal { + err := healErasureSet(ctx, setIndex, z.zones[i].sets[setIndex]) + if err != nil { + logger.LogIf(ctx, err) + } } } } diff --git a/cmd/bucket-handlers.go b/cmd/bucket-handlers.go index 1bffe0fc6..c566e97a8 100644 --- a/cmd/bucket-handlers.go +++ b/cmd/bucket-handlers.go @@ -383,12 +383,7 @@ func (api objectAPIHandlers) DeleteMultipleObjectsHandler(w http.ResponseWriter, deleteObjectsFn = api.CacheAPI().DeleteObjects } - type delObj struct { - origIndex int - name string - } - - var objectsToDelete []delObj + var objectsToDelete = map[string]int{} var dErrs = make([]APIErrorCode, len(deleteObjects.Objects)) for index, object := range deleteObjects.Objects { @@ -400,13 +395,16 @@ func (api objectAPIHandlers) DeleteMultipleObjectsHandler(w http.ResponseWriter, continue } - objectsToDelete = append(objectsToDelete, delObj{index, object.ObjectName}) + // Avoid duplicate objects, we use map to filter them out. + if _, ok := objectsToDelete[object.ObjectName]; !ok { + objectsToDelete[object.ObjectName] = index + } } - toNames := func(input []delObj) (output []string) { + toNames := func(input map[string]int) (output []string) { output = make([]string, len(input)) - for i := range input { - output[i] = input[i].name + for name, index := range input { + output[index] = name } return } @@ -417,8 +415,8 @@ func (api objectAPIHandlers) DeleteMultipleObjectsHandler(w http.ResponseWriter, return } - for i, obj := range objectsToDelete { - dErrs[obj.origIndex] = toAPIErrorCode(ctx, errs[i]) + for _, index := range objectsToDelete { + dErrs[index] = toAPIErrorCode(ctx, errs[index]) } // Collect deleted objects and errors if any. diff --git a/cmd/bucket-handlers_test.go b/cmd/bucket-handlers_test.go index 6971d9f17..cce4ab122 100644 --- a/cmd/bucket-handlers_test.go +++ b/cmd/bucket-handlers_test.go @@ -331,7 +331,7 @@ func testListMultipartUploadsHandler(obj ObjectLayer, instanceType, bucketName s shouldPass: false, }, // Test case -3. - // Setting invalid delimiter, expecting the HTTP response status to be http.StatusNotImplemented. + // Delimiter unsupported, but response is empty. 
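+		// The zone-backed object layer no longer rejects an unsupported delimiter
+		// with http.StatusNotImplemented; ListMultipartUploads now returns an empty
+		// listing with http.StatusOK, as asserted below.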
{ bucket: bucketName, prefix: "", @@ -341,8 +341,8 @@ func testListMultipartUploadsHandler(obj ObjectLayer, instanceType, bucketName s maxUploads: "0", accessKey: credentials.AccessKey, secretKey: credentials.SecretKey, - expectedRespStatus: http.StatusNotImplemented, - shouldPass: false, + expectedRespStatus: http.StatusOK, + shouldPass: true, }, // Test case - 4. // Setting Invalid prefix and marker combination. diff --git a/cmd/config-encrypted.go b/cmd/config-encrypted.go index aefc62f0f..c0ac1376a 100644 --- a/cmd/config-encrypted.go +++ b/cmd/config-encrypted.go @@ -44,20 +44,6 @@ func handleEncryptedConfigBackend(objAPI ObjectLayer, server bool) error { var encrypted bool var err error - // Construct path to config/transaction.lock for locking - transactionConfigPrefix := minioConfigPrefix + "/transaction.lock" - - // Make sure to hold lock for entire migration to avoid - // such that only one server should migrate the entire config - // at a given time, this big transaction lock ensures this - // appropriately. This is also true for rotation of encrypted - // content. - objLock := objAPI.NewNSLock(context.Background(), minioMetaBucket, transactionConfigPrefix) - if err := objLock.GetLock(globalOperationTimeout); err != nil { - return err - } - defer objLock.Unlock() - // Migrating Config backend needs a retry mechanism for // the following reasons: // - Read quorum is lost just after the initialization diff --git a/cmd/config.go b/cmd/config.go index d8e40b023..199289c12 100644 --- a/cmd/config.go +++ b/cmd/config.go @@ -287,19 +287,6 @@ func initConfig(objAPI ObjectLayer) error { } } - // Construct path to config/transaction.lock for locking - transactionConfigPrefix := minioConfigPrefix + "/transaction.lock" - - // Hold lock only by one server and let that server alone migrate - // all the config as necessary, this is to ensure that - // redundant locks are not held for each migration - this allows - // for a more predictable behavior while debugging. - objLock := objAPI.NewNSLock(context.Background(), minioMetaBucket, transactionConfigPrefix) - if err := objLock.GetLock(globalOperationTimeout); err != nil { - return err - } - defer objLock.Unlock() - // Migrates ${HOME}/.minio/config.json or config.json.deprecated // to '/.minio.sys/config/config.json' // ignore if the file doesn't exist. diff --git a/cmd/consolelogger.go b/cmd/consolelogger.go index bf627e50a..8f65e70cc 100644 --- a/cmd/consolelogger.go +++ b/cmd/consolelogger.go @@ -44,8 +44,8 @@ type HTTPConsoleLoggerSys struct { // NewConsoleLogger - creates new HTTPConsoleLoggerSys with all nodes subscribed to // the console logging pub sub system -func NewConsoleLogger(ctx context.Context, endpoints EndpointList) *HTTPConsoleLoggerSys { - host, err := xnet.ParseHost(GetLocalPeer(globalEndpoints)) +func NewConsoleLogger(ctx context.Context, endpointZones EndpointZones) *HTTPConsoleLoggerSys { + host, err := xnet.ParseHost(GetLocalPeer(endpointZones)) if err != nil { logger.FatalIf(err, "Unable to start console logging subsystem") } diff --git a/cmd/endpoint-ellipses.go b/cmd/endpoint-ellipses.go index d5de3680a..03263a285 100644 --- a/cmd/endpoint-ellipses.go +++ b/cmd/endpoint-ellipses.go @@ -58,31 +58,20 @@ func getDivisibleSize(totalSizes []uint64) (result uint64) { return result } +// isValidSetSize - checks whether given count is a valid set size for erasure coding. 
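+// A count is accepted when it is even and lies between the first and last
+// entries of setSizes; MINIO_ERASURE_SET_DRIVE_COUNT is validated against the
+// same rule before being passed to getSetIndexes as customSetDriveCount.
+// For example, data{1...64} with MINIO_ERASURE_SET_DRIVE_COUNT=8 yields eight
+// sets of eight drives (see TestGetSetIndexesEnvOverride).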
+var isValidSetSize = func(count uint64) bool { + return (count >= setSizes[0] && count <= setSizes[len(setSizes)-1] && count%2 == 0) +} + // getSetIndexes returns list of indexes which provides the set size // on each index, this function also determines the final set size // The final set size has the affinity towards choosing smaller // indexes (total sets) -func getSetIndexes(args []string, totalSizes []uint64) (setIndexes [][]uint64, err error) { +func getSetIndexes(args []string, totalSizes []uint64, customSetDriveCount uint64) (setIndexes [][]uint64, err error) { if len(totalSizes) == 0 || len(args) == 0 { return nil, errInvalidArgument } - // isValidSetSize - checks whether given count is a valid set size for erasure coding. - isValidSetSize := func(count uint64) bool { - return (count >= setSizes[0] && count <= setSizes[len(setSizes)-1] && count%2 == 0) - } - - var customSetDriveCount uint64 - if v := env.Get("MINIO_ERASURE_SET_DRIVE_COUNT", ""); v != "" { - customSetDriveCount, err = strconv.ParseUint(v, 10, 64) - if err != nil { - return nil, config.ErrInvalidErasureSetSize(err) - } - if !isValidSetSize(customSetDriveCount) { - return nil, config.ErrInvalidErasureSetSize(nil) - } - } - setIndexes = make([][]uint64, len(totalSizes)) for _, totalSize := range totalSizes { // Check if totalSize has minimum range upto setSize @@ -189,7 +178,7 @@ func getTotalSizes(argPatterns []ellipses.ArgPattern) []uint64 { // Parses all arguments and returns an endpointSet which is a collection // of endpoints following the ellipses pattern, this is what is used // by the object layer for initializing itself. -func parseEndpointSet(args ...string) (ep endpointSet, err error) { +func parseEndpointSet(customSetDriveCount uint64, args ...string) (ep endpointSet, err error) { var argPatterns = make([]ellipses.ArgPattern, len(args)) for i, arg := range args { patterns, perr := ellipses.FindEllipsesPatterns(arg) @@ -199,7 +188,7 @@ func parseEndpointSet(args ...string) (ep endpointSet, err error) { argPatterns[i] = patterns } - ep.setIndexes, err = getSetIndexes(args, getTotalSizes(argPatterns)) + ep.setIndexes, err = getSetIndexes(args, getTotalSizes(argPatterns), customSetDriveCount) if err != nil { return endpointSet{}, config.ErrInvalidErasureEndpoints(nil).Msg(err.Error()) } @@ -215,8 +204,15 @@ func parseEndpointSet(args ...string) (ep endpointSet, err error) { // For example: {1...64} is divided into 4 sets each of size 16. // This applies to even distributed setup syntax as well. func GetAllSets(args ...string) ([][]string, error) { - if len(args) == 0 { - return nil, errInvalidArgument + var customSetDriveCount uint64 + if v := env.Get("MINIO_ERASURE_SET_DRIVE_COUNT", ""); v != "" { + customSetDriveCount, err := strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, config.ErrInvalidErasureSetSize(err) + } + if !isValidSetSize(customSetDriveCount) { + return nil, config.ErrInvalidErasureSetSize(nil) + } } var setArgs [][]string @@ -225,7 +221,7 @@ func GetAllSets(args ...string) ([][]string, error) { // Check if we have more one args. if len(args) > 1 { var err error - setIndexes, err = getSetIndexes(args, []uint64{uint64(len(args))}) + setIndexes, err = getSetIndexes(args, []uint64{uint64(len(args))}, customSetDriveCount) if err != nil { return nil, err } @@ -239,7 +235,7 @@ func GetAllSets(args ...string) ([][]string, error) { } setArgs = s.Get() } else { - s, err := parseEndpointSet(args...) + s, err := parseEndpointSet(customSetDriveCount, args...) 
if err != nil { return nil, err } @@ -261,18 +257,60 @@ func GetAllSets(args ...string) ([][]string, error) { // CreateServerEndpoints - validates and creates new endpoints from input args, supports // both ellipses and without ellipses transparently. -func createServerEndpoints(serverAddr string, args ...string) (string, EndpointList, SetupType, int, int, error) { - setArgs, err := GetAllSets(args...) - if err != nil { - return serverAddr, nil, -1, 0, 0, err +func createServerEndpoints(serverAddr string, args ...string) (EndpointZones, SetupType, error) { + if len(args) == 0 { + return nil, -1, errInvalidArgument } - var endpoints EndpointList + var endpointZones EndpointZones var setupType SetupType - serverAddr, endpoints, setupType, err = CreateEndpoints(serverAddr, setArgs...) - if err != nil { - return serverAddr, nil, -1, 0, 0, err + if !ellipses.HasEllipses(args...) { + setArgs, err := GetAllSets(args...) + if err != nil { + return nil, -1, err + } + endpointList, newSetupType, err := CreateEndpoints(serverAddr, setArgs...) + if err != nil { + return nil, -1, err + } + endpointZones = append(endpointZones, ZoneEndpoints{ + SetCount: len(setArgs), + DrivesPerSet: len(setArgs[0]), + Endpoints: endpointList, + }) + globalXLSetDriveCount = len(setArgs[0]) + setupType = newSetupType + return endpointZones, setupType, nil } - return serverAddr, endpoints, setupType, len(setArgs), len(setArgs[0]), nil + // Look for duplicate args. + if _, err := GetAllSets(args...); err != nil { + return nil, -1, err + } + for _, arg := range args { + setArgs, err := GetAllSets(arg) + if err != nil { + return nil, -1, err + } + endpointList, newSetupType, err := CreateEndpoints(serverAddr, setArgs...) + if err != nil { + return nil, -1, err + } + if setupType != 0 && setupType != newSetupType { + return nil, -1, fmt.Errorf("Mixed modes of operation %s and %s are not allowed", + setupType, newSetupType) + } + if globalXLSetDriveCount != 0 && globalXLSetDriveCount != len(setArgs[0]) { + return nil, -1, fmt.Errorf("All zones should have same drive per set ratio - expected %d, got %d", + globalXLSetDriveCount, len(setArgs[0])) + } + endpointZones = append(endpointZones, ZoneEndpoints{ + SetCount: len(setArgs), + DrivesPerSet: len(setArgs[0]), + Endpoints: endpointList, + }) + globalXLSetDriveCount = len(setArgs[0]) + setupType = newSetupType + } + return endpointZones, setupType, nil } diff --git a/cmd/endpoint-ellipses_test.go b/cmd/endpoint-ellipses_test.go index 5c258d5be..edd8256a8 100644 --- a/cmd/endpoint-ellipses_test.go +++ b/cmd/endpoint-ellipses_test.go @@ -18,7 +18,6 @@ package cmd import ( "fmt" - "os" "reflect" "testing" @@ -55,7 +54,7 @@ func TestCreateServerEndpoints(t *testing.T) { } for i, testCase := range testCases { - _, _, _, _, _, err := createServerEndpoints(testCase.serverAddr, testCase.args...) + _, _, err := createServerEndpoints(testCase.serverAddr, testCase.args...) 
if err != nil && testCase.success { t.Errorf("Test %d: Expected success but failed instead %s", i+1, err) } @@ -74,8 +73,10 @@ func TestGetDivisibleSize(t *testing.T) { {[]uint64{8, 8, 8}, 8}, {[]uint64{24}, 24}, } - for i, testCase := range testCases { - t.Run(fmt.Sprintf("Test%d", i+1), func(t *testing.T) { + + for _, testCase := range testCases { + testCase := testCase + t.Run("", func(t *testing.T) { gotGCD := getDivisibleSize(testCase.totalSizes) if testCase.result != gotGCD { t.Errorf("Expected %v, got %v", testCase.result, gotGCD) @@ -90,45 +91,43 @@ func TestGetSetIndexesEnvOverride(t *testing.T) { args []string totalSizes []uint64 indexes [][]uint64 - envOverride string + envOverride uint64 success bool }{ { []string{"data{1...64}"}, []uint64{64}, [][]uint64{{8, 8, 8, 8, 8, 8, 8, 8}}, - "8", + 8, true, }, { []string{"data{1...60}"}, nil, nil, - "8", + 8, false, }, { []string{"data{1...64}"}, nil, nil, - "-1", + 64, false, }, { []string{"data{1...64}"}, nil, nil, - "2", + 2, false, }, } - for i, testCase := range testCases { - t.Run(fmt.Sprintf("Test%d", i+1), func(t *testing.T) { - if err := os.Setenv("MINIO_ERASURE_SET_DRIVE_COUNT", testCase.envOverride); err != nil { - t.Fatal(err) - } - gotIndexes, err := getSetIndexes(testCase.args, testCase.totalSizes) + for _, testCase := range testCases { + testCase := testCase + t.Run("", func(t *testing.T) { + gotIndexes, err := getSetIndexes(testCase.args, testCase.totalSizes, testCase.envOverride) if err != nil && testCase.success { t.Errorf("Expected success but failed instead %s", err) } @@ -138,7 +137,6 @@ func TestGetSetIndexesEnvOverride(t *testing.T) { if !reflect.DeepEqual(testCase.indexes, gotIndexes) { t.Errorf("Expected %v, got %v", testCase.indexes, gotIndexes) } - os.Unsetenv("MINIO_ERASURE_SET_DRIVE_COUNT") }) } } @@ -209,9 +207,10 @@ func TestGetSetIndexes(t *testing.T) { }, } - for i, testCase := range testCases { - t.Run(fmt.Sprintf("Test%d", i+1), func(t *testing.T) { - gotIndexes, err := getSetIndexes(testCase.args, testCase.totalSizes) + for _, testCase := range testCases { + testCase := testCase + t.Run("", func(t *testing.T) { + gotIndexes, err := getSetIndexes(testCase.args, testCase.totalSizes, 0) if err != nil && testCase.success { t.Errorf("Expected success but failed instead %s", err) } @@ -530,9 +529,10 @@ func TestParseEndpointSet(t *testing.T) { }, } - for i, testCase := range testCases { - t.Run(fmt.Sprintf("Test%d", i+1), func(t *testing.T) { - gotEs, err := parseEndpointSet(testCase.arg) + for _, testCase := range testCases { + testCase := testCase + t.Run("", func(t *testing.T) { + gotEs, err := parseEndpointSet(0, testCase.arg) if err != nil && testCase.success { t.Errorf("Expected success but failed instead %s", err) } diff --git a/cmd/endpoint.go b/cmd/endpoint.go index bee311789..38391076c 100644 --- a/cmd/endpoint.go +++ b/cmd/endpoint.go @@ -55,7 +55,6 @@ type Endpoint struct { *url.URL IsLocal bool SetIndex int - HostName string } func (endpoint Endpoint) String() string { @@ -75,15 +74,15 @@ func (endpoint Endpoint) Type() EndpointType { return URLEndpointType } -// IsHTTPS - returns true if secure for URLEndpointType. -func (endpoint Endpoint) IsHTTPS() bool { +// HTTPS - returns true if secure for URLEndpointType. +func (endpoint Endpoint) HTTPS() bool { return endpoint.Scheme == "https" } // UpdateIsLocal - resolves the host and updates if it is local or not. 
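+// The separate HostName field has been dropped from Endpoint; callers now read
+// the hostname back from the embedded url.URL via Hostname().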
func (endpoint *Endpoint) UpdateIsLocal() error { if !endpoint.IsLocal { - isLocal, err := isLocalHost(endpoint.HostName) + isLocal, err := isLocalHost(endpoint.Hostname()) if err != nil { return err } @@ -181,35 +180,46 @@ func NewEndpoint(arg string) (ep Endpoint, e error) { } return Endpoint{ - URL: u, - IsLocal: isLocal, - HostName: host, + URL: u, + IsLocal: isLocal, }, nil } -// EndpointList - list of same type of endpoint. -type EndpointList []Endpoint - -// Nodes - returns number of unique servers. -func (endpoints EndpointList) Nodes() int { - uniqueNodes := set.NewStringSet() - for _, endpoint := range endpoints { - if uniqueNodes.Contains(endpoint.Host) { - continue - } - uniqueNodes.Add(endpoint.Host) - } - return len(uniqueNodes) +// ZoneEndpoints represent endpoints in a given zone +// along with its setCount and drivesPerSet. +type ZoneEndpoints struct { + SetCount int + DrivesPerSet int + Endpoints Endpoints } -// IsHTTPS - returns true if secure for URLEndpointType. -func (endpoints EndpointList) IsHTTPS() bool { - return endpoints[0].IsHTTPS() +// EndpointZones - list of list of endpoints +type EndpointZones []ZoneEndpoints + +// HTTPS - returns true if secure for URLEndpointType. +func (l EndpointZones) HTTPS() bool { + return l[0].Endpoints.HTTPS() +} + +// Nodes - returns all nodes count +func (l EndpointZones) Nodes() (count int) { + for _, ep := range l { + count += len(ep.Endpoints) + } + return count +} + +// Endpoints - list of same type of endpoint. +type Endpoints []Endpoint + +// HTTPS - returns true if secure for URLEndpointType. +func (endpoints Endpoints) HTTPS() bool { + return endpoints[0].HTTPS() } // GetString - returns endpoint string of i-th endpoint (0-based), // and empty string for invalid indexes. -func (endpoints EndpointList) GetString(i int) string { +func (endpoints Endpoints) GetString(i int) string { if i < 0 || i >= len(endpoints) { return "" } @@ -217,7 +227,7 @@ func (endpoints EndpointList) GetString(i int) string { } // UpdateIsLocal - resolves the host and discovers the local host. -func (endpoints EndpointList) UpdateIsLocal() error { +func (endpoints Endpoints) UpdateIsLocal() error { var epsResolved int var foundLocal bool resolvedList := make([]bool, len(endpoints)) @@ -246,7 +256,7 @@ func (endpoints EndpointList) UpdateIsLocal() error { // return err if not Docker or Kubernetes // We use IsDocker() to check for Docker environment // We use IsKubernetes() to check for Kubernetes environment - isLocal, err := isLocalHost(endpoints[i].HostName) + isLocal, err := isLocalHost(endpoints[i].Hostname()) if err != nil { if !IsDocker() && !IsKubernetes() { return err @@ -256,8 +266,10 @@ func (endpoints EndpointList) UpdateIsLocal() error { // log error only if more than 1s elapsed if timeElapsed > time.Second { // Log the message to console about the host not being resolveable. - reqInfo := (&logger.ReqInfo{}).AppendTags("host", endpoints[i].HostName) - reqInfo.AppendTags("elapsedTime", humanize.RelTime(startTime, startTime.Add(timeElapsed), "elapsed", "")) + reqInfo := (&logger.ReqInfo{}).AppendTags("host", endpoints[i].Hostname()) + reqInfo.AppendTags("elapsedTime", + humanize.RelTime(startTime, startTime.Add(timeElapsed), + "elapsed", "")) ctx := logger.SetReqInfo(context.Background(), reqInfo) logger.LogIf(ctx, err, logger.Application) } @@ -301,8 +313,8 @@ func (endpoints EndpointList) UpdateIsLocal() error { return nil } -// NewEndpointList - returns new endpoint list based on input args. 
-func NewEndpointList(args ...string) (endpoints EndpointList, err error) { +// NewEndpoints - returns new endpoint list based on input args. +func NewEndpoints(args ...string) (endpoints Endpoints, err error) { var endpointType EndpointType var scheme string @@ -335,28 +347,30 @@ func NewEndpointList(args ...string) (endpoints EndpointList, err error) { return endpoints, nil } -func checkEndpointsSubOptimal(ctx *cli.Context, setupType SetupType, endpoints EndpointList) (err error) { +func checkEndpointsSubOptimal(ctx *cli.Context, setupType SetupType, endpointZones EndpointZones) (err error) { // Validate sub optimal ordering only for distributed setup. if setupType != DistXLSetupType { return nil } var endpointOrder int err = fmt.Errorf("Too many disk args are local, input is in sub-optimal order. Please review input args: %s", ctx.Args()) - for _, endpoint := range endpoints { - if endpoint.IsLocal { - endpointOrder++ - } else { - endpointOrder-- - } - if endpointOrder >= 2 { - return err + for _, endpoints := range endpointZones { + for _, endpoint := range endpoints.Endpoints { + if endpoint.IsLocal { + endpointOrder++ + } else { + endpointOrder-- + } + if endpointOrder >= 2 { + return err + } } } return nil } // Checks if there are any cross device mounts. -func checkCrossDeviceMounts(endpoints EndpointList) (err error) { +func checkCrossDeviceMounts(endpoints Endpoints) (err error) { var absPaths []string for _, endpoint := range endpoints { if endpoint.IsLocal { @@ -372,14 +386,14 @@ func checkCrossDeviceMounts(endpoints EndpointList) (err error) { } // CreateEndpoints - validates and creates new endpoints for given args. -func CreateEndpoints(serverAddr string, args ...[]string) (string, EndpointList, SetupType, error) { - var endpoints EndpointList +func CreateEndpoints(serverAddr string, args ...[]string) (Endpoints, SetupType, error) { + var endpoints Endpoints var setupType SetupType var err error // Check whether serverAddr is valid for this host. if err = CheckLocalServerAddr(serverAddr); err != nil { - return serverAddr, endpoints, setupType, err + return endpoints, setupType, err } _, serverAddrPort := mustSplitHostPort(serverAddr) @@ -389,36 +403,36 @@ func CreateEndpoints(serverAddr string, args ...[]string) (string, EndpointList, var endpoint Endpoint endpoint, err = NewEndpoint(args[0][0]) if err != nil { - return serverAddr, endpoints, setupType, err + return endpoints, setupType, err } if err := endpoint.UpdateIsLocal(); err != nil { - return serverAddr, endpoints, setupType, err + return endpoints, setupType, err } if endpoint.Type() != PathEndpointType { - return serverAddr, endpoints, setupType, config.ErrInvalidFSEndpoint(nil).Msg("use path style endpoint for FS setup") + return endpoints, setupType, config.ErrInvalidFSEndpoint(nil).Msg("use path style endpoint for FS setup") } endpoints = append(endpoints, endpoint) setupType = FSSetupType // Check for cross device mounts if any. if err = checkCrossDeviceMounts(endpoints); err != nil { - return serverAddr, endpoints, setupType, config.ErrInvalidFSEndpoint(nil).Msg(err.Error()) + return endpoints, setupType, config.ErrInvalidFSEndpoint(nil).Msg(err.Error()) } - return serverAddr, endpoints, setupType, nil + + return endpoints, setupType, nil } for i, iargs := range args { - var newEndpoints EndpointList // Convert args to endpoints - var eps EndpointList - eps, err = NewEndpointList(iargs...) + var newEndpoints Endpoints + eps, err := NewEndpoints(iargs...) 
if err != nil { - return serverAddr, endpoints, setupType, config.ErrInvalidErasureEndpoints(nil).Msg(err.Error()) + return endpoints, setupType, config.ErrInvalidErasureEndpoints(nil).Msg(err.Error()) } // Check for cross device mounts if any. if err = checkCrossDeviceMounts(eps); err != nil { - return serverAddr, endpoints, setupType, config.ErrInvalidErasureEndpoints(nil).Msg(err.Error()) + return endpoints, setupType, config.ErrInvalidErasureEndpoints(nil).Msg(err.Error()) } for _, ep := range eps { @@ -431,54 +445,44 @@ func CreateEndpoints(serverAddr string, args ...[]string) (string, EndpointList, // Return XL setup when all endpoints are path style. if endpoints[0].Type() == PathEndpointType { setupType = XLSetupType - return serverAddr, endpoints, setupType, nil + return endpoints, setupType, nil } - if err := endpoints.UpdateIsLocal(); err != nil { - return serverAddr, endpoints, setupType, config.ErrInvalidErasureEndpoints(nil).Msg(err.Error()) + if err = endpoints.UpdateIsLocal(); err != nil { + return endpoints, setupType, config.ErrInvalidErasureEndpoints(nil).Msg(err.Error()) } // Here all endpoints are URL style. endpointPathSet := set.NewStringSet() localEndpointCount := 0 - localServerAddrSet := set.NewStringSet() + localServerHostSet := set.NewStringSet() localPortSet := set.NewStringSet() for _, endpoint := range endpoints { endpointPathSet.Add(endpoint.Path) if endpoint.IsLocal { - localServerAddrSet.Add(endpoint.Host) + localServerHostSet.Add(endpoint.Hostname()) var port string _, port, err = net.SplitHostPort(endpoint.Host) if err != nil { port = serverAddrPort } - localPortSet.Add(port) localEndpointCount++ } } - // No local endpoint found. - if localEndpointCount == 0 { - return serverAddr, endpoints, setupType, config.ErrInvalidErasureEndpoints(nil).Msg("no endpoint pointing to the local machine is found") - } - // Check whether same path is not used in endpoints of a host on different port. { pathIPMap := make(map[string]set.StringSet) for _, endpoint := range endpoints { - var host string - host, _, err = net.SplitHostPort(endpoint.Host) - if err != nil { - host = endpoint.Host - } + host := endpoint.Hostname() hostIPSet, _ := getHostIP(host) if IPSet, ok := pathIPMap[endpoint.Path]; ok { if !IPSet.Intersection(hostIPSet).IsEmpty() { - return serverAddr, endpoints, setupType, + return endpoints, setupType, config.ErrInvalidErasureEndpoints(nil).Msg(fmt.Sprintf("path '%s' can not be served by different port on same address", endpoint.Path)) } pathIPMap[endpoint.Path] = IPSet.Union(hostIPSet) @@ -496,42 +500,25 @@ func CreateEndpoints(serverAddr string, args ...[]string) (string, EndpointList, continue } if localPathSet.Contains(endpoint.Path) { - return serverAddr, endpoints, setupType, + return endpoints, setupType, config.ErrInvalidErasureEndpoints(nil).Msg(fmt.Sprintf("path '%s' cannot be served by different address on same server", endpoint.Path)) } localPathSet.Add(endpoint.Path) } } - // Check whether serverAddrPort matches at least in one of port used in local endpoints. 
- { - if !localPortSet.Contains(serverAddrPort) { - if len(localPortSet) > 1 { - return serverAddr, endpoints, setupType, - config.ErrInvalidErasureEndpoints(nil).Msg("port number in server address must match with one of the port in local endpoints") - } - return serverAddr, endpoints, setupType, - config.ErrInvalidErasureEndpoints(nil).Msg("server address and local endpoint have different ports") - } - } - // All endpoints are pointing to local host if len(endpoints) == localEndpointCount { // If all endpoints have same port number, then this is XL setup using URL style endpoints. if len(localPortSet) == 1 { - if len(localServerAddrSet) > 1 { - // TODO: Even though all endpoints are local, the local host is referred by different IP/name. - // eg '172.0.0.1', 'localhost' and 'mylocalhostname' point to same local host. - // - // In this case, we bind to 0.0.0.0 ie to all interfaces. - // The actual way to do is bind to only IPs in uniqueLocalHosts. - serverAddr = net.JoinHostPort("", serverAddrPort) + if len(localServerHostSet) > 1 { + return endpoints, setupType, + config.ErrInvalidErasureEndpoints(nil).Msg("all local endpoints should not have different hostnames/ips") } - endpointPaths := endpointPathSet.ToSlice() - endpoints, _ = NewEndpointList(endpointPaths...) + endpoints, _ := NewEndpoints(endpointPaths...) setupType = XLSetupType - return serverAddr, endpoints, setupType, nil + return endpoints, setupType, nil } // Even though all endpoints are local, but those endpoints use different ports. @@ -539,24 +526,20 @@ func CreateEndpoints(serverAddr string, args ...[]string) (string, EndpointList, } else { // This is DistXL setup. // Check whether local server address are not 127.x.x.x - for _, localServerAddr := range localServerAddrSet.ToSlice() { - host, _, err := net.SplitHostPort(localServerAddr) - if err != nil { - host = localServerAddr - } - - ipList, err := getHostIP(host) - logger.FatalIf(err, "unexpected error when resolving host '%s'", host) + for _, localHost := range localServerHostSet.ToSlice() { + ipList, err := getHostIP(localHost) + logger.FatalIf(err, "unexpected error when resolving host '%s'", localHost) // Filter ipList by IPs those start with '127.' or '::1' loopBackIPs := ipList.FuncMatch(func(ip string, matchString string) bool { - return strings.HasPrefix(ip, "127.") || strings.HasPrefix(ip, "::1") + return net.ParseIP(ip).IsLoopback() }, "") // If loop back IP is found and ipList contains only loop back IPs, then error out. if len(loopBackIPs) > 0 && len(loopBackIPs) == len(ipList) { - err = fmt.Errorf("'%s' resolves to loopback address is not allowed for distributed XL", localServerAddr) - return serverAddr, endpoints, setupType, err + err = fmt.Errorf("'%s' resolves to loopback address is not allowed for distributed XL", + localHost) + return endpoints, setupType, err } } } @@ -580,7 +563,7 @@ func CreateEndpoints(serverAddr string, args ...[]string) (string, EndpointList, // Error out if we have less than 2 unique servers. 
if len(uniqueArgs.ToSlice()) < 2 && setupType == DistXLSetupType { err := fmt.Errorf("Unsupported number of endpoints (%s), minimum number of servers cannot be less than 2 in distributed setup", endpoints) - return serverAddr, endpoints, setupType, err + return endpoints, setupType, err } publicIPs := env.Get(config.EnvPublicIPs, "") @@ -589,7 +572,7 @@ func CreateEndpoints(serverAddr string, args ...[]string) (string, EndpointList, } setupType = DistXLSetupType - return serverAddr, endpoints, setupType, nil + return endpoints, setupType, nil } // GetLocalPeer - returns local peer value, returns globalMinioAddr @@ -597,14 +580,16 @@ func CreateEndpoints(serverAddr string, args ...[]string) (string, EndpointList, // the first element from the set of peers which indicate that // they are local. There is always one entry that is local // even with repeated server endpoints. -func GetLocalPeer(endpoints EndpointList) (localPeer string) { +func GetLocalPeer(endpointZones EndpointZones) (localPeer string) { peerSet := set.NewStringSet() - for _, endpoint := range endpoints { - if endpoint.Type() != URLEndpointType { - continue - } - if endpoint.IsLocal && endpoint.Host != "" { - peerSet.Add(endpoint.Host) + for _, ep := range endpointZones { + for _, endpoint := range ep.Endpoints { + if endpoint.Type() != URLEndpointType { + continue + } + if endpoint.IsLocal && endpoint.Host != "" { + peerSet.Add(endpoint.Host) + } } } if peerSet.IsEmpty() { @@ -620,23 +605,24 @@ func GetLocalPeer(endpoints EndpointList) (localPeer string) { } // GetRemotePeers - get hosts information other than this minio service. -func GetRemotePeers(endpoints EndpointList) []string { +func GetRemotePeers(endpointZones EndpointZones) []string { peerSet := set.NewStringSet() - for _, endpoint := range endpoints { - if endpoint.Type() != URLEndpointType { - continue - } - - peer := endpoint.Host - if endpoint.IsLocal { - if _, port := mustSplitHostPort(peer); port == globalMinioPort { + for _, ep := range endpointZones { + for _, endpoint := range ep.Endpoints { + if endpoint.Type() != URLEndpointType { continue } + + peer := endpoint.Host + if endpoint.IsLocal { + if _, port := mustSplitHostPort(peer); port == globalMinioPort { + continue + } + } + + peerSet.Add(peer) } - - peerSet.Add(peer) } - return peerSet.ToSlice() } @@ -664,6 +650,10 @@ func updateDomainIPs(endPoints set.StringSet) { ipList = ipList.Union(IPsWithPort) } globalDomainIPs = ipList.FuncMatch(func(ip string, matchString string) bool { - return !(strings.HasPrefix(ip, "127.") || strings.HasPrefix(ip, "::1") || strings.HasPrefix(ip, "[::1]")) + host, _, err := net.SplitHostPort(ip) + if err != nil { + host = ip + } + return !net.ParseIP(host).IsLoopback() }, "") } diff --git a/cmd/endpoint_test.go b/cmd/endpoint_test.go index 4098bfca6..72746d3cd 100644 --- a/cmd/endpoint_test.go +++ b/cmd/endpoint_test.go @@ -1,5 +1,5 @@ /* - * MinIO Cloud Storage, (C) 2017 MinIO, Inc. + * MinIO Cloud Storage, (C) 2017,2018,2019 MinIO, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ package cmd import ( "flag" "fmt" + "net" "net/url" "reflect" "strings" @@ -35,19 +36,19 @@ func TestSubOptimalEndpointInput(t *testing.T) { tests := []struct { setupType SetupType ctx *cli.Context - endpoints EndpointList + endpoints EndpointZones isErr bool }{ { setupType: DistXLSetupType, ctx: cli.NewContext(cli.NewApp(), flag.NewFlagSet("", flag.ContinueOnError), nil), - endpoints: mustGetNewEndpointList(args1...), + endpoints: mustGetZoneEndpoints(args1...), isErr: false, }, { setupType: DistXLSetupType, ctx: cli.NewContext(cli.NewApp(), flag.NewFlagSet("", flag.ContinueOnError), nil), - endpoints: mustGetNewEndpointList(args2...), + endpoints: mustGetZoneEndpoints(args2...), isErr: false, }, } @@ -90,11 +91,11 @@ func TestNewEndpoint(t *testing.T) { {"http:path", Endpoint{URL: &url.URL{Path: "http:path"}, IsLocal: true}, PathEndpointType, nil}, {"http:/path", Endpoint{URL: &url.URL{Path: "http:/path"}, IsLocal: true}, PathEndpointType, nil}, {"http:///path", Endpoint{URL: &url.URL{Path: "http:/path"}, IsLocal: true}, PathEndpointType, nil}, - {"http://localhost/path", Endpoint{URL: u1, IsLocal: true, HostName: "localhost"}, URLEndpointType, nil}, - {"http://localhost/path//", Endpoint{URL: u1, IsLocal: true, HostName: "localhost"}, URLEndpointType, nil}, - {"https://example.org/path", Endpoint{URL: u2, IsLocal: false, HostName: "example.org"}, URLEndpointType, nil}, - {"http://127.0.0.1:8080/path", Endpoint{URL: u3, IsLocal: true, HostName: "127.0.0.1"}, URLEndpointType, nil}, - {"http://192.168.253.200/path", Endpoint{URL: u4, IsLocal: false, HostName: "192.168.253.200"}, URLEndpointType, nil}, + {"http://localhost/path", Endpoint{URL: u1, IsLocal: true}, URLEndpointType, nil}, + {"http://localhost/path//", Endpoint{URL: u1, IsLocal: true}, URLEndpointType, nil}, + {"https://example.org/path", Endpoint{URL: u2, IsLocal: false}, URLEndpointType, nil}, + {"http://127.0.0.1:8080/path", Endpoint{URL: u3, IsLocal: true}, URLEndpointType, nil}, + {"http://192.168.253.200/path", Endpoint{URL: u4, IsLocal: false}, URLEndpointType, nil}, {"", Endpoint{}, -1, fmt.Errorf("empty or root endpoint is not supported")}, {SlashSeparator, Endpoint{}, -1, fmt.Errorf("empty or root endpoint is not supported")}, {`\`, Endpoint{}, -1, fmt.Errorf("empty or root endpoint is not supported")}, @@ -136,7 +137,7 @@ func TestNewEndpoint(t *testing.T) { } } -func TestNewEndpointList(t *testing.T) { +func TestNewEndpoints(t *testing.T) { testCases := []struct { args []string expectedErr error @@ -159,7 +160,7 @@ func TestNewEndpointList(t *testing.T) { } for _, testCase := range testCases { - _, err := NewEndpointList(testCase.args...) + _, err := NewEndpoints(testCase.args...) if testCase.expectedErr == nil { if err != nil { t.Fatalf("error: expected = , got = %v", err) @@ -175,7 +176,7 @@ func TestNewEndpointList(t *testing.T) { func TestCreateEndpoints(t *testing.T) { // Filter ipList by IPs those do not start with '127.'. 
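+	// The loopback filter below now relies on net.ParseIP(ip).IsLoopback(), which
+	// also covers ::1, instead of matching on the '127.' prefix alone.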
nonLoopBackIPs := localIP4.FuncMatch(func(ip string, matchString string) bool { - return !strings.HasPrefix(ip, "127.") + return !net.ParseIP(ip).IsLoopback() }, "") if len(nonLoopBackIPs) == 0 { t.Fatalf("No non-loop back IP address found for this host") @@ -257,120 +258,111 @@ func TestCreateEndpoints(t *testing.T) { serverAddr string args [][]string expectedServerAddr string - expectedEndpoints EndpointList + expectedEndpoints Endpoints expectedSetupType SetupType expectedErr error }{ - {"localhost", [][]string{}, "", EndpointList{}, -1, fmt.Errorf("address localhost: missing port in address")}, + {"localhost", [][]string{}, "", Endpoints{}, -1, fmt.Errorf("address localhost: missing port in address")}, // FS Setup - {"localhost:9000", [][]string{{"http://localhost/d1"}}, "", EndpointList{}, -1, fmt.Errorf("use path style endpoint for FS setup")}, - {":443", [][]string{{"d1"}}, ":443", EndpointList{Endpoint{URL: &url.URL{Path: "d1"}, IsLocal: true}}, FSSetupType, nil}, - {"localhost:10000", [][]string{{"/d1"}}, "localhost:10000", EndpointList{Endpoint{URL: &url.URL{Path: "/d1"}, IsLocal: true}}, FSSetupType, nil}, - {"localhost:10000", [][]string{{"./d1"}}, "localhost:10000", EndpointList{Endpoint{URL: &url.URL{Path: "d1"}, IsLocal: true}}, FSSetupType, nil}, - {"localhost:10000", [][]string{{`\d1`}}, "localhost:10000", EndpointList{Endpoint{URL: &url.URL{Path: `\d1`}, IsLocal: true}}, FSSetupType, nil}, - {"localhost:10000", [][]string{{`.\d1`}}, "localhost:10000", EndpointList{Endpoint{URL: &url.URL{Path: `.\d1`}, IsLocal: true}}, FSSetupType, nil}, - {":8080", [][]string{{"https://example.org/d1", "https://example.org/d2", "https://example.org/d3", "https://example.org/d4"}}, "", EndpointList{}, -1, fmt.Errorf("no endpoint pointing to the local machine is found")}, - {":8080", [][]string{{"https://example.org/d1", "https://example.com/d2", "https://example.net:8000/d3", "https://example.edu/d1"}}, "", EndpointList{}, -1, fmt.Errorf("no endpoint pointing to the local machine is found")}, - {"localhost:9000", [][]string{{"https://127.0.0.1:9000/d1", "https://localhost:9001/d1", "https://example.com/d1", "https://example.com/d2"}}, "", EndpointList{}, -1, fmt.Errorf("path '/d1' can not be served by different port on same address")}, - {"localhost:9000", [][]string{{"https://127.0.0.1:8000/d1", "https://localhost:9001/d2", "https://example.com/d1", "https://example.com/d2"}}, "", EndpointList{}, -1, fmt.Errorf("port number in server address must match with one of the port in local endpoints")}, - {"localhost:10000", [][]string{{"https://127.0.0.1:8000/d1", "https://localhost:8000/d2", "https://example.com/d1", "https://example.com/d2"}}, "", EndpointList{}, -1, fmt.Errorf("server address and local endpoint have different ports")}, + {"localhost:9000", [][]string{{"http://localhost/d1"}}, "", Endpoints{}, -1, fmt.Errorf("use path style endpoint for FS setup")}, + {":443", [][]string{{"d1"}}, ":443", Endpoints{Endpoint{URL: &url.URL{Path: "d1"}, IsLocal: true}}, FSSetupType, nil}, + {"localhost:10000", [][]string{{"/d1"}}, "localhost:10000", Endpoints{Endpoint{URL: &url.URL{Path: "/d1"}, IsLocal: true}}, FSSetupType, nil}, + {"localhost:10000", [][]string{{"./d1"}}, "localhost:10000", Endpoints{Endpoint{URL: &url.URL{Path: "d1"}, IsLocal: true}}, FSSetupType, nil}, + {"localhost:10000", [][]string{{`\d1`}}, "localhost:10000", Endpoints{Endpoint{URL: &url.URL{Path: `\d1`}, IsLocal: true}}, FSSetupType, nil}, + {"localhost:10000", [][]string{{`.\d1`}}, "localhost:10000", 
Endpoints{Endpoint{URL: &url.URL{Path: `.\d1`}, IsLocal: true}}, FSSetupType, nil}, + {"localhost:9000", [][]string{{"https://127.0.0.1:9000/d1", "https://localhost:9001/d1", "https://example.com/d1", "https://example.com/d2"}}, "", Endpoints{}, -1, fmt.Errorf("path '/d1' can not be served by different port on same address")}, // XL Setup with PathEndpointType {":1234", [][]string{{"/d1", "/d2", "d3", "d4"}}, ":1234", - EndpointList{ + Endpoints{ Endpoint{URL: &url.URL{Path: "/d1"}, IsLocal: true}, Endpoint{URL: &url.URL{Path: "/d2"}, IsLocal: true}, Endpoint{URL: &url.URL{Path: "d3"}, IsLocal: true}, Endpoint{URL: &url.URL{Path: "d4"}, IsLocal: true}, }, XLSetupType, nil}, // XL Setup with URLEndpointType - {":9000", [][]string{{"http://localhost/d1", "http://localhost/d2", "http://localhost/d3", "http://localhost/d4"}}, ":9000", EndpointList{ + {":9000", [][]string{{"http://localhost/d1", "http://localhost/d2", "http://localhost/d3", "http://localhost/d4"}}, ":9000", Endpoints{ Endpoint{URL: &url.URL{Path: "/d1"}, IsLocal: true}, Endpoint{URL: &url.URL{Path: "/d2"}, IsLocal: true}, Endpoint{URL: &url.URL{Path: "/d3"}, IsLocal: true}, Endpoint{URL: &url.URL{Path: "/d4"}, IsLocal: true}, }, XLSetupType, nil}, // XL Setup with URLEndpointType having mixed naming to local host. - {"127.0.0.1:10000", [][]string{{"http://localhost/d1", "http://localhost/d2", "http://127.0.0.1/d3", "http://127.0.0.1/d4"}}, ":10000", EndpointList{ + {"127.0.0.1:10000", [][]string{{"http://localhost/d1", "http://localhost/d2", "http://127.0.0.1/d3", "http://127.0.0.1/d4"}}, ":10000", Endpoints{ Endpoint{URL: &url.URL{Path: "/d1"}, IsLocal: true}, Endpoint{URL: &url.URL{Path: "/d2"}, IsLocal: true}, Endpoint{URL: &url.URL{Path: "/d3"}, IsLocal: true}, Endpoint{URL: &url.URL{Path: "/d4"}, IsLocal: true}, - }, XLSetupType, nil}, - {":9001", [][]string{{"http://10.0.0.1:9000/export", "http://10.0.0.2:9000/export", "http://" + nonLoopBackIP + ":9001/export", "http://10.0.0.2:9001/export"}}, "", EndpointList{}, -1, fmt.Errorf("path '/export' can not be served by different port on same address")}, + }, XLSetupType, fmt.Errorf("all local endpoints should not have different hostname/ips")}, + {":9001", [][]string{{"http://10.0.0.1:9000/export", "http://10.0.0.2:9000/export", "http://" + nonLoopBackIP + ":9001/export", "http://10.0.0.2:9001/export"}}, "", Endpoints{}, -1, fmt.Errorf("path '/export' can not be served by different port on same address")}, - {":9000", [][]string{{"http://127.0.0.1:9000/export", "http://" + nonLoopBackIP + ":9000/export", "http://10.0.0.1:9000/export", "http://10.0.0.2:9000/export"}}, "", EndpointList{}, -1, fmt.Errorf("path '/export' cannot be served by different address on same server")}, + {":9000", [][]string{{"http://127.0.0.1:9000/export", "http://" + nonLoopBackIP + ":9000/export", "http://10.0.0.1:9000/export", "http://10.0.0.2:9000/export"}}, "", Endpoints{}, -1, fmt.Errorf("path '/export' cannot be served by different address on same server")}, - {":9000", [][]string{{"http://localhost/d1", "http://localhost/d2", "http://example.org/d3", "http://example.com/d4"}}, "", EndpointList{}, -1, fmt.Errorf("'localhost' resolves to loopback address is not allowed for distributed XL")}, + {":9000", [][]string{{"http://localhost/d1", "http://localhost/d2", "http://example.org/d3", "http://example.com/d4"}}, "", Endpoints{}, -1, fmt.Errorf("'localhost' resolves to loopback address is not allowed for distributed XL")}, // DistXL type - {"127.0.0.1:10000", [][]string{{case1Endpoint1, 
case1Endpoint2, "http://example.org/d3", "http://example.com/d4"}}, "127.0.0.1:10000", EndpointList{ - Endpoint{URL: case1URLs[0], IsLocal: case1LocalFlags[0], HostName: nonLoopBackIP}, - Endpoint{URL: case1URLs[1], IsLocal: case1LocalFlags[1], HostName: nonLoopBackIP}, - Endpoint{URL: case1URLs[2], IsLocal: case1LocalFlags[2], HostName: "example.org"}, - Endpoint{URL: case1URLs[3], IsLocal: case1LocalFlags[3], HostName: "example.com"}, + {"127.0.0.1:10000", [][]string{{case1Endpoint1, case1Endpoint2, "http://example.org/d3", "http://example.com/d4"}}, "127.0.0.1:10000", Endpoints{ + Endpoint{URL: case1URLs[0], IsLocal: case1LocalFlags[0]}, + Endpoint{URL: case1URLs[1], IsLocal: case1LocalFlags[1]}, + Endpoint{URL: case1URLs[2], IsLocal: case1LocalFlags[2]}, + Endpoint{URL: case1URLs[3], IsLocal: case1LocalFlags[3]}, }, DistXLSetupType, nil}, - {"127.0.0.1:10000", [][]string{{case2Endpoint1, case2Endpoint2, "http://example.org/d3", "http://example.com/d4"}}, "127.0.0.1:10000", EndpointList{ - Endpoint{URL: case2URLs[0], IsLocal: case2LocalFlags[0], HostName: nonLoopBackIP}, - Endpoint{URL: case2URLs[1], IsLocal: case2LocalFlags[1], HostName: nonLoopBackIP}, - Endpoint{URL: case2URLs[2], IsLocal: case2LocalFlags[2], HostName: "example.org"}, - Endpoint{URL: case2URLs[3], IsLocal: case2LocalFlags[3], HostName: "example.com"}, + {"127.0.0.1:10000", [][]string{{case2Endpoint1, case2Endpoint2, "http://example.org/d3", "http://example.com/d4"}}, "127.0.0.1:10000", Endpoints{ + Endpoint{URL: case2URLs[0], IsLocal: case2LocalFlags[0]}, + Endpoint{URL: case2URLs[1], IsLocal: case2LocalFlags[1]}, + Endpoint{URL: case2URLs[2], IsLocal: case2LocalFlags[2]}, + Endpoint{URL: case2URLs[3], IsLocal: case2LocalFlags[3]}, }, DistXLSetupType, nil}, - {":80", [][]string{{case3Endpoint1, "http://example.org:9000/d2", "http://example.com/d3", "http://example.net/d4"}}, ":80", EndpointList{ - Endpoint{URL: case3URLs[0], IsLocal: case3LocalFlags[0], HostName: nonLoopBackIP}, - Endpoint{URL: case3URLs[1], IsLocal: case3LocalFlags[1], HostName: "example.org"}, - Endpoint{URL: case3URLs[2], IsLocal: case3LocalFlags[2], HostName: "example.com"}, - Endpoint{URL: case3URLs[3], IsLocal: case3LocalFlags[3], HostName: "example.net"}, + {":80", [][]string{{case3Endpoint1, "http://example.org:9000/d2", "http://example.com/d3", "http://example.net/d4"}}, ":80", Endpoints{ + Endpoint{URL: case3URLs[0], IsLocal: case3LocalFlags[0]}, + Endpoint{URL: case3URLs[1], IsLocal: case3LocalFlags[1]}, + Endpoint{URL: case3URLs[2], IsLocal: case3LocalFlags[2]}, + Endpoint{URL: case3URLs[3], IsLocal: case3LocalFlags[3]}, }, DistXLSetupType, nil}, - {":9000", [][]string{{case4Endpoint1, "http://example.org/d2", "http://example.com/d3", "http://example.net/d4"}}, ":9000", EndpointList{ - Endpoint{URL: case4URLs[0], IsLocal: case4LocalFlags[0], HostName: nonLoopBackIP}, - Endpoint{URL: case4URLs[1], IsLocal: case4LocalFlags[1], HostName: "example.org"}, - Endpoint{URL: case4URLs[2], IsLocal: case4LocalFlags[2], HostName: "example.com"}, - Endpoint{URL: case4URLs[3], IsLocal: case4LocalFlags[3], HostName: "example.net"}, + {":9000", [][]string{{case4Endpoint1, "http://example.org/d2", "http://example.com/d3", "http://example.net/d4"}}, ":9000", Endpoints{ + Endpoint{URL: case4URLs[0], IsLocal: case4LocalFlags[0]}, + Endpoint{URL: case4URLs[1], IsLocal: case4LocalFlags[1]}, + Endpoint{URL: case4URLs[2], IsLocal: case4LocalFlags[2]}, + Endpoint{URL: case4URLs[3], IsLocal: case4LocalFlags[3]}, }, DistXLSetupType, nil}, - {":9000", 
[][]string{{case5Endpoint1, case5Endpoint2, case5Endpoint3, case5Endpoint4}}, ":9000", EndpointList{ - Endpoint{URL: case5URLs[0], IsLocal: case5LocalFlags[0], HostName: nonLoopBackIP}, - Endpoint{URL: case5URLs[1], IsLocal: case5LocalFlags[1], HostName: nonLoopBackIP}, - Endpoint{URL: case5URLs[2], IsLocal: case5LocalFlags[2], HostName: nonLoopBackIP}, - Endpoint{URL: case5URLs[3], IsLocal: case5LocalFlags[3], HostName: nonLoopBackIP}, + {":9000", [][]string{{case5Endpoint1, case5Endpoint2, case5Endpoint3, case5Endpoint4}}, ":9000", Endpoints{ + Endpoint{URL: case5URLs[0], IsLocal: case5LocalFlags[0]}, + Endpoint{URL: case5URLs[1], IsLocal: case5LocalFlags[1]}, + Endpoint{URL: case5URLs[2], IsLocal: case5LocalFlags[2]}, + Endpoint{URL: case5URLs[3], IsLocal: case5LocalFlags[3]}, }, DistXLSetupType, nil}, // DistXL Setup using only local host. - {":9003", [][]string{{"http://localhost:9000/d1", "http://localhost:9001/d2", "http://127.0.0.1:9002/d3", case6Endpoint}}, ":9003", EndpointList{ - Endpoint{URL: case6URLs[0], IsLocal: case6LocalFlags[0], HostName: "localhost"}, - Endpoint{URL: case6URLs[1], IsLocal: case6LocalFlags[1], HostName: "localhost"}, - Endpoint{URL: case6URLs[2], IsLocal: case6LocalFlags[2], HostName: "127.0.0.1"}, - Endpoint{URL: case6URLs[3], IsLocal: case6LocalFlags[3], HostName: nonLoopBackIP}, + {":9003", [][]string{{"http://localhost:9000/d1", "http://localhost:9001/d2", "http://127.0.0.1:9002/d3", case6Endpoint}}, ":9003", Endpoints{ + Endpoint{URL: case6URLs[0], IsLocal: case6LocalFlags[0]}, + Endpoint{URL: case6URLs[1], IsLocal: case6LocalFlags[1]}, + Endpoint{URL: case6URLs[2], IsLocal: case6LocalFlags[2]}, + Endpoint{URL: case6URLs[3], IsLocal: case6LocalFlags[3]}, }, DistXLSetupType, nil}, } - for i, testCase := range testCases { + for _, testCase := range testCases { testCase := testCase - t.Run(fmt.Sprintf("Test%d", i+1), func(t *testing.T) { - serverAddr, endpoints, setupType, err := CreateEndpoints(testCase.serverAddr, testCase.args...) + t.Run("", func(t *testing.T) { + endpoints, setupType, err := CreateEndpoints(testCase.serverAddr, testCase.args...) 
+ if err == nil && testCase.expectedErr != nil { + t.Errorf("error: expected = %v, got = ", testCase.expectedErr) + } if err == nil { - if testCase.expectedErr != nil { - t.Fatalf("error: expected = %v, got = ", testCase.expectedErr) - } else { - if serverAddr != testCase.expectedServerAddr { - t.Fatalf("serverAddr: expected = %v, got = %v", testCase.expectedServerAddr, serverAddr) - } - if !reflect.DeepEqual(endpoints, testCase.expectedEndpoints) { - t.Fatalf("endpoints: expected = %v, got = %v", testCase.expectedEndpoints, endpoints) - } - if setupType != testCase.expectedSetupType { - t.Fatalf("setupType: expected = %v, got = %v", testCase.expectedSetupType, setupType) - } + if !reflect.DeepEqual(endpoints, testCase.expectedEndpoints) { + t.Errorf("endpoints: expected = %v, got = %v", testCase.expectedEndpoints, endpoints) } - } else if testCase.expectedErr == nil { - t.Fatalf("error: expected = , got = %v", err) - } else if err.Error() != testCase.expectedErr.Error() { - t.Fatalf("error: expected = %v, got = %v", testCase.expectedErr, err) + if setupType != testCase.expectedSetupType { + t.Errorf("setupType: expected = %v, got = %v", testCase.expectedSetupType, setupType) + } + } + if err != nil && testCase.expectedErr == nil { + t.Errorf("error: expected = , got = %v, testCase: %v", err, testCase) } }) } @@ -403,13 +395,13 @@ func TestGetLocalPeer(t *testing.T) { } for i, testCase := range testCases { - endpoints, _ := NewEndpointList(testCase.endpointArgs...) - if !endpoints[0].IsLocal { - if err := endpoints.UpdateIsLocal(); err != nil { + zendpoints := mustGetZoneEndpoints(testCase.endpointArgs...) + if !zendpoints[0].Endpoints[0].IsLocal { + if err := zendpoints[0].Endpoints.UpdateIsLocal(); err != nil { t.Fatalf("error: expected = , got = %v", err) } } - remotePeer := GetLocalPeer(endpoints) + remotePeer := GetLocalPeer(zendpoints) if remotePeer != testCase.expectedResult { t.Fatalf("Test %d: expected: %v, got: %v", i+1, testCase.expectedResult, remotePeer) } @@ -435,13 +427,13 @@ func TestGetRemotePeers(t *testing.T) { } for _, testCase := range testCases { - endpoints, _ := NewEndpointList(testCase.endpointArgs...) - if !endpoints[0].IsLocal { - if err := endpoints.UpdateIsLocal(); err != nil { + zendpoints := mustGetZoneEndpoints(testCase.endpointArgs...) + if !zendpoints[0].Endpoints[0].IsLocal { + if err := zendpoints[0].Endpoints.UpdateIsLocal(); err != nil { t.Fatalf("error: expected = , got = %v", err) } } - remotePeers := GetRemotePeers(endpoints) + remotePeers := GetRemotePeers(zendpoints) if !reflect.DeepEqual(remotePeers, testCase.expectedResult) { t.Fatalf("expected: %v, got: %v", testCase.expectedResult, remotePeers) } diff --git a/cmd/format-xl.go b/cmd/format-xl.go index 0089de4cb..2aa2f9d3f 100644 --- a/cmd/format-xl.go +++ b/cmd/format-xl.go @@ -471,7 +471,7 @@ func formatXLGetDeploymentID(refFormat *formatXLV3, formats []*formatXLV3) (stri } // formatXLFixDeploymentID - Add deployment id if it is not present. -func formatXLFixDeploymentID(endpoints EndpointList, storageDisks []StorageAPI, refFormat *formatXLV3) (err error) { +func formatXLFixDeploymentID(endpoints Endpoints, storageDisks []StorageAPI, refFormat *formatXLV3) (err error) { // Attempt to load all `format.json` from all disks. var sErrs []error formats, sErrs := loadFormatXLAll(storageDisks) @@ -515,7 +515,7 @@ func formatXLFixDeploymentID(endpoints EndpointList, storageDisks []StorageAPI, } // Update only the valid local disks which have not been updated before. 
-func formatXLFixLocalDeploymentID(endpoints EndpointList, storageDisks []StorageAPI, refFormat *formatXLV3) error { +func formatXLFixLocalDeploymentID(endpoints Endpoints, storageDisks []StorageAPI, refFormat *formatXLV3) error { // If this server was down when the deploymentID was updated // then we make sure that we update the local disks with the deploymentID. for index, storageDisk := range storageDisks { @@ -655,7 +655,7 @@ func closeStorageDisks(storageDisks []StorageAPI) { // Initialize storage disks for each endpoint. // Errors are returned for each endpoint with matching index. -func initStorageDisksWithErrors(endpoints EndpointList) ([]StorageAPI, []error) { +func initStorageDisksWithErrors(endpoints Endpoints) ([]StorageAPI, []error) { // Bootstrap disks. storageDisks := make([]StorageAPI, len(endpoints)) g := errgroup.WithNErrs(len(endpoints)) @@ -695,7 +695,7 @@ func formatXLV3ThisEmpty(formats []*formatXLV3) bool { } // fixFormatXLV3 - fix format XL configuration on all disks. -func fixFormatXLV3(storageDisks []StorageAPI, endpoints EndpointList, formats []*formatXLV3) error { +func fixFormatXLV3(storageDisks []StorageAPI, endpoints Endpoints, formats []*formatXLV3) error { for i, format := range formats { if format == nil || !endpoints[i].IsLocal { continue diff --git a/cmd/format-xl_test.go b/cmd/format-xl_test.go index efa1e27fc..f88f1577f 100644 --- a/cmd/format-xl_test.go +++ b/cmd/format-xl_test.go @@ -83,7 +83,7 @@ func TestFixFormatV3(t *testing.T) { for _, xlDir := range xlDirs { defer os.RemoveAll(xlDir) } - endpoints := mustGetNewEndpointList(xlDirs...) + endpoints := mustGetNewEndpoints(xlDirs...) storageDisks, errs := initStorageDisksWithErrors(endpoints) for _, err := range errs { @@ -593,7 +593,7 @@ func benchmarkInitStorageDisksN(b *testing.B, nDisks int) { if err != nil { b.Fatal(err) } - endpoints := mustGetNewEndpointList(fsDirs...) + endpoints := mustGetNewEndpoints(fsDirs...) 
b.RunParallel(func(pb *testing.PB) { endpoints := endpoints for pb.Next() { diff --git a/cmd/global-heal.go b/cmd/global-heal.go index 397007805..52290abd1 100644 --- a/cmd/global-heal.go +++ b/cmd/global-heal.go @@ -80,16 +80,6 @@ func getLocalBackgroundHealStatus() madmin.BgHealState { // healErasureSet lists and heals all objects in a specific erasure set func healErasureSet(ctx context.Context, setIndex int, xlObj *xlObjects) error { - // Hold a lock for healing the erasure set - zeroDuration := time.Millisecond - zeroDynamicTimeout := newDynamicTimeout(zeroDuration, zeroDuration) - erasureSetHealLock := xlObj.nsMutex.NewNSLock(ctx, xlObj.getLockers(), - "system", fmt.Sprintf("erasure-set-heal-%d", setIndex)) - if err := erasureSetHealLock.GetLock(zeroDynamicTimeout); err != nil { - return err - } - defer erasureSetHealLock.Unlock() - buckets, err := xlObj.ListBuckets(ctx) if err != nil { return err @@ -123,11 +113,11 @@ func healErasureSet(ctx context.Context, setIndex int, xlObj *xlObjects) error { } // Healing leader will take the charge of healing all erasure sets -func execLeaderTasks(sets *xlSets) { +func execLeaderTasks(z *xlZones) { ctx := context.Background() // Hold a lock so only one server performs auto-healing - leaderLock := sets.NewNSLock(ctx, minioMetaBucket, "leader") + leaderLock := z.NewNSLock(ctx, minioMetaBucket, "leader") for { err := leaderLock.GetLock(leaderLockTimeout) if err == nil { @@ -136,18 +126,30 @@ func execLeaderTasks(sets *xlSets) { time.Sleep(leaderTick) } + // Hold a lock for healing the erasure set + zeroDuration := time.Millisecond + zeroDynamicTimeout := newDynamicTimeout(zeroDuration, zeroDuration) + lastScanTime := time.Now() // So that we don't heal immediately, but after one month. for { if time.Since(lastScanTime) < healInterval { time.Sleep(healTick) continue } - // Heal set by set - for i, set := range sets.sets { - err := healErasureSet(ctx, i, set) - if err != nil { - logger.LogIf(ctx, err) - continue + for _, zone := range z.zones { + // Heal set by set + for i, set := range zone.sets { + setLock := z.zones[0].NewNSLock(ctx, "system", fmt.Sprintf("erasure-set-heal-%d", i)) + if err := setLock.GetLock(zeroDynamicTimeout); err != nil { + logger.LogIf(ctx, err) + continue + } + if err := healErasureSet(ctx, i, set); err != nil { + setLock.Unlock() + logger.LogIf(ctx, err) + continue + } + setLock.Unlock() } } lastScanTime = time.Now() @@ -165,12 +167,12 @@ func startGlobalHeal() { break } - sets, ok := objAPI.(*xlSets) + zones, ok := objAPI.(*xlZones) if !ok { return } - execLeaderTasks(sets) + execLeaderTasks(zones) } func initGlobalHeal() { diff --git a/cmd/globals.go b/cmd/globals.go index 59f5612aa..ea7d57700 100644 --- a/cmd/globals.go +++ b/cmd/globals.go @@ -103,9 +103,6 @@ var globalCLIContext = struct { }{} var ( - // Indicates the total number of erasure coded sets configured. - globalXLSetCount int - // Indicates set drive count. globalXLSetDriveCount int @@ -130,9 +127,6 @@ var ( // This flag is set to 'us-east-1' by default globalServerRegion = globalMinioDefaultRegion - // Maximum size of internal objects parts - globalPutPartSize = int64(64 * 1024 * 1024) - // MinIO local server address (in `host:port` format) globalMinioAddr = "" // MinIO default port, can be changed through command line. 
@@ -173,7 +167,7 @@ var ( // registered listeners globalConsoleSys *HTTPConsoleLoggerSys - globalEndpoints EndpointList + globalEndpoints EndpointZones // Global server's network statistics globalConnStats = newConnStats() diff --git a/cmd/healthcheck-handler.go b/cmd/healthcheck-handler.go index 842d25959..2f16fb200 100644 --- a/cmd/healthcheck-handler.go +++ b/cmd/healthcheck-handler.go @@ -77,26 +77,26 @@ func LivenessCheckHandler(w http.ResponseWriter, r *http.Request) { } // For FS and Erasure backend, check if local disks are up. - var totalLocalDisks int var erroredDisks int - for _, endpoint := range globalEndpoints { - // Check only if local disks are accessible, we do not have - // to reach to rest of the other servers in a distributed setup. - if endpoint.IsLocal { - totalLocalDisks++ + for _, ep := range globalEndpoints { + for _, endpoint := range ep.Endpoints { + // Check only if local disks are accessible, we do not have + // to reach to rest of the other servers in a distributed setup. + if !endpoint.IsLocal { + continue + } // Attempt a stat to backend, any error resulting // from this Stat() operation is considered as backend // is not available, count them as errors. - if _, err := os.Stat(endpoint.Path); err != nil { + if _, err := os.Stat(endpoint.Path); err != nil && os.IsNotExist(err) { logger.LogIf(ctx, err) erroredDisks++ } } } - // If all exported local disks have errored, we simply let kubernetes - // take us down. - if totalLocalDisks == erroredDisks { + // Any errored disks, we let orchestrators take us down. + if erroredDisks > 0 { writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone) return } diff --git a/cmd/local-locker.go b/cmd/local-locker.go index 738699e94..acc37c2ff 100644 --- a/cmd/local-locker.go +++ b/cmd/local-locker.go @@ -65,6 +65,10 @@ func (d *errorLocker) Close() error { return nil } +func (d *errorLocker) IsOnline() bool { + return false +} + // localLocker implements Dsync.NetLocker type localLocker struct { mutex sync.Mutex @@ -193,6 +197,11 @@ func (l *localLocker) Close() error { return nil } +// Local locker is always online. +func (l *localLocker) IsOnline() bool { + return true +} + func newLocker(endpoint Endpoint) *localLocker { return &localLocker{ endpoint: endpoint, diff --git a/cmd/lock-rest-server.go b/cmd/lock-rest-server.go index 24df5aa14..50a61c5e5 100644 --- a/cmd/lock-rest-server.go +++ b/cmd/lock-rest-server.go @@ -128,24 +128,26 @@ func (l *lockRESTServer) RUnlockHandler(w http.ResponseWriter, r *http.Request) } // registerLockRESTHandlers - register lock rest router. -func registerLockRESTHandlers(router *mux.Router, endpoints EndpointList) { +func registerLockRESTHandlers(router *mux.Router, endpointZones EndpointZones) { queries := restQueries(lockRESTUID, lockRESTSource, lockRESTResource) - for _, endpoint := range endpoints { - if !endpoint.IsLocal { - continue + for _, ep := range endpointZones { + for _, endpoint := range ep.Endpoints { + if !endpoint.IsLocal { + continue + } + + lockServer := &lockRESTServer{ + ll: newLocker(endpoint), + } + + subrouter := router.PathPrefix(path.Join(lockRESTPrefix, endpoint.Path)).Subrouter() + subrouter.Methods(http.MethodPost).Path(lockRESTVersionPrefix + lockRESTMethodLock).HandlerFunc(httpTraceHdrs(lockServer.LockHandler)).Queries(queries...) + subrouter.Methods(http.MethodPost).Path(lockRESTVersionPrefix + lockRESTMethodRLock).HandlerFunc(httpTraceHdrs(lockServer.RLockHandler)).Queries(queries...) 
+ subrouter.Methods(http.MethodPost).Path(lockRESTVersionPrefix + lockRESTMethodUnlock).HandlerFunc(httpTraceHdrs(lockServer.UnlockHandler)).Queries(queries...) + subrouter.Methods(http.MethodPost).Path(lockRESTVersionPrefix + lockRESTMethodRUnlock).HandlerFunc(httpTraceHdrs(lockServer.RUnlockHandler)).Queries(queries...) + + globalLockServers[endpoint] = lockServer.ll } - - lockServer := &lockRESTServer{ - ll: newLocker(endpoint), - } - - subrouter := router.PathPrefix(path.Join(lockRESTPrefix, endpoint.Path)).Subrouter() - subrouter.Methods(http.MethodPost).Path(lockRESTVersionPrefix + lockRESTMethodLock).HandlerFunc(httpTraceHdrs(lockServer.LockHandler)).Queries(queries...) - subrouter.Methods(http.MethodPost).Path(lockRESTVersionPrefix + lockRESTMethodRLock).HandlerFunc(httpTraceHdrs(lockServer.RLockHandler)).Queries(queries...) - subrouter.Methods(http.MethodPost).Path(lockRESTVersionPrefix + lockRESTMethodUnlock).HandlerFunc(httpTraceHdrs(lockServer.UnlockHandler)).Queries(queries...) - subrouter.Methods(http.MethodPost).Path(lockRESTVersionPrefix + lockRESTMethodRUnlock).HandlerFunc(httpTraceHdrs(lockServer.RUnlockHandler)).Queries(queries...) - - globalLockServers[endpoint] = lockServer.ll } // If none of the routes match add default error handler routes diff --git a/cmd/metrics.go b/cmd/metrics.go index 4be5b05cb..b3cc8960c 100644 --- a/cmd/metrics.go +++ b/cmd/metrics.go @@ -90,11 +90,13 @@ func (c *minioCollector) Collect(ch chan<- prometheus.Metric) { } storageAPIs := []StorageAPI{} - for _, endpoint := range globalEndpoints { - if endpoint.IsLocal { - // Construct storageAPIs. - sAPI, _ := newStorageAPI(endpoint) - storageAPIs = append(storageAPIs, sAPI) + for _, ep := range globalEndpoints { + for _, endpoint := range ep.Endpoints { + if endpoint.IsLocal { + // Construct storageAPIs. + sAPI, _ := newStorageAPI(endpoint) + storageAPIs = append(storageAPIs, sAPI) + } } } diff --git a/cmd/namespace-lock.go b/cmd/namespace-lock.go index e62d18423..501ecbfbe 100644 --- a/cmd/namespace-lock.go +++ b/cmd/namespace-lock.go @@ -223,14 +223,12 @@ type localLockInstance struct { // NewNSLock - returns a lock instance for a given volume and // path. The returned lockInstance object encapsulates the nsLockMap, // volume, path and operation ID. -func (n *nsLockMap) NewNSLock(ctx context.Context, lockers []dsync.NetLocker, volume, path string) RWLocker { +func (n *nsLockMap) NewNSLock(ctx context.Context, lockersFn func() []dsync.NetLocker, volume, path string) RWLocker { opsID := mustGetUUID() if n.isDistXL { - sync, err := dsync.New(lockers) - if err != nil { - logger.CriticalIf(ctx, err) - } - return &distLockInstance{dsync.NewDRWMutex(ctx, pathJoin(volume, path), sync), volume, path, opsID} + return &distLockInstance{dsync.NewDRWMutex(ctx, pathJoin(volume, path), &dsync.Dsync{ + GetLockersFn: lockersFn, + }), volume, path, opsID} } return &localLockInstance{ctx, n, volume, path, opsID} } diff --git a/cmd/net.go b/cmd/net.go index 52154622a..b2b9ed8fb 100644 --- a/cmd/net.go +++ b/cmd/net.go @@ -120,10 +120,10 @@ func (n byLastOctetValue) Less(i, j int) bool { // This case is needed when all ips in the list // have same last octets, Following just ensures that // 127.0.0.1 is moved to the end of the list. 
- if n[i].String() == "127.0.0.1" { + if n[i].IsLoopback() { return false } - if n[j].String() == "127.0.0.1" { + if n[j].IsLoopback() { return true } return []byte(n[i].To4())[3] > []byte(n[j].To4())[3] @@ -171,7 +171,8 @@ func getAPIEndpoints() (apiEndpoints []string) { } for _, ip := range ipList { - apiEndpoints = append(apiEndpoints, fmt.Sprintf("%s://%s", getURLScheme(globalIsSSL), net.JoinHostPort(ip, globalMinioPort))) + endpoint := fmt.Sprintf("%s://%s", getURLScheme(globalIsSSL), net.JoinHostPort(ip, globalMinioPort)) + apiEndpoints = append(apiEndpoints, endpoint) } return apiEndpoints diff --git a/cmd/notification.go b/cmd/notification.go index 3d5631bf4..2e404a3be 100644 --- a/cmd/notification.go +++ b/cmd/notification.go @@ -697,17 +697,6 @@ func (sys *NotificationSys) initListeners(ctx context.Context, objAPI ObjectLaye // Construct path to listener.json for the given bucket. configFile := path.Join(bucketConfigPrefix, bucketName, bucketListenerConfig) - transactionConfigFile := configFile + "/transaction.lock" - - // As object layer's GetObject() and PutObject() take respective lock on minioMetaBucket - // and configFile, take a transaction lock to avoid data race between readConfig() - // and saveConfig(). - objLock := objAPI.NewNSLock(ctx, minioMetaBucket, transactionConfigFile) - if err := objLock.GetRLock(globalOperationTimeout); err != nil { - return err - } - defer objLock.RUnlock() - configData, e := readConfig(ctx, objAPI, configFile) if e != nil && !IsErrIgnored(e, errDiskNotFound, errConfigNotFound) { return e @@ -1180,7 +1169,7 @@ func (sys *NotificationSys) NetworkInfo() []madmin.ServerNetworkHardwareInfo { } // NewNotificationSys - creates new notification system object. -func NewNotificationSys(endpoints EndpointList) *NotificationSys { +func NewNotificationSys(endpoints EndpointZones) *NotificationSys { // bucketRulesMap/bucketRemoteTargetRulesMap are initialized by NotificationSys.Init() return &NotificationSys{ targetList: event.NewTargetList(), @@ -1338,16 +1327,6 @@ func SaveListener(objAPI ObjectLayer, bucketName string, eventNames []event.Name // Construct path to listener.json for the given bucket. configFile := path.Join(bucketConfigPrefix, bucketName, bucketListenerConfig) - transactionConfigFile := configFile + "/transaction.lock" - - // As object layer's GetObject() and PutObject() take respective lock on minioMetaBucket - // and configFile, take a transaction lock to avoid data race between readConfig() - // and saveConfig(). - objLock := objAPI.NewNSLock(ctx, minioMetaBucket, transactionConfigFile) - if err := objLock.GetLock(globalOperationTimeout); err != nil { - return err - } - defer objLock.Unlock() configData, err := readConfig(ctx, objAPI, configFile) if err != nil && !IsErrIgnored(err, errDiskNotFound, errConfigNotFound) { @@ -1389,17 +1368,6 @@ func RemoveListener(objAPI ObjectLayer, bucketName string, targetID event.Target // Construct path to listener.json for the given bucket. configFile := path.Join(bucketConfigPrefix, bucketName, bucketListenerConfig) - transactionConfigFile := configFile + "/transaction.lock" - - // As object layer's GetObject() and PutObject() take respective lock on minioMetaBucket - // and configFile, take a transaction lock to avoid data race between readConfig() - // and saveConfig(). 
- objLock := objAPI.NewNSLock(ctx, minioMetaBucket, transactionConfigFile) - if err := objLock.GetLock(globalOperationTimeout); err != nil { - return err - } - defer objLock.Unlock() - configData, err := readConfig(ctx, objAPI, configFile) if err != nil && !IsErrIgnored(err, errDiskNotFound, errConfigNotFound) { return err diff --git a/cmd/object-api-datatypes.go b/cmd/object-api-datatypes.go index 91830160b..bc09b244c 100644 --- a/cmd/object-api-datatypes.go +++ b/cmd/object-api-datatypes.go @@ -170,6 +170,16 @@ type ListPartsInfo struct { EncodingType string // Not supported yet. } +// Lookup - returns if uploadID is valid +func (lm ListMultipartsInfo) Lookup(uploadID string) bool { + for _, upload := range lm.Uploads { + if upload.UploadID == uploadID { + return true + } + } + return false +} + // ListMultipartsInfo - represnets bucket resources for incomplete multipart uploads. type ListMultipartsInfo struct { // Together with upload-id-marker, this parameter specifies the multipart upload diff --git a/cmd/object-api-errors.go b/cmd/object-api-errors.go index 35695a0db..09ae594bf 100644 --- a/cmd/object-api-errors.go +++ b/cmd/object-api-errors.go @@ -427,6 +427,12 @@ func (e BackendDown) Error() string { return "Backend down" } +// isErrBucketNotFound - Check if error type is BucketNotFound. +func isErrBucketNotFound(err error) bool { + _, ok := err.(BucketNotFound) + return ok +} + // isErrObjectNotFound - Check if error type is ObjectNotFound. func isErrObjectNotFound(err error) bool { _, ok := err.(ObjectNotFound) diff --git a/cmd/object-api-input-checks.go b/cmd/object-api-input-checks.go index 3e02fc7f3..5457d0964 100644 --- a/cmd/object-api-input-checks.go +++ b/cmd/object-api-input-checks.go @@ -74,15 +74,6 @@ func checkListObjsArgs(ctx context.Context, bucket, prefix, marker, delimiter st Object: prefix, } } - // Verify if delimiter is anything other than '/', which we do not support. - if delimiter != "" && delimiter != SlashSeparator { - logger.LogIf(ctx, UnsupportedDelimiter{ - Delimiter: delimiter, - }) - return UnsupportedDelimiter{ - Delimiter: delimiter, - } - } // Verify if marker has prefix. if marker != "" && !hasPrefix(marker, prefix) { logger.LogIf(ctx, InvalidMarkerPrefixCombination{ diff --git a/cmd/object-api-multipart_test.go b/cmd/object-api-multipart_test.go index 1386da39d..644ecc0f7 100644 --- a/cmd/object-api-multipart_test.go +++ b/cmd/object-api-multipart_test.go @@ -1128,10 +1128,9 @@ func testListMultipartUploads(obj ObjectLayer, instanceType string, t TestErrHan {"volatile-bucket-1", "", "", "", "", 0, ListMultipartsInfo{}, BucketNotFound{Bucket: "volatile-bucket-1"}, false}, {"volatile-bucket-2", "", "", "", "", 0, ListMultipartsInfo{}, BucketNotFound{Bucket: "volatile-bucket-2"}, false}, {"volatile-bucket-3", "", "", "", "", 0, ListMultipartsInfo{}, BucketNotFound{Bucket: "volatile-bucket-3"}, false}, - // Valid, existing bucket, but sending invalid delimeter values (Test number 8-9). - // Empty string < "" > and forward slash < / > are the ony two valid arguments for delimeter. - {bucketNames[0], "", "", "", "*", 0, ListMultipartsInfo{}, fmt.Errorf("delimiter '%s' is not supported", "*"), false}, - {bucketNames[0], "", "", "", "-", 0, ListMultipartsInfo{}, fmt.Errorf("delimiter '%s' is not supported", "-"), false}, + // Valid, existing bucket, delimiter not supported, returns empty values (Test number 8-9). 
+ {bucketNames[0], "", "", "", "*", 0, ListMultipartsInfo{Delimiter: "*"}, nil, true}, + {bucketNames[0], "", "", "", "-", 0, ListMultipartsInfo{Delimiter: "-"}, nil, true}, // Testing for failure cases with both perfix and marker (Test number 10). // The prefix and marker combination to be valid it should satisfy strings.HasPrefix(marker, prefix). {bucketNames[0], "asia", "europe-object", "", "", 0, ListMultipartsInfo{}, @@ -1193,9 +1192,6 @@ func testListMultipartUploads(obj ObjectLayer, instanceType string, t TestErrHan {bucketNames[1], "Asia", "", "", "", 10, listMultipartResults[23], nil, true}, // Test case with `Prefix` and `UploadIDMarker` (Test number 37). {bucketNames[1], "min", "minio-object-1.txt", uploadIDs[1], "", 10, listMultipartResults[24], nil, true}, - // Test case with `KeyMarker` and `UploadIDMarker` (Test number 38). - // {bucketNames[1], "", "minio-object-1.txt", uploadIDs[1], "", 10, listMultipartResults[24], nil, true}, - // Test case for bucket with multiple objects in it. // Bucket used : `bucketNames[2]`. // Objects used: `objectNames[1-5]`. @@ -1217,16 +1213,10 @@ func testListMultipartUploads(obj ObjectLayer, instanceType string, t TestErrHan // Since all available entries are listed, IsTruncated is expected to be false // and NextMarkers are expected to empty. {bucketNames[2], "", "", "", "", 6, listMultipartResults[31], nil, true}, - // Test case with `uploadIDMarker` (Test number 46). - // {bucketNames[2], "", "", uploadIDs[6], "", 10, listMultipartResults[32], nil, true}, // Test case with `KeyMarker` (Test number 47). {bucketNames[2], "", objectNames[3], "", "", 10, listMultipartResults[33], nil, true}, // Test case with `prefix` and `KeyMarker` (Test number 48). {bucketNames[2], "minio-object", objectNames[1], "", "", 10, listMultipartResults[34], nil, true}, - // Test case with `prefix` and `uploadIDMarker` (Test number 49). - // {bucketNames[2], globalMinioDefaultOwnerID, "", uploadIDs[4], "", 10, listMultipartResults[35], nil, true}, - // Test case with `KeyMarker` and `uploadIDMarker` (Test number 50). - // {bucketNames[2], "minio-object.txt", "", uploadIDs[5], "", 10, listMultipartResults[36], nil, true}, } for i, testCase := range testCases { diff --git a/cmd/peer-rest-client.go b/cmd/peer-rest-client.go index 32d73c093..851a5f3e8 100644 --- a/cmd/peer-rest-client.go +++ b/cmd/peer-rest-client.go @@ -708,9 +708,9 @@ func (client *peerRESTClient) ConsoleLog(logCh chan interface{}, doneCh chan str }() } -func getRemoteHosts(endpoints EndpointList) []*xnet.Host { +func getRemoteHosts(endpointZones EndpointZones) []*xnet.Host { var remoteHosts []*xnet.Host - for _, hostStr := range GetRemotePeers(endpoints) { + for _, hostStr := range GetRemotePeers(endpointZones) { host, err := xnet.ParseHost(hostStr) if err != nil { logger.LogIf(context.Background(), err) @@ -722,7 +722,7 @@ func getRemoteHosts(endpoints EndpointList) []*xnet.Host { return remoteHosts } -func getRestClients(endpoints EndpointList) []*peerRESTClient { +func getRestClients(endpoints EndpointZones) []*peerRESTClient { peerHosts := getRemoteHosts(endpoints) restClients := make([]*peerRESTClient, len(peerHosts)) for i, host := range peerHosts { diff --git a/cmd/prepare-storage.go b/cmd/prepare-storage.go index d1a2d6eaf..5f6bc1eb9 100644 --- a/cmd/prepare-storage.go +++ b/cmd/prepare-storage.go @@ -53,7 +53,7 @@ var printEndpointError = func() func(Endpoint, error) { }() // Migrates backend format of local disks. 
-func formatXLMigrateLocalEndpoints(endpoints EndpointList) error { +func formatXLMigrateLocalEndpoints(endpoints Endpoints) error { g := errgroup.WithNErrs(len(endpoints)) for index, endpoint := range endpoints { if !endpoint.IsLocal { @@ -81,7 +81,7 @@ func formatXLMigrateLocalEndpoints(endpoints EndpointList) error { } // Cleans up tmp directory of local disks. -func formatXLCleanupTmpLocalEndpoints(endpoints EndpointList) error { +func formatXLCleanupTmpLocalEndpoints(endpoints Endpoints) error { g := errgroup.WithNErrs(len(endpoints)) for index, endpoint := range endpoints { if !endpoint.IsLocal { @@ -145,7 +145,7 @@ func formatXLCleanupTmpLocalEndpoints(endpoints EndpointList) error { } // validate reference format against list of XL formats. -func validateXLFormats(format *formatXLV3, formats []*formatXLV3, endpoints EndpointList, setCount, drivesPerSet int) error { +func validateXLFormats(format *formatXLV3, formats []*formatXLV3, endpoints Endpoints, setCount, drivesPerSet int) error { for i := range formats { if formats[i] == nil { continue @@ -174,7 +174,7 @@ var errXLV3ThisEmpty = fmt.Errorf("XL format version 3 has This field empty") // connect to list of endpoints and load all XL disk formats, validate the formats are correct // and are in quorum, if no formats are found attempt to initialize all of them for the first // time. additionally make sure to close all the disks used in this attempt. -func connectLoadInitFormats(retryCount int, firstDisk bool, endpoints EndpointList, setCount, drivesPerSet int) (*formatXLV3, error) { +func connectLoadInitFormats(retryCount int, firstDisk bool, endpoints Endpoints, setCount, drivesPerSet int) (*formatXLV3, error) { // Initialize all storage disks storageDisks, errs := initStorageDisksWithErrors(endpoints) defer closeStorageDisks(storageDisks) @@ -286,7 +286,7 @@ func connectLoadInitFormats(retryCount int, firstDisk bool, endpoints EndpointLi } // Format disks before initialization of object layer. -func waitForFormatXL(firstDisk bool, endpoints EndpointList, setCount, disksPerSet int) (format *formatXLV3, err error) { +func waitForFormatXL(firstDisk bool, endpoints Endpoints, setCount, disksPerSet int) (format *formatXLV3, err error) { if len(endpoints) == 0 || setCount == 0 || disksPerSet == 0 { return nil, errInvalidArgument } diff --git a/cmd/routers.go b/cmd/routers.go index 1f8398e5d..491912ee3 100644 --- a/cmd/routers.go +++ b/cmd/routers.go @@ -23,15 +23,15 @@ import ( ) // Composed function registering routers for only distributed XL setup. -func registerDistXLRouters(router *mux.Router, endpoints EndpointList) { +func registerDistXLRouters(router *mux.Router, endpointZones EndpointZones) { // Register storage rpc router only if its a distributed setup. - registerStorageRESTHandlers(router, endpoints) + registerStorageRESTHandlers(router, endpointZones) // Register peer REST router only if its a distributed setup. registerPeerRESTHandlers(router) // Register distributed namespace lock. - registerLockRESTHandlers(router, endpoints) + registerLockRESTHandlers(router, endpointZones) } @@ -79,14 +79,14 @@ var globalHandlers = []HandlerFunc{ } // configureServer handler returns final handler for the http server. -func configureServerHandler(endpoints EndpointList) (http.Handler, error) { +func configureServerHandler(endpointZones EndpointZones) (http.Handler, error) { // Initialize router. 
`SkipClean(true)` stops gorilla/mux from // normalizing URL path minio/minio#3256 router := mux.NewRouter().SkipClean(true) // Initialize distributed NS lock. if globalIsDistXL { - registerDistXLRouters(router, endpoints) + registerDistXLRouters(router, endpointZones) } // Add STS router always. diff --git a/cmd/server-main.go b/cmd/server-main.go index 63ff890e7..4720608b1 100644 --- a/cmd/server-main.go +++ b/cmd/server-main.go @@ -146,12 +146,13 @@ func serverHandleCmdArgs(ctx *cli.Context) { endpoints := strings.Fields(env.Get(config.EnvEndpoints, "")) if len(endpoints) > 0 { - globalMinioAddr, globalEndpoints, setupType, globalXLSetCount, globalXLSetDriveCount, err = createServerEndpoints(globalCLIContext.Addr, endpoints...) + globalEndpoints, setupType, err = createServerEndpoints(globalCLIContext.Addr, endpoints...) } else { - globalMinioAddr, globalEndpoints, setupType, globalXLSetCount, globalXLSetDriveCount, err = createServerEndpoints(globalCLIContext.Addr, ctx.Args()...) + globalEndpoints, setupType, err = createServerEndpoints(globalCLIContext.Addr, ctx.Args()...) } logger.FatalIf(err, "Invalid command line arguments") + globalMinioAddr = globalCLIContext.Addr logger.LogIf(context.Background(), checkEndpointsSubOptimal(ctx, setupType, globalEndpoints)) globalMinioHost, globalMinioPort = mustSplitHostPort(globalMinioAddr) @@ -192,7 +193,24 @@ func newAllSubsystems() { } func initSafeModeInit(buckets []BucketInfo) (err error) { + newObject := newObjectLayerWithoutSafeModeFn() + + // Construct path to config/transaction.lock for locking + transactionConfigPrefix := minioConfigPrefix + "/transaction.lock" + + // Make sure to hold lock for entire migration to avoid + // such that only one server should migrate the entire config + // at a given time, this big transaction lock ensures this + // appropriately. This is also true for rotation of encrypted + // content. + objLock := newObject.NewNSLock(context.Background(), minioMetaBucket, transactionConfigPrefix) + if err = objLock.GetLock(globalOperationTimeout); err != nil { + return err + } + defer func() { + objLock.Unlock() + if err != nil { var cerr config.Err if errors.As(err, &cerr) { @@ -210,8 +228,6 @@ func initSafeModeInit(buckets []BucketInfo) (err error) { } }() - newObject := newObjectLayerWithoutSafeModeFn() - // Calls New() for all sub-systems. newAllSubsystems() @@ -302,10 +318,10 @@ func serverMain(ctx *cli.Context) { // Is distributed setup, error out if no certificates are found for HTTPS endpoints. if globalIsDistXL { - if globalEndpoints.IsHTTPS() && !globalIsSSL { + if globalEndpoints.HTTPS() && !globalIsSSL { logger.Fatal(config.ErrNoCertsAndHTTPSEndpoints(nil), "Unable to start the server") } - if !globalEndpoints.IsHTTPS() && globalIsSSL { + if !globalEndpoints.HTTPS() && globalIsSSL { logger.Fatal(config.ErrCertsAndHTTPEndpoints(nil), "Unable to start the server") } } @@ -413,19 +429,21 @@ func serverMain(ctx *cli.Context) { } // Initialize object layer with the supplied disks, objectLayer is nil upon any error. -func newObjectLayer(endpoints EndpointList) (newObject ObjectLayer, err error) { +func newObjectLayer(endpointZones EndpointZones) (newObject ObjectLayer, err error) { // For FS only, directly use the disk. - isFS := len(endpoints) == 1 - if isFS { + if endpointZones.Nodes() == 1 { // Initialize new FS object layer. 
- return NewFSObjectLayer(endpoints[0].Path) + return NewFSObjectLayer(endpointZones[0].Endpoints[0].Path) } - format, err := waitForFormatXL(endpoints[0].IsLocal, endpoints, globalXLSetCount, globalXLSetDriveCount) - if err != nil { - return nil, err + var formats []*formatXLV3 + for _, ep := range endpointZones { + format, err := waitForFormatXL(ep.Endpoints[0].IsLocal, ep.Endpoints, ep.SetCount, ep.DrivesPerSet) + if err != nil { + return nil, err + } + formats = append(formats, format) } - - return newXLSets(endpoints, format, len(format.XL.Sets), len(format.XL.Sets[0])) + return newXLZones(endpointZones, formats) } diff --git a/cmd/server-main_test.go b/cmd/server-main_test.go index 3d0d5990e..16fdc253d 100644 --- a/cmd/server-main_test.go +++ b/cmd/server-main_test.go @@ -31,8 +31,7 @@ func TestNewObjectLayer(t *testing.T) { } defer removeRoots(disks) - endpoints := mustGetNewEndpointList(disks...) - obj, err := newObjectLayer(endpoints) + obj, err := newObjectLayer(mustGetZoneEndpoints(disks...)) if err != nil { t.Fatal("Unexpected object layer initialization error", err) } @@ -51,16 +50,12 @@ func TestNewObjectLayer(t *testing.T) { } defer removeRoots(disks) - globalXLSetCount = 1 - globalXLSetDriveCount = 16 - - endpoints = mustGetNewEndpointList(disks...) - obj, err = newObjectLayer(endpoints) + obj, err = newObjectLayer(mustGetZoneEndpoints(disks...)) if err != nil { t.Fatal("Unexpected object layer initialization error", err) } - _, ok = obj.(*xlSets) + _, ok = obj.(*xlZones) if !ok { t.Fatal("Unexpected object layer detected", reflect.TypeOf(obj)) } diff --git a/cmd/storage-rest-server.go b/cmd/storage-rest-server.go index 2b63acc19..96eca378c 100644 --- a/cmd/storage-rest-server.go +++ b/cmd/storage-rest-server.go @@ -560,55 +560,57 @@ func (s *storageRESTServer) VerifyFile(w http.ResponseWriter, r *http.Request) { } // registerStorageRPCRouter - register storage rpc router. -func registerStorageRESTHandlers(router *mux.Router, endpoints EndpointList) { - for _, endpoint := range endpoints { - if !endpoint.IsLocal { - continue +func registerStorageRESTHandlers(router *mux.Router, endpointZones EndpointZones) { + for _, ep := range endpointZones { + for _, endpoint := range ep.Endpoints { + if !endpoint.IsLocal { + continue + } + storage, err := newPosix(endpoint.Path) + if err != nil { + logger.Fatal(config.ErrUnableToWriteInBackend(err), + "Unable to initialize posix backend") + } + + server := &storageRESTServer{storage: storage} + + subrouter := router.PathPrefix(path.Join(storageRESTPrefix, endpoint.Path)).Subrouter() + + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodDiskInfo).HandlerFunc(httpTraceHdrs(server.DiskInfoHandler)) + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodMakeVol).HandlerFunc(httpTraceHdrs(server.MakeVolHandler)).Queries(restQueries(storageRESTVolume)...) + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodStatVol).HandlerFunc(httpTraceHdrs(server.StatVolHandler)).Queries(restQueries(storageRESTVolume)...) + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodDeleteVol).HandlerFunc(httpTraceHdrs(server.DeleteVolHandler)).Queries(restQueries(storageRESTVolume)...) 
+ subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodListVols).HandlerFunc(httpTraceHdrs(server.ListVolsHandler)) + + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodAppendFile).HandlerFunc(httpTraceHdrs(server.AppendFileHandler)). + Queries(restQueries(storageRESTVolume, storageRESTFilePath)...) + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodWriteAll).HandlerFunc(httpTraceHdrs(server.WriteAllHandler)). + Queries(restQueries(storageRESTVolume, storageRESTFilePath)...) + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodCreateFile).HandlerFunc(httpTraceHdrs(server.CreateFileHandler)). + Queries(restQueries(storageRESTVolume, storageRESTFilePath, storageRESTLength)...) + + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodStatFile).HandlerFunc(httpTraceHdrs(server.StatFileHandler)). + Queries(restQueries(storageRESTVolume, storageRESTFilePath)...) + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodReadAll).HandlerFunc(httpTraceHdrs(server.ReadAllHandler)). + Queries(restQueries(storageRESTVolume, storageRESTFilePath)...) + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodReadFile).HandlerFunc(httpTraceHdrs(server.ReadFileHandler)). + Queries(restQueries(storageRESTVolume, storageRESTFilePath, storageRESTOffset, storageRESTLength, storageRESTBitrotAlgo, storageRESTBitrotHash)...) + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodReadFileStream).HandlerFunc(httpTraceHdrs(server.ReadFileStreamHandler)). + Queries(restQueries(storageRESTVolume, storageRESTFilePath, storageRESTOffset, storageRESTLength)...) + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodListDir).HandlerFunc(httpTraceHdrs(server.ListDirHandler)). + Queries(restQueries(storageRESTVolume, storageRESTDirPath, storageRESTCount, storageRESTLeafFile)...) + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodWalk).HandlerFunc(httpTraceHdrs(server.WalkHandler)). + Queries(restQueries(storageRESTVolume, storageRESTDirPath, storageRESTMarkerPath, storageRESTRecursive, storageRESTLeafFile)...) + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodDeleteFile).HandlerFunc(httpTraceHdrs(server.DeleteFileHandler)). + Queries(restQueries(storageRESTVolume, storageRESTFilePath)...) + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodDeleteFileBulk).HandlerFunc(httpTraceHdrs(server.DeleteFileBulkHandler)). + Queries(restQueries(storageRESTVolume, storageRESTFilePath)...) + + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodRenameFile).HandlerFunc(httpTraceHdrs(server.RenameFileHandler)). + Queries(restQueries(storageRESTSrcVolume, storageRESTSrcPath, storageRESTDstVolume, storageRESTDstPath)...) + subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodVerifyFile).HandlerFunc(httpTraceHdrs(server.VerifyFile)). + Queries(restQueries(storageRESTVolume, storageRESTFilePath, storageRESTBitrotAlgo, storageRESTBitrotHash, storageRESTLength, storageRESTShardSize)...) 
} - storage, err := newPosix(endpoint.Path) - if err != nil { - logger.Fatal(config.ErrUnableToWriteInBackend(err), - "Unable to initialize posix backend") - } - - server := &storageRESTServer{storage: storage} - - subrouter := router.PathPrefix(path.Join(storageRESTPrefix, endpoint.Path)).Subrouter() - - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodDiskInfo).HandlerFunc(httpTraceHdrs(server.DiskInfoHandler)) - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodMakeVol).HandlerFunc(httpTraceHdrs(server.MakeVolHandler)).Queries(restQueries(storageRESTVolume)...) - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodStatVol).HandlerFunc(httpTraceHdrs(server.StatVolHandler)).Queries(restQueries(storageRESTVolume)...) - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodDeleteVol).HandlerFunc(httpTraceHdrs(server.DeleteVolHandler)).Queries(restQueries(storageRESTVolume)...) - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodListVols).HandlerFunc(httpTraceHdrs(server.ListVolsHandler)) - - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodAppendFile).HandlerFunc(httpTraceHdrs(server.AppendFileHandler)). - Queries(restQueries(storageRESTVolume, storageRESTFilePath)...) - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodWriteAll).HandlerFunc(httpTraceHdrs(server.WriteAllHandler)). - Queries(restQueries(storageRESTVolume, storageRESTFilePath)...) - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodCreateFile).HandlerFunc(httpTraceHdrs(server.CreateFileHandler)). - Queries(restQueries(storageRESTVolume, storageRESTFilePath, storageRESTLength)...) - - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodStatFile).HandlerFunc(httpTraceHdrs(server.StatFileHandler)). - Queries(restQueries(storageRESTVolume, storageRESTFilePath)...) - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodReadAll).HandlerFunc(httpTraceHdrs(server.ReadAllHandler)). - Queries(restQueries(storageRESTVolume, storageRESTFilePath)...) - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodReadFile).HandlerFunc(httpTraceHdrs(server.ReadFileHandler)). - Queries(restQueries(storageRESTVolume, storageRESTFilePath, storageRESTOffset, storageRESTLength, storageRESTBitrotAlgo, storageRESTBitrotHash)...) - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodReadFileStream).HandlerFunc(httpTraceHdrs(server.ReadFileStreamHandler)). - Queries(restQueries(storageRESTVolume, storageRESTFilePath, storageRESTOffset, storageRESTLength)...) - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodListDir).HandlerFunc(httpTraceHdrs(server.ListDirHandler)). - Queries(restQueries(storageRESTVolume, storageRESTDirPath, storageRESTCount, storageRESTLeafFile)...) - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodWalk).HandlerFunc(httpTraceHdrs(server.WalkHandler)). - Queries(restQueries(storageRESTVolume, storageRESTDirPath, storageRESTMarkerPath, storageRESTRecursive, storageRESTLeafFile)...) - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodDeleteFile).HandlerFunc(httpTraceHdrs(server.DeleteFileHandler)). 
- Queries(restQueries(storageRESTVolume, storageRESTFilePath)...) - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodDeleteFileBulk).HandlerFunc(httpTraceHdrs(server.DeleteFileBulkHandler)). - Queries(restQueries(storageRESTVolume, storageRESTFilePath)...) - - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodRenameFile).HandlerFunc(httpTraceHdrs(server.RenameFileHandler)). - Queries(restQueries(storageRESTSrcVolume, storageRESTSrcPath, storageRESTDstVolume, storageRESTDstPath)...) - subrouter.Methods(http.MethodPost).Path(storageRESTVersionPrefix + storageRESTMethodVerifyFile).HandlerFunc(httpTraceHdrs(server.VerifyFile)). - Queries(restQueries(storageRESTVolume, storageRESTFilePath, storageRESTBitrotAlgo, storageRESTBitrotHash, storageRESTLength, storageRESTShardSize)...) } // If none of the routes match add default error handler routes diff --git a/cmd/storage-rest_test.go b/cmd/storage-rest_test.go index cd4c2994e..dcf0eb4dd 100644 --- a/cmd/storage-rest_test.go +++ b/cmd/storage-rest_test.go @@ -513,7 +513,9 @@ func newStorageRESTHTTPServerClient(t *testing.T) (*httptest.Server, *storageRES t.Fatalf("UpdateIsLocal failed %v", err) } - registerStorageRESTHandlers(router, EndpointList{endpoint}) + registerStorageRESTHandlers(router, []ZoneEndpoints{{ + Endpoints: Endpoints{endpoint}, + }}) restClient := newStorageRESTClient(endpoint) prevGlobalServerConfig := globalServerConfig globalServerConfig = newServerConfig() diff --git a/cmd/test-utils_test.go b/cmd/test-utils_test.go index 64baef3d0..326efa01d 100644 --- a/cmd/test-utils_test.go +++ b/cmd/test-utils_test.go @@ -59,8 +59,6 @@ import ( "github.com/minio/minio/cmd/config" "github.com/minio/minio/cmd/logger" "github.com/minio/minio/pkg/auth" - "github.com/minio/minio/pkg/bpool" - "github.com/minio/minio/pkg/dsync" "github.com/minio/minio/pkg/hash" "github.com/minio/minio/pkg/policy" ) @@ -181,13 +179,13 @@ func prepareXLSets32() (ObjectLayer, []string, error) { return nil, nil, err } - endpoints1 := mustGetNewEndpointList(fsDirs1...) + endpoints1 := mustGetNewEndpoints(fsDirs1...) fsDirs2, err := getRandomDisks(16) if err != nil { removeRoots(fsDirs1) return nil, nil, err } - endpoints2 := mustGetNewEndpointList(fsDirs2...) + endpoints2 := mustGetNewEndpoints(fsDirs2...) endpoints := append(endpoints1, endpoints2...) fsDirs := append(fsDirs1, fsDirs2...) @@ -210,7 +208,7 @@ func prepareXL(nDisks int) (ObjectLayer, []string, error) { if err != nil { return nil, nil, err } - obj, _, err := initObjectLayer(mustGetNewEndpointList(fsDirs...)) + obj, _, err := initObjectLayer(mustGetZoneEndpoints(fsDirs...)) if err != nil { removeRoots(fsDirs) return nil, nil, err @@ -302,7 +300,7 @@ func isSameType(obj1, obj2 interface{}) bool { // defer s.Stop() type TestServer struct { Root string - Disks EndpointList + Disks EndpointZones AccessKey string SecretKey string Server *httptest.Server @@ -331,9 +329,7 @@ func UnstartedTestServer(t TestErrHandler, instanceType string) TestServer { credentials := globalActiveCred testServer.Obj = objLayer - for _, disk := range disks { - testServer.Disks = append(testServer.Disks, mustGetNewEndpointList(disk)...) - } + testServer.Disks = mustGetZoneEndpoints(disks...) 
testServer.AccessKey = credentials.AccessKey testServer.SecretKey = credentials.SecretKey @@ -450,7 +446,7 @@ func resetGlobalConfig() { } func resetGlobalEndpoints() { - globalEndpoints = EndpointList{} + globalEndpoints = EndpointZones{} } func resetGlobalIsXL() { @@ -525,8 +521,10 @@ func newTestConfig(bucketLocation string, obj ObjectLayer) (err error) { // Deleting the temporary backend and stopping the server. func (testServer TestServer) Stop() { os.RemoveAll(testServer.Root) - for _, disk := range testServer.Disks { - os.RemoveAll(disk.Path) + for _, ep := range testServer.Disks { + for _, disk := range ep.Endpoints { + os.RemoveAll(disk.Path) + } } testServer.Server.Close() } @@ -1580,73 +1578,52 @@ func getRandomDisks(N int) ([]string, error) { } // Initialize object layer with the supplied disks, objectLayer is nil upon any error. -func newTestObjectLayer(endpoints EndpointList) (newObject ObjectLayer, err error) { +func newTestObjectLayer(endpointZones EndpointZones) (newObject ObjectLayer, err error) { // For FS only, directly use the disk. - isFS := len(endpoints) == 1 - if isFS { + if endpointZones.Nodes() == 1 { // Initialize new FS object layer. - return NewFSObjectLayer(endpoints[0].Path) + return NewFSObjectLayer(endpointZones[0].Endpoints[0].Path) } - format, err := waitForFormatXL(endpoints[0].IsLocal, endpoints, 1, 16) - if err != nil { - return nil, err - } - - storageDisks, errs := initStorageDisksWithErrors(endpoints) - for _, err = range errs { - if err != nil && err != errDiskNotFound { + var formats []*formatXLV3 + for _, ep := range endpointZones { + format, err := waitForFormatXL(ep.Endpoints[0].IsLocal, ep.Endpoints, ep.SetCount, ep.DrivesPerSet) + if err != nil { return nil, err } + formats = append(formats, format) } - for i, disk := range storageDisks { - disk.SetDiskID(format.XL.Sets[0][i]) - } - - // Initialize list pool. - listPool := NewTreeWalkPool(globalLookupTimeout) - - // Initialize xl objects. - xl := &xlObjects{ - listPool: listPool, - storageDisks: storageDisks, - nsMutex: newNSLock(false), - bp: bpool.NewBytePoolCap(4, blockSizeV1, blockSizeV1*2), - } - - xl.getDisks = func() []StorageAPI { - return xl.storageDisks - } - xl.getLockers = func() []dsync.NetLocker { - return nil + zones, err := newXLZones(endpointZones, formats) + if err != nil { + return nil, err } globalConfigSys = NewConfigSys() globalIAMSys = NewIAMSys() - globalIAMSys.Init(xl) + globalIAMSys.Init(zones) globalPolicySys = NewPolicySys() - globalPolicySys.Init(nil, xl) + globalPolicySys.Init(nil, zones) - globalNotificationSys = NewNotificationSys(endpoints) - globalNotificationSys.Init(nil, xl) + globalNotificationSys = NewNotificationSys(endpointZones) + globalNotificationSys.Init(nil, zones) - return xl, nil + return zones, nil } // initObjectLayer - Instantiates object layer and returns it. -func initObjectLayer(endpoints EndpointList) (ObjectLayer, []StorageAPI, error) { - objLayer, err := newTestObjectLayer(endpoints) +func initObjectLayer(endpointZones EndpointZones) (ObjectLayer, []StorageAPI, error) { + objLayer, err := newTestObjectLayer(endpointZones) if err != nil { return nil, nil, err } var formattedDisks []StorageAPI // Should use the object layer tests for validating cache. - if xl, ok := objLayer.(*xlObjects); ok { - formattedDisks = xl.storageDisks + if z, ok := objLayer.(*xlZones); ok { + formattedDisks = z.zones[0].GetDisks(0)() } // Success. 
@@ -2052,7 +2029,7 @@ func ExecObjectLayerStaleFilesTest(t *testing.T, objTest objTestStaleFilesType) if err != nil { t.Fatalf("Initialization of disks for XL setup: %s", err) } - objLayer, _, err := initObjectLayer(mustGetNewEndpointList(erasureDisks...)) + objLayer, _, err := initObjectLayer(mustGetZoneEndpoints(erasureDisks...)) if err != nil { t.Fatalf("Initialization of object layer failed for XL setup: %s", err) } @@ -2303,23 +2280,27 @@ func generateTLSCertKey(host string) ([]byte, []byte, error) { return certOut.Bytes(), keyOut.Bytes(), nil } -func mustGetNewEndpointList(args ...string) (endpoints EndpointList) { - if len(args) == 1 { - endpoint, err := NewEndpoint(args[0]) - logger.FatalIf(err, "unable to create new endpoint") - endpoints = append(endpoints, endpoint) - } else { - var err error - endpoints, err = NewEndpointList(args...) - logger.FatalIf(err, "unable to create new endpoint list") - } +func mustGetZoneEndpoints(args ...string) EndpointZones { + endpoints := mustGetNewEndpoints(args...) + return []ZoneEndpoints{{ + SetCount: 1, + DrivesPerSet: len(args), + Endpoints: endpoints, + }} +} + +func mustGetNewEndpoints(args ...string) (endpoints Endpoints) { + endpoints, err := NewEndpoints(args...) + logger.FatalIf(err, "unable to create new endpoint list") return endpoints } -func getEndpointsLocalAddr(endpoints EndpointList) string { - for _, endpoint := range endpoints { - if endpoint.IsLocal && endpoint.Type() == URLEndpointType { - return endpoint.Host +func getEndpointsLocalAddr(endpointZones EndpointZones) string { + for _, endpoints := range endpointZones { + for _, endpoint := range endpoints.Endpoints { + if endpoint.IsLocal && endpoint.Type() == URLEndpointType { + return endpoint.Host + } } } diff --git a/cmd/tree-walk_test.go b/cmd/tree-walk_test.go index 6262ff02b..cdcac547e 100644 --- a/cmd/tree-walk_test.go +++ b/cmd/tree-walk_test.go @@ -123,7 +123,7 @@ func TestTreeWalk(t *testing.T) { if err != nil { t.Fatalf("Unable to create tmp directory: %s", err) } - endpoints := mustGetNewEndpointList(fsDir) + endpoints := mustGetNewEndpoints(fsDir) disk, err := newStorageAPI(endpoints[0]) if err != nil { t.Fatalf("Unable to create StorageAPI: %s", err) @@ -160,7 +160,7 @@ func TestTreeWalkTimeout(t *testing.T) { if err != nil { t.Fatalf("Unable to create tmp directory: %s", err) } - endpoints := mustGetNewEndpointList(fsDir) + endpoints := mustGetNewEndpoints(fsDir) disk, err := newStorageAPI(endpoints[0]) if err != nil { t.Fatalf("Unable to create StorageAPI: %s", err) @@ -235,13 +235,13 @@ func TestListDir(t *testing.T) { } // Create two StorageAPIs disk1 and disk2. 
- endpoints := mustGetNewEndpointList(fsDir1) + endpoints := mustGetNewEndpoints(fsDir1) disk1, err := newStorageAPI(endpoints[0]) if err != nil { t.Errorf("Unable to create StorageAPI: %s", err) } - endpoints = mustGetNewEndpointList(fsDir2) + endpoints = mustGetNewEndpoints(fsDir2) disk2, err := newStorageAPI(endpoints[0]) if err != nil { t.Errorf("Unable to create StorageAPI: %s", err) @@ -300,7 +300,7 @@ func TestRecursiveTreeWalk(t *testing.T) { t.Fatalf("Unable to create tmp directory: %s", err) } - endpoints := mustGetNewEndpointList(fsDir1) + endpoints := mustGetNewEndpoints(fsDir1) disk1, err := newStorageAPI(endpoints[0]) if err != nil { t.Fatalf("Unable to create StorageAPI: %s", err) @@ -405,7 +405,7 @@ func TestSortedness(t *testing.T) { t.Errorf("Unable to create tmp directory: %s", err) } - endpoints := mustGetNewEndpointList(fsDir1) + endpoints := mustGetNewEndpoints(fsDir1) disk1, err := newStorageAPI(endpoints[0]) if err != nil { t.Fatalf("Unable to create StorageAPI: %s", err) @@ -476,7 +476,7 @@ func TestTreeWalkIsEnd(t *testing.T) { t.Errorf("Unable to create tmp directory: %s", err) } - endpoints := mustGetNewEndpointList(fsDir1) + endpoints := mustGetNewEndpoints(fsDir1) disk1, err := newStorageAPI(endpoints[0]) if err != nil { t.Fatalf("Unable to create StorageAPI: %s", err) diff --git a/cmd/web-handlers_test.go b/cmd/web-handlers_test.go index 34562ecc6..842c8e2f8 100644 --- a/cmd/web-handlers_test.go +++ b/cmd/web-handlers_test.go @@ -1587,9 +1587,14 @@ func TestWebObjectLayerFaultyDisks(t *testing.T) { } // Set faulty disks to XL backend - xl := obj.(*xlObjects) - for i, d := range xl.storageDisks { - xl.storageDisks[i] = newNaughtyDisk(d, nil, errFaultyDisk) + z := obj.(*xlZones) + xl := z.zones[0].sets[0] + xlDisks := xl.getDisks() + xl.getDisks = func() []StorageAPI { + for i, d := range xlDisks { + xlDisks[i] = newNaughtyDisk(d, nil, errFaultyDisk) + } + return xlDisks } // Initialize web rpc endpoint. diff --git a/cmd/xl-sets.go b/cmd/xl-sets.go index 8a28e763d..c4a704585 100644 --- a/cmd/xl-sets.go +++ b/cmd/xl-sets.go @@ -55,18 +55,6 @@ func (s setsStorageAPI) Close() error { return nil } -func (s setsDsyncLockers) Close() error { - for i := 0; i < len(s); i++ { - for _, locker := range s[i] { - if locker == nil { - continue - } - locker.Close() - } - } - return nil -} - // xlSets implements ObjectLayer combining a static list of erasure coded // object sets. NOTE: There is no dynamic scaling allowed or intended in // current design. @@ -89,7 +77,7 @@ type xlSets struct { lockersMap map[Endpoint]dsync.NetLocker // List of endpoints provided on the command line. - endpoints EndpointList + endpoints Endpoints // Total number of sets and the number of disks per set. setCount, drivesPerSet int @@ -123,11 +111,10 @@ func (s *xlSets) isConnected(endpoint Endpoint) bool { if s.xlDisks[i][j].String() != endpointStr { continue } - if s.xlDisks[i][j].IsOnline() { - return true + if !s.xlLockers[i][j].IsOnline() { + continue } - s.xlLockers[i][j].Close() - return false + return s.xlDisks[i][j].IsOnline() } } return false @@ -282,8 +269,7 @@ func (s *xlSets) GetDisks(setIndex int) func() []StorageAPI { const defaultMonitorConnectEndpointInterval = time.Second * 10 // Set to 10 secs. // Initialize new set of erasure coded sets. 
-func newXLSets(endpoints EndpointList, format *formatXLV3, setCount int, drivesPerSet int) (ObjectLayer, error) { - +func newXLSets(endpoints Endpoints, format *formatXLV3, setCount int, drivesPerSet int) (*xlSets, error) { lockersMap := make(map[Endpoint]dsync.NetLocker) for _, endpoint := range endpoints { lockersMap[endpoint] = newLockAPI(endpoint) @@ -464,13 +450,6 @@ func (s *xlSets) Shutdown(ctx context.Context) error { // even if one of the sets fail to create buckets, we proceed to undo a // successful operation. func (s *xlSets) MakeBucketWithLocation(ctx context.Context, bucket, location string) error { - set := s.getHashedSet(bucket) - bucketLock := set.nsMutex.NewNSLock(ctx, set.getLockers(), bucket, "") - if err := bucketLock.GetLock(globalOperationTimeout); err != nil { - return err - } - defer bucketLock.Unlock() - g := errgroup.WithNErrs(len(s.sets)) // Create buckets in parallel across all sets. @@ -549,14 +528,7 @@ func (s *xlSets) getHashedSet(input string) (set *xlObjects) { // GetBucketInfo - returns bucket info from one of the erasure coded set. func (s *xlSets) GetBucketInfo(ctx context.Context, bucket string) (bucketInfo BucketInfo, err error) { - set := s.getHashedSet(bucket) - bucketLock := set.nsMutex.NewNSLock(ctx, set.getLockers(), bucket, "") - if err = bucketLock.GetRLock(globalOperationTimeout); err != nil { - return bucketInfo, err - } - defer bucketLock.RUnlock() - - return s.getHashedSet(bucket).GetBucketInfo(ctx, bucket) + return s.getHashedSet("").GetBucketInfo(ctx, bucket) } // ListObjectsV2 lists all objects in bucket filtered by prefix @@ -635,13 +607,6 @@ func (s *xlSets) IsCompressionSupported() bool { // even if one of the sets fail to delete buckets, we proceed to // undo a successful operation. func (s *xlSets) DeleteBucket(ctx context.Context, bucket string) error { - set := s.getHashedSet(bucket) - bucketLock := set.nsMutex.NewNSLock(ctx, set.getLockers(), bucket, "") - if err := bucketLock.GetLock(globalOperationTimeout); err != nil { - return err - } - defer bucketLock.Unlock() - g := errgroup.WithNErrs(len(s.sets)) // Delete buckets in parallel across all sets. @@ -729,7 +694,6 @@ func (s *xlSets) DeleteObject(ctx context.Context, bucket string, object string) // objects are group by set first, and then bulk delete is invoked // for each set, the error response of each delete will be returned func (s *xlSets) DeleteObjects(ctx context.Context, bucket string, objects []string) ([]error, error) { - type delObj struct { // Set index associated to this object setIndex int @@ -787,13 +751,6 @@ func (s *xlSets) CopyObject(ctx context.Context, srcBucket, srcObject, destBucke return srcSet.CopyObject(ctx, srcBucket, srcObject, destBucket, destObject, srcInfo, srcOpts, dstOpts) } - if !cpSrcDstSame { - objectDWLock := destSet.nsMutex.NewNSLock(ctx, destSet.getLockers(), destBucket, destObject) - if err := objectDWLock.GetLock(globalObjectTimeout); err != nil { - return objInfo, err - } - defer objectDWLock.Unlock() - } putOpts := ObjectOptions{ServerSideEncryption: dstOpts.ServerSideEncryption, UserDefined: srcInfo.UserDefined} return destSet.putObject(ctx, destBucket, destObject, srcInfo.PutObjReader, putOpts) } @@ -1078,11 +1035,6 @@ func (s *xlSets) listObjectsNonSlash(ctx context.Context, bucket, prefix, marker // walked and merged at this layer. Resulting value through the merge process sends // the data in lexically sorted order. 
func (s *xlSets) listObjects(ctx context.Context, bucket, prefix, marker, delimiter string, maxKeys int, heal bool) (loi ListObjectsInfo, err error) { - if delimiter != SlashSeparator && delimiter != "" { - // "heal" option passed can be ignored as the heal-listing does not send non-standard delimiter. - return s.listObjectsNonSlash(ctx, bucket, prefix, marker, delimiter, maxKeys) - } - if err = checkListObjsArgs(ctx, bucket, prefix, marker, delimiter, s); err != nil { return loi, err } @@ -1114,6 +1066,11 @@ func (s *xlSets) listObjects(ctx context.Context, bucket, prefix, marker, delimi maxKeys = maxObjectList } + if delimiter != SlashSeparator && delimiter != "" { + // "heal" option passed can be ignored as the heal-listing does not send non-standard delimiter. + return s.listObjectsNonSlash(ctx, bucket, prefix, marker, delimiter, maxKeys) + } + // Default is recursive, if delimiter is set then list non recursive. recursive := true if delimiter == SlashSeparator { @@ -1284,7 +1241,7 @@ else fi */ -func formatsToDrivesInfo(endpoints EndpointList, formats []*formatXLV3, sErrs []error) (beforeDrives []madmin.DriveInfo) { +func formatsToDrivesInfo(endpoints Endpoints, formats []*formatXLV3, sErrs []error) (beforeDrives []madmin.DriveInfo) { beforeDrives = make([]madmin.DriveInfo, len(endpoints)) // Existing formats are available (i.e. ok), so save it in // result, also populate disks to be healed. @@ -1317,14 +1274,6 @@ func formatsToDrivesInfo(endpoints EndpointList, formats []*formatXLV3, sErrs [] // Reloads the format from the disk, usually called by a remote peer notifier while // healing in a distributed setup. func (s *xlSets) ReloadFormat(ctx context.Context, dryRun bool) (err error) { - // Acquire lock on format.json - set := s.getHashedSet(formatConfigFile) - formatLock := set.nsMutex.NewNSLock(ctx, set.getLockers(), minioMetaBucket, formatConfigFile) - if err = formatLock.GetRLock(globalHealingTimeout); err != nil { - return err - } - defer formatLock.RUnlock() - storageDisks, errs := initStorageDisksWithErrors(s.endpoints) for i, err := range errs { if err != nil && err != errDiskNotFound { @@ -1367,7 +1316,6 @@ func (s *xlSets) ReloadFormat(ctx context.Context, dryRun bool) (err error) { // Close all existing disks, lockers and reconnect all the disks/lockers. s.xlDisks.Close() - s.xlLockers.Close() s.connectDisksAndLockers() // Restart monitoring loop to monitor reformatted disks again. @@ -1433,17 +1381,7 @@ func markRootDisksAsDown(storageDisks []StorageAPI) { } // HealFormat - heals missing `format.json` on fresh unformatted disks. -// TODO: In future support corrupted disks missing format.json but has erasure -// coded data in it. func (s *xlSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.HealResultItem, err error) { - // Acquire lock on format.json - set := s.getHashedSet(formatConfigFile) - formatLock := set.nsMutex.NewNSLock(ctx, set.getLockers(), minioMetaBucket, formatConfigFile) - if err = formatLock.GetLock(globalHealingTimeout); err != nil { - return madmin.HealResultItem{}, err - } - defer formatLock.Unlock() - storageDisks, errs := initStorageDisksWithErrors(s.endpoints) for i, derr := range errs { if derr != nil && derr != errDiskNotFound { @@ -1576,7 +1514,6 @@ func (s *xlSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.HealRe // Disconnect/relinquish all existing disks, lockers and reconnect the disks, lockers. 
s.xlDisks.Close() - s.xlLockers.Close() s.connectDisksAndLockers() // Restart our monitoring loop to start monitoring newly formatted disks. @@ -1588,13 +1525,6 @@ func (s *xlSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.HealRe // HealBucket - heals inconsistent buckets and bucket metadata on all sets. func (s *xlSets) HealBucket(ctx context.Context, bucket string, dryRun, remove bool) (result madmin.HealResultItem, err error) { - set := s.getHashedSet(bucket) - bucketLock := set.nsMutex.NewNSLock(ctx, set.getLockers(), bucket, "") - if err = bucketLock.GetLock(globalOperationTimeout); err != nil { - return result, err - } - defer bucketLock.Unlock() - // Initialize heal result info result = madmin.HealResultItem{ Type: madmin.HealItemBucket, @@ -1697,7 +1627,7 @@ func (s *xlSets) HealObjects(ctx context.Context, bucket, prefix string, healObj // Wait at max 10 minute for an inprogress request before proceeding to heal waitCount := 600 // Any requests in progress, delay the heal. - for (globalHTTPServer.GetRequestCount() >= int32(globalXLSetCount*globalXLSetDriveCount)) && + for (globalHTTPServer.GetRequestCount() >= int32(s.setCount*s.drivesPerSet)) && waitCount > 0 { waitCount-- time.Sleep(1 * time.Second) diff --git a/cmd/xl-sets_test.go b/cmd/xl-sets_test.go index 50e56e5b5..4dd688922 100644 --- a/cmd/xl-sets_test.go +++ b/cmd/xl-sets_test.go @@ -75,7 +75,7 @@ func TestNewXLSets(t *testing.T) { defer os.RemoveAll(disk) } - endpoints := mustGetNewEndpointList(erasureDisks...) + endpoints := mustGetNewEndpoints(erasureDisks...) _, err := waitForFormatXL(true, endpoints, 0, 16) if err != errInvalidArgument { t.Fatalf("Expecting error, got %s", err) @@ -113,7 +113,8 @@ func TestHashedLayer(t *testing.T) { defer os.RemoveAll(dir) } - objs = append(objs, obj.(*xlObjects)) + z := obj.(*xlZones) + objs = append(objs, z.zones[0].sets[0]) } sets := &xlSets{sets: objs, distributionAlgo: "CRCMOD"} diff --git a/cmd/xl-v1-common_test.go b/cmd/xl-v1-common_test.go index 70c64ac4b..057cf1a1d 100644 --- a/cmd/xl-v1-common_test.go +++ b/cmd/xl-v1-common_test.go @@ -51,7 +51,8 @@ func TestXLParentDirIsObject(t *testing.T) { t.Fatalf("Unexpected object name returned got %s, expected %s", objInfo.Name, objectName) } - fs := obj.(*xlObjects) + z := obj.(*xlZones) + xl := z.zones[0].sets[0] testCases := []struct { parentIsObject bool objectName string @@ -86,7 +87,7 @@ func TestXLParentDirIsObject(t *testing.T) { } for i, testCase := range testCases { - gotValue := fs.parentDirIsObject(context.Background(), bucketName, testCase.objectName) + gotValue := xl.parentDirIsObject(context.Background(), bucketName, testCase.objectName) if testCase.parentIsObject != gotValue { t.Errorf("Test %d: Unexpected value returned got %t, expected %t", i+1, gotValue, testCase.parentIsObject) } diff --git a/cmd/xl-v1-healing-common.go b/cmd/xl-v1-healing-common.go index ee42ce5ec..cb4e3ed4a 100644 --- a/cmd/xl-v1-healing-common.go +++ b/cmd/xl-v1-healing-common.go @@ -122,7 +122,7 @@ func listOnlineDisks(disks []StorageAPI, partsMetadata []xlMetaV1, errs []error) func getLatestXLMeta(ctx context.Context, partsMetadata []xlMetaV1, errs []error) (xlMetaV1, error) { // There should be atleast half correct entries, if not return failure - if reducedErr := reduceReadQuorumErrs(ctx, errs, objectOpIgnoredErrs, globalXLSetDriveCount/2); reducedErr != nil { + if reducedErr := reduceReadQuorumErrs(ctx, errs, objectOpIgnoredErrs, len(partsMetadata)/2); reducedErr != nil { return xlMetaV1{}, reducedErr } diff --git 
a/cmd/xl-v1-healing-common_test.go b/cmd/xl-v1-healing-common_test.go index b55f6a7a8..36311bb03 100644 --- a/cmd/xl-v1-healing-common_test.go +++ b/cmd/xl-v1-healing-common_test.go @@ -168,7 +168,8 @@ func TestListOnlineDisks(t *testing.T) { bucket := "bucket" object := "object" data := bytes.Repeat([]byte("a"), 1024) - xlDisks := obj.(*xlObjects).storageDisks + z := obj.(*xlZones) + xlDisks := z.zones[0].sets[0].getDisks() for i, test := range testCases { // Prepare bucket/object backend for the tests below. @@ -266,10 +267,10 @@ func TestDisksWithAllParts(t *testing.T) { object := "object" // make data with more than one part partCount := 3 - data := bytes.Repeat([]byte("a"), int(globalPutPartSize)*partCount) - xl := obj.(*xlObjects) - xlDisks := xl.storageDisks - + data := bytes.Repeat([]byte("a"), 6*1024*1024*partCount) + z := obj.(*xlZones) + xl := z.zones[0].sets[0] + xlDisks := xl.getDisks() err = obj.MakeBucketWithLocation(ctx, "bucket", "") if err != nil { t.Fatalf("Failed to make a bucket %v", err) @@ -281,7 +282,7 @@ func TestDisksWithAllParts(t *testing.T) { } _, errs := readAllXLMetadata(ctx, xlDisks, bucket, object) - readQuorum := len(xl.storageDisks) / 2 + readQuorum := len(xlDisks) / 2 if reducedErr := reduceReadQuorumErrs(ctx, errs, objectOpIgnoredErrs, readQuorum); reducedErr != nil { t.Fatalf("Failed to read xl meta data %v", reducedErr) } diff --git a/cmd/xl-v1-healing.go b/cmd/xl-v1-healing.go index 38f52c4d0..f49e4c147 100644 --- a/cmd/xl-v1-healing.go +++ b/cmd/xl-v1-healing.go @@ -679,14 +679,6 @@ func (xl xlObjects) HealObject(ctx context.Context, bucket, object string, dryRu } healCtx := logger.SetReqInfo(context.Background(), newReqInfo) - // Lock the object before healing. Use read lock since healing - // will only regenerate parts & xl.json of outdated disks. - objectLock := xl.nsMutex.NewNSLock(ctx, xl.getLockers(), bucket, object) - if lerr := objectLock.GetRLock(globalHealingTimeout); lerr != nil { - return madmin.HealResultItem{}, lerr - } - defer objectLock.RUnlock() - // Healing directories handle it separately. if hasSuffix(object, SlashSeparator) { return xl.healObjectDir(healCtx, bucket, object, dryRun) @@ -733,7 +725,7 @@ func (xl xlObjects) HealObject(ctx context.Context, bucket, object string, dryRu writeQuorum = len(storageDisks)/2 + 1 } if !dryRun && remove { - err = xl.deleteObject(ctx, bucket, object, writeQuorum, false) + xl.deleteObject(ctx, bucket, object, writeQuorum, false) } } return defaultHealResult(latestXLMeta, storageDisks, errs, bucket, object), toObjectErr(reducedErr, bucket, object) diff --git a/cmd/xl-v1-healing_test.go b/cmd/xl-v1-healing_test.go index a2889c6ae..2dea22c35 100644 --- a/cmd/xl-v1-healing_test.go +++ b/cmd/xl-v1-healing_test.go @@ -35,7 +35,7 @@ func TestUndoMakeBucket(t *testing.T) { defer removeRoots(fsDirs) // Remove format.json on 16 disks. - obj, _, err := initObjectLayer(mustGetNewEndpointList(fsDirs...)) + obj, _, err := initObjectLayer(mustGetZoneEndpoints(fsDirs...)) if err != nil { t.Fatal(err) } @@ -44,8 +44,9 @@ func TestUndoMakeBucket(t *testing.T) { if err = obj.MakeBucketWithLocation(context.Background(), bucketName, ""); err != nil { t.Fatal(err) } - xl := obj.(*xlObjects) - undoMakeBucket(xl.storageDisks, bucketName) + z := obj.(*xlZones) + xl := z.zones[0].sets[0] + undoMakeBucket(xl.getDisks(), bucketName) // Validate if bucket was deleted properly. 
_, err = obj.GetBucketInfo(context.Background(), bucketName) @@ -68,7 +69,7 @@ func TestHealObjectCorrupted(t *testing.T) { defer removeRoots(fsDirs) // Everything is fine, should return nil - objLayer, _, err := initObjectLayer(mustGetNewEndpointList(fsDirs...)) + objLayer, _, err := initObjectLayer(mustGetZoneEndpoints(fsDirs...)) if err != nil { t.Fatal(err) } @@ -108,8 +109,9 @@ func TestHealObjectCorrupted(t *testing.T) { } // Test 1: Remove the object backend files from the first disk. - xl := objLayer.(*xlObjects) - firstDisk := xl.storageDisks[0] + z := objLayer.(*xlZones) + xl := z.zones[0].sets[0] + firstDisk := xl.getDisks()[0] err = firstDisk.DeleteFile(bucket, filepath.Join(object, xlMetaJSONFile)) if err != nil { t.Fatalf("Failed to delete a file - %v", err) @@ -179,8 +181,8 @@ func TestHealObjectCorrupted(t *testing.T) { // Test 4: checks if HealObject returns an error when xl.json is not found // in more than read quorum number of disks, to create a corrupted situation. - for i := 0; i <= len(xl.storageDisks)/2; i++ { - xl.storageDisks[i].DeleteFile(bucket, filepath.Join(object, xlMetaJSONFile)) + for i := 0; i <= len(xl.getDisks())/2; i++ { + xl.getDisks()[i].DeleteFile(bucket, filepath.Join(object, xlMetaJSONFile)) } // Try healing now, expect to receive errDiskNotFound. @@ -207,7 +209,7 @@ func TestHealObjectXL(t *testing.T) { defer removeRoots(fsDirs) // Everything is fine, should return nil - obj, _, err := initObjectLayer(mustGetNewEndpointList(fsDirs...)) + obj, _, err := initObjectLayer(mustGetZoneEndpoints(fsDirs...)) if err != nil { t.Fatal(err) } @@ -247,8 +249,9 @@ func TestHealObjectXL(t *testing.T) { } // Remove the object backend files from the first disk. - xl := obj.(*xlObjects) - firstDisk := xl.storageDisks[0] + z := obj.(*xlZones) + xl := z.zones[0].sets[0] + firstDisk := xl.getDisks()[0] err = firstDisk.DeleteFile(bucket, filepath.Join(object, xlMetaJSONFile)) if err != nil { t.Fatalf("Failed to delete a file - %v", err) @@ -264,9 +267,13 @@ func TestHealObjectXL(t *testing.T) { t.Errorf("Expected xl.json file to be present but stat failed - %v", err) } - // Nil more than half the disks, to remove write quorum. - for i := 0; i <= len(xl.storageDisks)/2; i++ { - xl.storageDisks[i] = nil + xlDisks := xl.getDisks() + xl.getDisks = func() []StorageAPI { + // Nil more than half the disks, to remove write quorum. + for i := 0; i <= len(xlDisks)/2; i++ { + xlDisks[i] = nil + } + return xlDisks } // Try healing now, expect to receive errDiskNotFound. @@ -287,7 +294,7 @@ func TestHealEmptyDirectoryXL(t *testing.T) { defer removeRoots(fsDirs) // Everything is fine, should return nil - obj, _, err := initObjectLayer(mustGetNewEndpointList(fsDirs...)) + obj, _, err := initObjectLayer(mustGetZoneEndpoints(fsDirs...)) if err != nil { t.Fatal(err) } @@ -302,14 +309,16 @@ func TestHealEmptyDirectoryXL(t *testing.T) { } // Upload an empty directory - _, err = obj.PutObject(context.Background(), bucket, object, mustGetPutObjReader(t, bytes.NewReader([]byte{}), 0, "", ""), opts) + _, err = obj.PutObject(context.Background(), bucket, object, mustGetPutObjReader(t, + bytes.NewReader([]byte{}), 0, "", ""), opts) if err != nil { t.Fatal(err) } // Remove the object backend files from the first disk. 
- xl := obj.(*xlObjects) - firstDisk := xl.storageDisks[0] + z := obj.(*xlZones) + xl := z.zones[0].sets[0] + firstDisk := xl.getDisks()[0] err = firstDisk.DeleteFile(bucket, object) if err != nil { t.Fatalf("Failed to delete a file - %v", err) diff --git a/cmd/xl-v1-metadata_test.go b/cmd/xl-v1-metadata_test.go index 8cf1b319b..5f4e55ea4 100644 --- a/cmd/xl-v1-metadata_test.go +++ b/cmd/xl-v1-metadata_test.go @@ -76,7 +76,9 @@ func testXLReadStat(obj ObjectLayer, instanceType string, disks []string, t *tes } } - _, _, err = obj.(*xlObjects).readXLMetaStat(context.Background(), bucketName, objectName) + z := obj.(*xlZones) + xl := z.zones[0].sets[0] + _, _, err = xl.readXLMetaStat(context.Background(), bucketName, objectName) if err != nil { t.Fatal(err) } @@ -85,7 +87,7 @@ func testXLReadStat(obj ObjectLayer, instanceType string, disks []string, t *tes removeDiskN(disks, 7) // Removing disk shouldn't affect reading object info. - _, _, err = obj.(*xlObjects).readXLMetaStat(context.Background(), bucketName, objectName) + _, _, err = xl.readXLMetaStat(context.Background(), bucketName, objectName) if err != nil { t.Fatal(err) } @@ -94,7 +96,7 @@ func testXLReadStat(obj ObjectLayer, instanceType string, disks []string, t *tes os.RemoveAll(path.Join(disk, bucketName)) } - _, _, err = obj.(*xlObjects).readXLMetaStat(context.Background(), bucketName, objectName) + _, _, err = xl.readXLMetaStat(context.Background(), bucketName, objectName) if err != errVolumeNotFound { t.Fatal(err) } @@ -159,9 +161,11 @@ func testXLReadMetaParts(obj ObjectLayer, instanceType string, disks []string, t } } - uploadIDPath := obj.(*xlObjects).getUploadIDDir(bucketNames[0], objectNames[0], uploadIDs[0]) + z := obj.(*xlZones) + xl := z.zones[0].sets[0] + uploadIDPath := xl.getUploadIDDir(bucketNames[0], objectNames[0], uploadIDs[0]) - _, _, err = obj.(*xlObjects).readXLMetaParts(context.Background(), minioMetaMultipartBucket, uploadIDPath) + _, _, err = xl.readXLMetaParts(context.Background(), minioMetaMultipartBucket, uploadIDPath) if err != nil { t.Fatal(err) } @@ -170,17 +174,17 @@ func testXLReadMetaParts(obj ObjectLayer, instanceType string, disks []string, t removeDiskN(disks, 7) // Removing disk shouldn't affect reading object parts info. - _, _, err = obj.(*xlObjects).readXLMetaParts(context.Background(), minioMetaMultipartBucket, uploadIDPath) + _, _, err = xl.readXLMetaParts(context.Background(), minioMetaMultipartBucket, uploadIDPath) if err != nil { t.Fatal(err) } for _, disk := range disks { os.RemoveAll(path.Join(disk, bucketNames[0])) - os.RemoveAll(path.Join(disk, minioMetaMultipartBucket, obj.(*xlObjects).getMultipartSHADir(bucketNames[0], objectNames[0]))) + os.RemoveAll(path.Join(disk, minioMetaMultipartBucket, xl.getMultipartSHADir(bucketNames[0], objectNames[0]))) } - _, _, err = obj.(*xlObjects).readXLMetaParts(context.Background(), minioMetaMultipartBucket, uploadIDPath) + _, _, err = xl.readXLMetaParts(context.Background(), minioMetaMultipartBucket, uploadIDPath) if err != errFileNotFound { t.Fatal(err) } diff --git a/cmd/xl-v1-multipart.go b/cmd/xl-v1-multipart.go index edd8dbc20..1dbd4ddca 100644 --- a/cmd/xl-v1-multipart.go +++ b/cmd/xl-v1-multipart.go @@ -262,13 +262,6 @@ func (xl xlObjects) NewMultipartUpload(ctx context.Context, bucket, object strin // // Implements S3 compatible Upload Part Copy API. 
func (xl xlObjects) CopyObjectPart(ctx context.Context, srcBucket, srcObject, dstBucket, dstObject, uploadID string, partID int, startOffset int64, length int64, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (pi PartInfo, e error) { - // Hold read locks on source object only if we are - // going to read data from source object. - objectSRLock := xl.nsMutex.NewNSLock(ctx, xl.getLockers(), srcBucket, srcObject) - if err := objectSRLock.GetRLock(globalObjectTimeout); err != nil { - return pi, err - } - defer objectSRLock.RUnlock() if err := checkNewMultipartArgs(ctx, srcBucket, srcObject, xl); err != nil { return pi, err @@ -303,17 +296,9 @@ func (xl xlObjects) PutObjectPart(ctx context.Context, bucket, object, uploadID var partsMetadata []xlMetaV1 var errs []error uploadIDPath := xl.getUploadIDDir(bucket, object, uploadID) - uploadIDLockPath := xl.getUploadIDLockPath(bucket, object, uploadID) - - // pre-check upload id lock. - preUploadIDLock := xl.nsMutex.NewNSLock(ctx, xl.getLockers(), minioMetaMultipartBucket, uploadIDLockPath) - if err := preUploadIDLock.GetRLock(globalOperationTimeout); err != nil { - return pi, err - } // Validates if upload ID exists. if err := xl.checkUploadIDExists(ctx, bucket, object, uploadID); err != nil { - preUploadIDLock.RUnlock() return pi, toObjectErr(err, bucket, object, uploadID) } @@ -324,16 +309,13 @@ func (xl xlObjects) PutObjectPart(ctx context.Context, bucket, object, uploadID // get Quorum for this object _, writeQuorum, err := objectQuorumFromMeta(ctx, xl, partsMetadata, errs) if err != nil { - preUploadIDLock.RUnlock() return pi, toObjectErr(err, bucket, object) } reducedErr := reduceWriteQuorumErrs(ctx, errs, objectOpIgnoredErrs, writeQuorum) if reducedErr == errXLWriteQuorum { - preUploadIDLock.RUnlock() return pi, toObjectErr(reducedErr, bucket, object) } - preUploadIDLock.RUnlock() // List all online disks. onlineDisks, modTime := listOnlineDisks(xl.getDisks(), partsMetadata, errs) @@ -403,13 +385,6 @@ func (xl xlObjects) PutObjectPart(ctx context.Context, bucket, object, uploadID } } - // post-upload check (write) lock - postUploadIDLock := xl.nsMutex.NewNSLock(ctx, xl.getLockers(), minioMetaMultipartBucket, uploadIDLockPath) - if err = postUploadIDLock.GetLock(globalOperationTimeout); err != nil { - return pi, err - } - defer postUploadIDLock.Unlock() - // Validates if upload ID exists. if err := xl.checkUploadIDExists(ctx, bucket, object, uploadID); err != nil { return pi, toObjectErr(err, bucket, object, uploadID) @@ -497,16 +472,6 @@ func (xl xlObjects) ListObjectParts(ctx context.Context, bucket, object, uploadI if err := checkListPartsArgs(ctx, bucket, object, xl); err != nil { return result, err } - // Hold lock so that there is no competing - // abort-multipart-upload or complete-multipart-upload. - uploadIDLock := xl.nsMutex.NewNSLock(ctx, xl.getLockers(), - minioMetaMultipartBucket, - xl.getUploadIDLockPath(bucket, object, uploadID)) - if err := uploadIDLock.GetLock(globalListingTimeout); err != nil { - return result, err - } - defer uploadIDLock.Unlock() - if err := xl.checkUploadIDExists(ctx, bucket, object, uploadID); err != nil { return result, toObjectErr(err, bucket, object, uploadID) } @@ -603,27 +568,6 @@ func (xl xlObjects) CompleteMultipartUpload(ctx context.Context, bucket string, if err := checkCompleteMultipartArgs(ctx, bucket, object, xl); err != nil { return oi, err } - // Hold write lock on the object. 
- destLock := xl.nsMutex.NewNSLock(ctx, xl.getLockers(), bucket, object) - if err := destLock.GetLock(globalObjectTimeout); err != nil { - return oi, err - } - defer destLock.Unlock() - - uploadIDPath := xl.getUploadIDDir(bucket, object, uploadID) - uploadIDLockPath := xl.getUploadIDLockPath(bucket, object, uploadID) - - // Hold lock so that - // - // 1) no one aborts this multipart upload - // - // 2) no one does a parallel complete-multipart-upload on this - // multipart upload - uploadIDLock := xl.nsMutex.NewNSLock(ctx, xl.getLockers(), minioMetaMultipartBucket, uploadIDLockPath) - if err := uploadIDLock.GetLock(globalOperationTimeout); err != nil { - return oi, err - } - defer uploadIDLock.Unlock() if err := xl.checkUploadIDExists(ctx, bucket, object, uploadID); err != nil { return oi, toObjectErr(err, bucket, object, uploadID) @@ -638,6 +582,8 @@ func (xl xlObjects) CompleteMultipartUpload(ctx context.Context, bucket string, // Calculate s3 compatible md5sum for complete multipart. s3MD5 := getCompleteMultipartMD5(parts) + uploadIDPath := xl.getUploadIDDir(bucket, object, uploadID) + // Read metadata associated with the object from all disks. partsMetadata, errs := readAllXLMetadata(ctx, xl.getDisks(), minioMetaMultipartBucket, uploadIDPath) @@ -820,22 +766,13 @@ func (xl xlObjects) AbortMultipartUpload(ctx context.Context, bucket, object, up if err := checkAbortMultipartArgs(ctx, bucket, object, xl); err != nil { return err } - // Construct uploadIDPath. - uploadIDPath := xl.getUploadIDDir(bucket, object, uploadID) - uploadIDLockPath := xl.getUploadIDLockPath(bucket, object, uploadID) - // Hold lock so that there is no competing - // complete-multipart-upload or put-object-part. - uploadIDLock := xl.nsMutex.NewNSLock(ctx, xl.getLockers(), minioMetaMultipartBucket, uploadIDLockPath) - if err := uploadIDLock.GetLock(globalOperationTimeout); err != nil { - return err - } - defer uploadIDLock.Unlock() - // Validates if upload ID exists. if err := xl.checkUploadIDExists(ctx, bucket, object, uploadID); err != nil { return toObjectErr(err, bucket, object, uploadID) } + uploadIDPath := xl.getUploadIDDir(bucket, object, uploadID) + // Read metadata associated with the object from all disks. partsMetadata, errs := readAllXLMetadata(ctx, xl.getDisks(), minioMetaMultipartBucket, uploadIDPath) diff --git a/cmd/xl-v1-multipart_test.go b/cmd/xl-v1-multipart_test.go index 40d7cde38..cef56b1e1 100644 --- a/cmd/xl-v1-multipart_test.go +++ b/cmd/xl-v1-multipart_test.go @@ -32,7 +32,8 @@ func TestXLCleanupStaleMultipartUploads(t *testing.T) { // Defer cleanup of backend directories defer removeRoots(fsDirs) - xl := obj.(*xlObjects) + z := obj.(*xlZones) + xl := z.zones[0].sets[0] // Close the go-routine, we are going to // manually start it and test in this test case. diff --git a/cmd/xl-v1-object.go b/cmd/xl-v1-object.go index 51adada72..fb9f5764f 100644 --- a/cmd/xl-v1-object.go +++ b/cmd/xl-v1-object.go @@ -123,27 +123,7 @@ func (xl xlObjects) CopyObject(ctx context.Context, srcBucket, srcObject, dstBuc // GetObjectNInfo - returns object info and an object // Read(Closer). When err != nil, the returned reader is always nil. 
func (xl xlObjects) GetObjectNInfo(ctx context.Context, bucket, object string, rs *HTTPRangeSpec, h http.Header, lockType LockType, opts ObjectOptions) (gr *GetObjectReader, err error) { - var nsUnlocker = func() {} - - // Acquire lock - if lockType != noLock { - lock := xl.nsMutex.NewNSLock(ctx, xl.getLockers(), bucket, object) - switch lockType { - case writeLock: - if err = lock.GetLock(globalObjectTimeout); err != nil { - return nil, err - } - nsUnlocker = lock.Unlock - case readLock: - if err = lock.GetRLock(globalObjectTimeout); err != nil { - return nil, err - } - nsUnlocker = lock.RUnlock - } - } - if err = checkGetObjArgs(ctx, bucket, object); err != nil { - nsUnlocker() return nil, err } @@ -152,20 +132,18 @@ func (xl xlObjects) GetObjectNInfo(ctx context.Context, bucket, object string, r if hasSuffix(object, SlashSeparator) { var objInfo ObjectInfo if objInfo, err = xl.getObjectInfoDir(ctx, bucket, object); err != nil { - nsUnlocker() return nil, toObjectErr(err, bucket, object) } - return NewGetObjectReaderFromReader(bytes.NewBuffer(nil), objInfo, opts.CheckCopyPrecondFn, nsUnlocker) + return NewGetObjectReaderFromReader(bytes.NewBuffer(nil), objInfo, opts.CheckCopyPrecondFn) } var objInfo ObjectInfo objInfo, err = xl.getObjectInfo(ctx, bucket, object) if err != nil { - nsUnlocker() return nil, toObjectErr(err, bucket, object) } - fn, off, length, nErr := NewGetObjectReader(rs, objInfo, opts.CheckCopyPrecondFn, nsUnlocker) + fn, off, length, nErr := NewGetObjectReader(rs, objInfo, opts.CheckCopyPrecondFn) if nErr != nil { return nil, nErr } @@ -189,12 +167,6 @@ func (xl xlObjects) GetObjectNInfo(ctx context.Context, bucket, object string, r // startOffset indicates the starting read location of the object. // length indicates the total length of the object. func (xl xlObjects) GetObject(ctx context.Context, bucket, object string, startOffset int64, length int64, writer io.Writer, etag string, opts ObjectOptions) error { - // Lock the object before reading. - objectLock := xl.nsMutex.NewNSLock(ctx, xl.getLockers(), bucket, object) - if err := objectLock.GetRLock(globalObjectTimeout); err != nil { - return err - } - defer objectLock.RUnlock() return xl.getObject(ctx, bucket, object, startOffset, length, writer, etag, opts) } @@ -368,13 +340,6 @@ func (xl xlObjects) getObjectInfoDir(ctx context.Context, bucket, object string) // GetObjectInfo - reads object metadata and replies back ObjectInfo. func (xl xlObjects) GetObjectInfo(ctx context.Context, bucket, object string, opts ObjectOptions) (oi ObjectInfo, e error) { - // Lock the object before reading. - objectLock := xl.nsMutex.NewNSLock(ctx, xl.getLockers(), bucket, object) - if err := objectLock.GetRLock(globalObjectTimeout); err != nil { - return oi, err - } - defer objectLock.RUnlock() - if err := checkGetObjArgs(ctx, bucket, object); err != nil { return oi, err } @@ -497,13 +462,6 @@ func (xl xlObjects) PutObject(ctx context.Context, bucket string, object string, return ObjectInfo{}, err } - // Lock the object. 
- objectLock := xl.nsMutex.NewNSLock(ctx, xl.getLockers(), bucket, object) - if err := objectLock.GetLock(globalObjectTimeout); err != nil { - return objInfo, err - } - defer objectLock.Unlock() - return xl.putObject(ctx, bucket, object, data, opts) } @@ -844,20 +802,6 @@ func (xl xlObjects) deleteObjects(ctx context.Context, bucket string, objects [] errs[i] = checkDelObjArgs(ctx, bucket, object) } - var objectLocks = make([]RWLocker, len(objects)) - - for i, object := range objects { - if errs[i] != nil { - continue - } - // Acquire a write lock before deleting the object. - objectLocks[i] = xl.nsMutex.NewNSLock(ctx, xl.getLockers(), bucket, object) - if errs[i] = objectLocks[i].GetLock(globalOperationTimeout); errs[i] != nil { - continue - } - defer objectLocks[i].Unlock() - } - for i, object := range objects { isObjectDirs[i] = hasSuffix(object, SlashSeparator) } @@ -953,13 +897,6 @@ func (xl xlObjects) DeleteObjects(ctx context.Context, bucket string, objects [] // any error as it is not necessary for the handler to reply back a // response to the client request. func (xl xlObjects) DeleteObject(ctx context.Context, bucket, object string) (err error) { - // Acquire a write lock before deleting the object. - objectLock := xl.nsMutex.NewNSLock(ctx, xl.getLockers(), bucket, object) - if perr := objectLock.GetLock(globalOperationTimeout); perr != nil { - return perr - } - defer objectLock.Unlock() - if err = checkDelObjArgs(ctx, bucket, object); err != nil { return err } diff --git a/cmd/xl-v1-object_test.go b/cmd/xl-v1-object_test.go index f56b96851..45288f583 100644 --- a/cmd/xl-v1-object_test.go +++ b/cmd/xl-v1-object_test.go @@ -125,7 +125,9 @@ func TestXLDeleteObjectsXLSet(t *testing.T) { for _, dir := range fsDirs { defer os.RemoveAll(dir) } - objs = append(objs, obj.(*xlObjects)) + z := obj.(*xlZones) + xl := z.zones[0].sets[0] + objs = append(objs, xl) } xlSets := &xlSets{sets: objs, distributionAlgo: "CRCMOD"} @@ -192,8 +194,11 @@ func TestXLDeleteObjectDiskNotFound(t *testing.T) { if err != nil { t.Fatal(err) } + // Cleanup backend directories + defer removeRoots(fsDirs) - xl := obj.(*xlObjects) + z := obj.(*xlZones) + xl := z.zones[0].sets[0] // Create "bucket" err = obj.MakeBucketWithLocation(context.Background(), "bucket", "") @@ -211,8 +216,12 @@ func TestXLDeleteObjectDiskNotFound(t *testing.T) { } // for a 16 disk setup, quorum is 9. To simulate disks not found yet // quorum is available, we remove disks leaving quorum disks behind. - for i := range xl.storageDisks[:7] { - xl.storageDisks[i] = newNaughtyDisk(xl.storageDisks[i], nil, errFaultyDisk) + xlDisks := xl.getDisks() + xl.getDisks = func() []StorageAPI { + for i := range xlDisks[:7] { + xlDisks[i] = newNaughtyDisk(xlDisks[i], nil, errFaultyDisk) + } + return xlDisks } err = obj.DeleteObject(context.Background(), bucket, object) if err != nil { @@ -226,15 +235,17 @@ func TestXLDeleteObjectDiskNotFound(t *testing.T) { } // Remove one more disk to 'lose' quorum, by setting it to nil. 
- xl.storageDisks[7] = nil - xl.storageDisks[8] = nil + xlDisks = xl.getDisks() + xl.getDisks = func() []StorageAPI { + xlDisks[7] = nil + xlDisks[8] = nil + return xlDisks + } err = obj.DeleteObject(context.Background(), bucket, object) // since majority of disks are not available, metaquorum is not achieved and hence errXLReadQuorum error if err != toObjectErr(errXLReadQuorum, bucket, object) { t.Errorf("Expected deleteObject to fail with %v, but failed with %v", toObjectErr(errXLReadQuorum, bucket, object), err) } - // Cleanup backend directories - removeRoots(fsDirs) } func TestGetObjectNoQuorum(t *testing.T) { @@ -243,8 +254,11 @@ func TestGetObjectNoQuorum(t *testing.T) { if err != nil { t.Fatal(err) } + // Cleanup backend directories. + defer removeRoots(fsDirs) - xl := obj.(*xlObjects) + z := obj.(*xlZones) + xl := z.zones[0].sets[0] // Create "bucket" err = obj.MakeBucketWithLocation(context.Background(), "bucket", "") @@ -270,22 +284,24 @@ func TestGetObjectNoQuorum(t *testing.T) { for i := 0; i <= f; i++ { diskErrors[i] = nil } - for i := range xl.storageDisks[:9] { - switch diskType := xl.storageDisks[i].(type) { + xlDisks := xl.getDisks() + for i := range xlDisks[:9] { + switch diskType := xlDisks[i].(type) { case *naughtyDisk: - xl.storageDisks[i] = newNaughtyDisk(diskType.disk, diskErrors, errFaultyDisk) + xlDisks[i] = newNaughtyDisk(diskType.disk, diskErrors, errFaultyDisk) default: - xl.storageDisks[i] = newNaughtyDisk(xl.storageDisks[i], diskErrors, errFaultyDisk) + xlDisks[i] = newNaughtyDisk(xlDisks[i], diskErrors, errFaultyDisk) } } + xl.getDisks = func() []StorageAPI { + return xlDisks + } // Fetch object from store. err = xl.GetObject(context.Background(), bucket, object, 0, int64(len("abcd")), ioutil.Discard, "", opts) if err != toObjectErr(errXLReadQuorum, bucket, object) { t.Errorf("Expected putObject to fail with %v, but failed with %v", toObjectErr(errXLWriteQuorum, bucket, object), err) } } - // Cleanup backend directories. - removeRoots(fsDirs) } func TestPutObjectNoQuorum(t *testing.T) { @@ -295,7 +311,11 @@ func TestPutObjectNoQuorum(t *testing.T) { t.Fatal(err) } - xl := obj.(*xlObjects) + // Cleanup backend directories. + defer removeRoots(fsDirs) + + z := obj.(*xlZones) + xl := z.zones[0].sets[0] // Create "bucket" err = obj.MakeBucketWithLocation(context.Background(), "bucket", "") @@ -321,22 +341,24 @@ func TestPutObjectNoQuorum(t *testing.T) { for i := 0; i <= f; i++ { diskErrors[i] = nil } - for i := range xl.storageDisks[:9] { - switch diskType := xl.storageDisks[i].(type) { + xlDisks := xl.getDisks() + for i := range xlDisks[:9] { + switch diskType := xlDisks[i].(type) { case *naughtyDisk: - xl.storageDisks[i] = newNaughtyDisk(diskType.disk, diskErrors, errFaultyDisk) + xlDisks[i] = newNaughtyDisk(diskType.disk, diskErrors, errFaultyDisk) default: - xl.storageDisks[i] = newNaughtyDisk(xl.storageDisks[i], diskErrors, errFaultyDisk) + xlDisks[i] = newNaughtyDisk(xlDisks[i], diskErrors, errFaultyDisk) } } + xl.getDisks = func() []StorageAPI { + return xlDisks + } // Upload new content to same object "object" _, err = obj.PutObject(context.Background(), bucket, object, mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), opts) if err != toObjectErr(errXLWriteQuorum, bucket, object) { t.Errorf("Expected putObject to fail with %v, but failed with %v", toObjectErr(errXLWriteQuorum, bucket, object), err) } } - // Cleanup backend directories. - removeRoots(fsDirs) } // Tests both object and bucket healing. 
@@ -346,7 +368,9 @@ func TestHealing(t *testing.T) { t.Fatal(err) } defer removeRoots(fsDirs) - xl := obj.(*xlObjects) + + z := obj.(*xlZones) + xl := z.zones[0].sets[0] // Create "bucket" err = obj.MakeBucketWithLocation(context.Background(), "bucket", "") @@ -369,7 +393,7 @@ func TestHealing(t *testing.T) { t.Fatal(err) } - disk := xl.storageDisks[0] + disk := xl.getDisks()[0] xlMetaPreHeal, err := readXLMeta(context.Background(), disk, bucket, object) if err != nil { t.Fatal(err) @@ -438,7 +462,7 @@ func TestHealing(t *testing.T) { t.Fatal(err) } // Stat the bucket to make sure that it was created. - _, err = xl.storageDisks[0].StatVol(bucket) + _, err = xl.getDisks()[0].StatVol(bucket) if err != nil { t.Fatal(err) } @@ -454,9 +478,11 @@ func testObjectQuorumFromMeta(obj ObjectLayer, instanceType string, dirs []strin var opts ObjectOptions // make data with more than one part partCount := 3 - data := bytes.Repeat([]byte("a"), int(globalPutPartSize)*partCount) - xl := obj.(*xlObjects) - xlDisks := xl.storageDisks + data := bytes.Repeat([]byte("a"), 6*1024*1024*partCount) + + z := obj.(*xlZones) + xl := z.zones[0].sets[0] + xlDisks := xl.getDisks() err := obj.MakeBucketWithLocation(context.Background(), bucket, globalMinioDefaultRegion) if err != nil { diff --git a/cmd/xl-v1-utils_test.go b/cmd/xl-v1-utils_test.go index 61cd4696d..3e6c60613 100644 --- a/cmd/xl-v1-utils_test.go +++ b/cmd/xl-v1-utils_test.go @@ -399,19 +399,19 @@ func TestShuffleDisks(t *testing.T) { if err != nil { t.Fatal(err) } - objLayer, _, err := initObjectLayer(mustGetNewEndpointList(disks...)) + objLayer, _, err := initObjectLayer(mustGetZoneEndpoints(disks...)) if err != nil { removeRoots(disks) t.Fatal(err) } defer removeRoots(disks) - xl := objLayer.(*xlObjects) - testShuffleDisks(t, xl) + z := objLayer.(*xlZones) + testShuffleDisks(t, z) } // Test shuffleDisks which returns shuffled slice of disks for their actual distribution. -func testShuffleDisks(t *testing.T, xl *xlObjects) { - disks := xl.storageDisks +func testShuffleDisks(t *testing.T, z *xlZones) { + disks := z.zones[0].GetDisks(0)() distribution := []int{16, 14, 12, 10, 8, 6, 4, 2, 1, 3, 5, 7, 9, 11, 13, 15} shuffledDisks := shuffleDisks(disks, distribution) // From the "distribution" above you can notice that: @@ -444,12 +444,12 @@ func TestEvalDisks(t *testing.T) { if err != nil { t.Fatal(err) } - objLayer, _, err := initObjectLayer(mustGetNewEndpointList(disks...)) + objLayer, _, err := initObjectLayer(mustGetZoneEndpoints(disks...)) if err != nil { removeRoots(disks) t.Fatal(err) } defer removeRoots(disks) - xl := objLayer.(*xlObjects) - testShuffleDisks(t, xl) + z := objLayer.(*xlZones) + testShuffleDisks(t, z) } diff --git a/cmd/xl-v1.go b/cmd/xl-v1.go index 159cefd2b..2e84d62d9 100644 --- a/cmd/xl-v1.go +++ b/cmd/xl-v1.go @@ -52,16 +52,13 @@ type xlObjects struct { // Byte pools used for temporary i/o buffers. bp *bpool.BytePoolCap - // TODO: Deprecated only kept here for tests, should be removed in future. - storageDisks []StorageAPI - // TODO: ListObjects pool management, should be removed in future. listPool *TreeWalkPool } // NewNSLock - initialize a new namespace RWLocker instance. func (xl xlObjects) NewNSLock(ctx context.Context, bucket string, object string) RWLocker { - return xl.nsMutex.NewNSLock(ctx, xl.getLockers(), bucket, object) + return xl.nsMutex.NewNSLock(ctx, xl.getLockers, bucket, object) } // Shutdown function for object storage interface. 
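As a rough illustration (not part of the patch) of how the new cmd/xl-zones.go below routes fresh uploads: getAvailableZoneIdx picks a zone at random, weighted by each zone's available space, by drawing a point in the total free space and walking the cumulative sum. The standalone Go sketch below assumes the per-zone free bytes are already known; zoneSpace and pickZone are hypothetical stand-ins for zoneAvailableSpace and getAvailableZoneIdx.

package main

import (
	"fmt"
	"math/rand"
)

// zoneSpace mirrors the idea of zoneAvailableSpace: a zone index plus its free bytes.
type zoneSpace struct {
	Index     int
	Available uint64
}

// pickZone draws a point in [0, total available) and walks the cumulative sum,
// so a zone is chosen with probability proportional to its free space.
func pickZone(zones []zoneSpace) int {
	var total uint64
	for _, z := range zones {
		total += z.Available
	}
	if total == 0 {
		return zones[0].Index // nothing available anywhere, fall back to zone 0
	}
	choose := rand.Uint64() % total
	var atTotal uint64
	for _, z := range zones {
		atTotal += z.Available
		if atTotal > choose && z.Available > 0 {
			return z.Index
		}
	}
	return zones[len(zones)-1].Index // not reached when totals are consistent
}

func main() {
	// Zone 1 has 9x the free space of zone 0, so it should win roughly 90% of picks.
	zones := []zoneSpace{{Index: 0, Available: 10 << 30}, {Index: 1, Available: 90 << 30}}
	counts := map[int]int{}
	for i := 0; i < 1000; i++ {
		counts[pickZone(zones)]++
	}
	fmt.Println(counts)
}
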
diff --git a/cmd/xl-zones.go b/cmd/xl-zones.go new file mode 100644 index 000000000..e1e3bfcfc --- /dev/null +++ b/cmd/xl-zones.go @@ -0,0 +1,1299 @@ +/* + * MinIO Cloud Storage, (C) 2019 MinIO, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package cmd + +import ( + "context" + "fmt" + "io" + "math/rand" + "net/http" + "strings" + + xhttp "github.com/minio/minio/cmd/http" + "github.com/minio/minio/cmd/logger" + "github.com/minio/minio/pkg/lifecycle" + "github.com/minio/minio/pkg/madmin" + "github.com/minio/minio/pkg/policy" + "github.com/minio/minio/pkg/sync/errgroup" +) + +type xlZones struct { + zones []*xlSets +} + +func (z *xlZones) SingleZone() bool { + return len(z.zones) == 1 +} + +func (z *xlZones) quickHealBuckets(ctx context.Context) { + bucketsInfo, err := z.ListBucketsHeal(ctx) + if err != nil { + return + } + for _, bucket := range bucketsInfo { + z.HealBucket(ctx, bucket.Name, false, false) + } +} + +// Initialize new zone of erasure codes. +func newXLZones(endpointZones EndpointZones, formats []*formatXLV3) (ObjectLayer, error) { + z := &xlZones{} + for i, ep := range endpointZones { + sets, err := newXLSets(ep.Endpoints, formats[i], ep.SetCount, ep.DrivesPerSet) + if err != nil { + return nil, err + } + z.zones = append(z.zones, sets) + } + z.quickHealBuckets(context.Background()) + return z, nil +} + +func (z *xlZones) NewNSLock(ctx context.Context, bucket string, object string) RWLocker { + return z.zones[0].NewNSLock(ctx, bucket, object) +} + +type zonesAvailableSpace []zoneAvailableSpace + +type zoneAvailableSpace struct { + Index int + Available uint64 +} + +// TotalAvailable - total available space +func (p zonesAvailableSpace) TotalAvailable() uint64 { + total := uint64(0) + for _, z := range p { + total += z.Available + } + return total +} + +func (z *xlZones) getAvailableZoneIdx(ctx context.Context) int { + zones := z.getZonesAvailableSpace(ctx) + total := zones.TotalAvailable() + if total == 0 { + // Houston, we have a problem, maybe panic?? + return zones[0].Index + } + // choose when we reach this many + choose := rand.Uint64() % total + atTotal := uint64(0) + for _, zone := range zones { + atTotal += zone.Available + if atTotal > choose && zone.Available > 0 { + return zone.Index + } + } + // Should not happen, but print values just in case. + panic(fmt.Errorf("reached end of zones (total: %v, atTotal: %v, choose: %v)", total, atTotal, choose)) +} + +func (z *xlZones) getZonesAvailableSpace(ctx context.Context) zonesAvailableSpace { + var zones = make(zonesAvailableSpace, len(z.zones)) + + storageInfos := make([]StorageInfo, len(z.zones)) + g := errgroup.WithNErrs(len(z.zones)) + for index := range z.zones { + index := index + g.Go(func() error { + storageInfos[index] = z.zones[index].StorageInfo(ctx) + return nil + }, index) + } + + // Wait for the go routines. 
+ g.Wait() + + for i, zinfo := range storageInfos { + var available uint64 + for _, davailable := range zinfo.Available { + available += davailable + } + zones[i] = zoneAvailableSpace{ + Index: i, + Available: available, + } + } + return zones +} + +func (z *xlZones) Shutdown(ctx context.Context) error { + if z.SingleZone() { + return z.zones[0].Shutdown(ctx) + } + + g := errgroup.WithNErrs(len(z.zones)) + + for index := range z.zones { + index := index + g.Go(func() error { + return z.zones[index].Shutdown(ctx) + }, index) + } + + for _, err := range g.Wait() { + if err != nil { + logger.LogIf(ctx, err) + } + // let's the rest shutdown + } + + return nil +} + +func (z *xlZones) StorageInfo(ctx context.Context) StorageInfo { + if z.SingleZone() { + return z.zones[0].StorageInfo(ctx) + } + + var storageInfo StorageInfo + + storageInfos := make([]StorageInfo, len(z.zones)) + g := errgroup.WithNErrs(len(z.zones)) + for index := range z.zones { + index := index + g.Go(func() error { + storageInfos[index] = z.zones[index].StorageInfo(ctx) + return nil + }, index) + } + + // Wait for the go routines. + g.Wait() + + for _, lstorageInfo := range storageInfos { + storageInfo.Used = append(storageInfo.Used, lstorageInfo.Used...) + storageInfo.Total = append(storageInfo.Total, lstorageInfo.Total...) + storageInfo.Available = append(storageInfo.Available, lstorageInfo.Available...) + storageInfo.MountPaths = append(storageInfo.MountPaths, lstorageInfo.MountPaths...) + storageInfo.Backend.OnlineDisks = storageInfo.Backend.OnlineDisks.Merge(lstorageInfo.Backend.OnlineDisks) + storageInfo.Backend.OfflineDisks = storageInfo.Backend.OfflineDisks.Merge(lstorageInfo.Backend.OfflineDisks) + storageInfo.Backend.Sets = append(storageInfo.Backend.Sets, lstorageInfo.Backend.Sets...) + } + + storageInfo.Backend.Type = storageInfos[0].Backend.Type + storageInfo.Backend.StandardSCData = storageInfos[0].Backend.StandardSCData + storageInfo.Backend.StandardSCParity = storageInfos[0].Backend.StandardSCParity + storageInfo.Backend.RRSCData = storageInfos[0].Backend.RRSCData + storageInfo.Backend.RRSCParity = storageInfos[0].Backend.RRSCParity + + return storageInfo +} + +// This function is used to undo a successful MakeBucket operation. +func undoMakeBucketZones(bucket string, zones []*xlSets, errs []error) { + g := errgroup.WithNErrs(len(zones)) + + // Undo previous make bucket entry on all underlying zones. + for index := range zones { + index := index + g.Go(func() error { + if errs[index] == nil { + return zones[index].DeleteBucket(context.Background(), bucket) + } + return nil + }, index) + } + + // Wait for all delete bucket to finish. + g.Wait() +} + +// MakeBucketWithLocation - creates a new bucket across all zones simultaneously +// even if one of the sets fail to create buckets, we proceed all the successful +// operations. +func (z *xlZones) MakeBucketWithLocation(ctx context.Context, bucket, location string) error { + if z.SingleZone() { + return z.zones[0].MakeBucketWithLocation(ctx, bucket, location) + } + + g := errgroup.WithNErrs(len(z.zones)) + + // Create buckets in parallel across all sets. + for index := range z.zones { + index := index + g.Go(func() error { + return z.zones[index].MakeBucketWithLocation(ctx, bucket, location) + }, index) + } + + errs := g.Wait() + // Upon even a single write quorum error we undo all previously created buckets. 
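+ // undoMakeBucketZones deletes the bucket only from the zones whose create
+ // call succeeded (errs[index] == nil); failed zones are left untouched.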
+ for _, err := range errs { + if err != nil { + if _, ok := err.(InsufficientWriteQuorum); ok { + undoMakeBucketZones(bucket, z.zones, errs) + } + return err + } + } + + // Success. + return nil + +} + +func (z *xlZones) GetObjectNInfo(ctx context.Context, bucket, object string, rs *HTTPRangeSpec, h http.Header, lockType LockType, opts ObjectOptions) (gr *GetObjectReader, err error) { + var nsUnlocker = func() {} + + // Acquire lock + if lockType != noLock { + lock := z.NewNSLock(ctx, bucket, object) + switch lockType { + case writeLock: + if err = lock.GetLock(globalObjectTimeout); err != nil { + return nil, err + } + nsUnlocker = lock.Unlock + case readLock: + if err = lock.GetRLock(globalObjectTimeout); err != nil { + return nil, err + } + nsUnlocker = lock.RUnlock + } + } + + for _, zone := range z.zones { + gr, err := zone.GetObjectNInfo(ctx, bucket, object, rs, h, lockType, opts) + if err != nil { + if isErrObjectNotFound(err) { + continue + } + nsUnlocker() + return nil, err + } + gr.cleanUpFns = append(gr.cleanUpFns, nsUnlocker) + return gr, nil + } + nsUnlocker() + return nil, ObjectNotFound{Bucket: bucket, Object: object} +} + +func (z *xlZones) GetObject(ctx context.Context, bucket, object string, startOffset int64, length int64, writer io.Writer, etag string, opts ObjectOptions) error { + // Lock the object before reading. + objectLock := z.NewNSLock(ctx, bucket, object) + if err := objectLock.GetRLock(globalObjectTimeout); err != nil { + return err + } + defer objectLock.RUnlock() + + if z.SingleZone() { + return z.zones[0].GetObject(ctx, bucket, object, startOffset, length, writer, etag, opts) + } + for _, zone := range z.zones { + if err := zone.GetObject(ctx, bucket, object, startOffset, length, writer, etag, opts); err != nil { + if isErrObjectNotFound(err) { + continue + } + return err + } + return nil + } + return ObjectNotFound{Bucket: bucket, Object: object} +} + +func (z *xlZones) GetObjectInfo(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) { + // Lock the object before reading. + objectLock := z.NewNSLock(ctx, bucket, object) + if err := objectLock.GetRLock(globalObjectTimeout); err != nil { + return ObjectInfo{}, err + } + defer objectLock.RUnlock() + + if z.SingleZone() { + return z.zones[0].GetObjectInfo(ctx, bucket, object, opts) + } + for _, zone := range z.zones { + objInfo, err := zone.GetObjectInfo(ctx, bucket, object, opts) + if err != nil { + if isErrObjectNotFound(err) { + continue + } + return objInfo, err + } + return objInfo, nil + } + return ObjectInfo{}, ObjectNotFound{Bucket: bucket, Object: object} +} + +// PutObject - writes an object to least used erasure zone. +func (z *xlZones) PutObject(ctx context.Context, bucket string, object string, data *PutObjReader, opts ObjectOptions) (ObjectInfo, error) { + // Lock the object. + objectLock := z.NewNSLock(ctx, bucket, object) + if err := objectLock.GetLock(globalObjectTimeout); err != nil { + return ObjectInfo{}, err + } + defer objectLock.Unlock() + + if z.SingleZone() { + return z.zones[0].PutObject(ctx, bucket, object, data, opts) + } + + for _, zone := range z.zones { + objInfo, err := zone.GetObjectInfo(ctx, bucket, object, opts) + if err != nil { + if isErrObjectNotFound(err) { + continue + } + return objInfo, err + } + // Overwrite request upload to right zone. + return zone.PutObject(ctx, bucket, object, data, opts) + } + // Object not found pick the least used and upload to this zone. 
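+ // "Least used" here is a space-weighted random pick: getAvailableZoneIdx
+ // chooses a zone with probability proportional to its available space.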
+ return z.zones[z.getAvailableZoneIdx(ctx)].PutObject(ctx, bucket, object, data, opts) +} + +func (z *xlZones) DeleteObject(ctx context.Context, bucket string, object string) error { + // Acquire a write lock before deleting the object. + objectLock := z.NewNSLock(ctx, bucket, object) + if err := objectLock.GetLock(globalOperationTimeout); err != nil { + return err + } + defer objectLock.Unlock() + + if z.SingleZone() { + return z.zones[0].DeleteObject(ctx, bucket, object) + } + for _, zone := range z.zones { + err := zone.DeleteObject(ctx, bucket, object) + if err != nil && !isErrObjectNotFound(err) { + return err + } + } + return nil +} + +func (z *xlZones) DeleteObjects(ctx context.Context, bucket string, objects []string) ([]error, error) { + derrs := make([]error, len(objects)) + for i := range derrs { + derrs[i] = checkDelObjArgs(ctx, bucket, objects[i]) + } + + var objectLocks = make([]RWLocker, len(objects)) + for i := range objects { + if derrs[i] != nil { + continue + } + + // Acquire a write lock before deleting the object. + objectLocks[i] = z.NewNSLock(ctx, bucket, objects[i]) + if derrs[i] = objectLocks[i].GetLock(globalOperationTimeout); derrs[i] != nil { + continue + } + + defer objectLocks[i].Unlock() + } + + for _, zone := range z.zones { + errs, err := zone.DeleteObjects(ctx, bucket, objects) + if err != nil { + return nil, err + } + for i, derr := range errs { + if derrs[i] == nil { + if derr != nil && !isErrObjectNotFound(derr) { + derrs[i] = derr + } + } + } + } + return derrs, nil +} + +func (z *xlZones) CopyObject(ctx context.Context, srcBucket, srcObject, destBucket, destObject string, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (objInfo ObjectInfo, err error) { + // Check if this request is only metadata update. + cpSrcDstSame := isStringEqual(pathJoin(srcBucket, srcObject), pathJoin(destBucket, destObject)) + if !cpSrcDstSame { + objectLock := z.NewNSLock(ctx, destBucket, destObject) + if err := objectLock.GetLock(globalObjectTimeout); err != nil { + return objInfo, err + } + defer objectLock.Unlock() + } + + if z.SingleZone() { + return z.zones[0].CopyObject(ctx, srcBucket, srcObject, destBucket, destObject, srcInfo, srcOpts, dstOpts) + } + if cpSrcDstSame && srcInfo.metadataOnly { + for _, zone := range z.zones { + objInfo, err = zone.CopyObject(ctx, srcBucket, srcObject, destBucket, + destObject, srcInfo, srcOpts, dstOpts) + if err != nil { + if isErrObjectNotFound(err) { + continue + } + return objInfo, err + } + return objInfo, nil + } + return objInfo, ObjectNotFound{Bucket: srcBucket, Object: srcObject} + } + return z.zones[z.getAvailableZoneIdx(ctx)].CopyObject(ctx, srcBucket, srcObject, + destBucket, destObject, srcInfo, srcOpts, dstOpts) +} + +func (z *xlZones) ListObjectsV2(ctx context.Context, bucket, prefix, continuationToken, delimiter string, maxKeys int, fetchOwner bool, startAfter string) (ListObjectsV2Info, error) { + if z.SingleZone() { + return z.zones[0].ListObjectsV2(ctx, bucket, prefix, continuationToken, delimiter, maxKeys, fetchOwner, startAfter) + } + marker := continuationToken + if marker == "" { + marker = startAfter + } + + loi, err := z.ListObjects(ctx, bucket, prefix, marker, delimiter, maxKeys) + if err != nil { + return ListObjectsV2Info{}, err + } + + listObjectsV2Info := ListObjectsV2Info{ + IsTruncated: loi.IsTruncated, + ContinuationToken: continuationToken, + NextContinuationToken: loi.NextMarker, + Objects: loi.Objects, + Prefixes: loi.Prefixes, + } + return listObjectsV2Info, err +} + +func (z *xlZones) 
listObjectsNonSlash(ctx context.Context, bucket, prefix, marker, delimiter string, maxKeys int) (loi ListObjectsInfo, err error) { + + var zonesEntryChs [][]FileInfoCh + + recursive := true + for _, zone := range z.zones { + endWalkCh := make(chan struct{}) + defer close(endWalkCh) + zonesEntryChs = append(zonesEntryChs, + zone.startMergeWalks(ctx, bucket, prefix, "", recursive, endWalkCh)) + } + + var objInfos []ObjectInfo + var eof bool + var prevPrefix string + + var zoneDrivesPerSet []int + for _, zone := range z.zones { + zoneDrivesPerSet = append(zoneDrivesPerSet, zone.drivesPerSet) + } + + var zonesEntriesInfos [][]FileInfo + var zonesEntriesValid [][]bool + for _, entryChs := range zonesEntryChs { + zonesEntriesInfos = append(zonesEntriesInfos, make([]FileInfo, len(entryChs))) + zonesEntriesValid = append(zonesEntriesValid, make([]bool, len(entryChs))) + } + + for { + if len(objInfos) == maxKeys { + break + } + result, quorumCount, zoneIndex, ok := leastEntryZone(zonesEntryChs, zonesEntriesInfos, zonesEntriesValid) + if !ok { + eof = true + break + } + rquorum := result.Quorum + // Quorum is zero for all directories. + if rquorum == 0 { + // Choose N/2 quorum for directory entries. + rquorum = zoneDrivesPerSet[zoneIndex] / 2 + } + if quorumCount < rquorum { + continue + } + + var objInfo ObjectInfo + + index := strings.Index(strings.TrimPrefix(result.Name, prefix), delimiter) + if index == -1 { + objInfo = ObjectInfo{ + IsDir: false, + Bucket: bucket, + Name: result.Name, + ModTime: result.ModTime, + Size: result.Size, + ContentType: result.Metadata["content-type"], + ContentEncoding: result.Metadata["content-encoding"], + } + + // Extract etag from metadata. + objInfo.ETag = extractETag(result.Metadata) + + // All the parts per object. + objInfo.Parts = result.Parts + + // etag/md5Sum has already been extracted. We need to + // remove to avoid it from appearing as part of + // response headers. e.g, X-Minio-* or X-Amz-*. + objInfo.UserDefined = cleanMetadata(result.Metadata) + + // Update storage class + if sc, ok := result.Metadata[xhttp.AmzStorageClass]; ok { + objInfo.StorageClass = sc + } else { + objInfo.StorageClass = globalMinioDefaultStorageClass + } + } else { + index = len(prefix) + index + len(delimiter) + currPrefix := result.Name[:index] + if currPrefix == prevPrefix { + continue + } + prevPrefix = currPrefix + + objInfo = ObjectInfo{ + Bucket: bucket, + Name: currPrefix, + IsDir: true, + } + } + + if objInfo.Name <= marker { + continue + } + + objInfos = append(objInfos, objInfo) + } + + result := ListObjectsInfo{} + for _, objInfo := range objInfos { + if objInfo.IsDir { + result.Prefixes = append(result.Prefixes, objInfo.Name) + continue + } + result.Objects = append(result.Objects, objInfo) + } + + if !eof { + result.IsTruncated = true + if len(objInfos) > 0 { + result.NextMarker = objInfos[len(objInfos)-1].Name + } + } + + return result, nil +} + +func (z *xlZones) listObjects(ctx context.Context, bucket, prefix, marker, delimiter string, maxKeys int, heal bool) (ListObjectsInfo, error) { + loi := ListObjectsInfo{} + + if err := checkListObjsArgs(ctx, bucket, prefix, marker, delimiter, z); err != nil { + return loi, err + } + + // Marker is set validate pre-condition. + if marker != "" { + // Marker not common with prefix is not implemented. Send an empty response + if !hasPrefix(marker, prefix) { + return loi, nil + } + } + + // With max keys of zero we have reached eof, return right here. 
+ if maxKeys == 0 { + return loi, nil + } + + // For delimiter and prefix as '/' we do not list anything at all + // since according to s3 spec we stop at the 'delimiter' + // along // with the prefix. On a flat namespace with 'prefix' + // as '/' we don't have any entries, since all the keys are + // of form 'keyName/...' + if delimiter == SlashSeparator && prefix == SlashSeparator { + return loi, nil + } + + // Over flowing count - reset to maxObjectList. + if maxKeys < 0 || maxKeys > maxObjectList { + maxKeys = maxObjectList + } + + if delimiter != SlashSeparator && delimiter != "" { + // "heal" option passed can be ignored as the heal-listing does not send non-standard delimiter. + return z.listObjectsNonSlash(ctx, bucket, prefix, marker, delimiter, maxKeys) + } + + // Default is recursive, if delimiter is set then list non recursive. + recursive := true + if delimiter == SlashSeparator { + recursive = false + } + + var zonesEntryChs [][]FileInfoCh + var zonesEndWalkCh []chan struct{} + + for _, zone := range z.zones { + entryChs, endWalkCh := zone.pool.Release(listParams{bucket, recursive, marker, prefix, heal}) + if entryChs == nil { + endWalkCh = make(chan struct{}) + entryChs = zone.startMergeWalks(ctx, bucket, prefix, marker, recursive, endWalkCh) + } + zonesEntryChs = append(zonesEntryChs, entryChs) + zonesEndWalkCh = append(zonesEndWalkCh, endWalkCh) + } + + var zoneDrivesPerSet []int + for _, zone := range z.zones { + zoneDrivesPerSet = append(zoneDrivesPerSet, zone.drivesPerSet) + } + + entries := mergeZonesEntriesCh(zonesEntryChs, maxKeys, zoneDrivesPerSet, heal) + if len(entries.Files) == 0 { + return loi, nil + } + + loi.IsTruncated = entries.IsTruncated + if loi.IsTruncated { + loi.NextMarker = entries.Files[len(entries.Files)-1].Name + } + + for _, entry := range entries.Files { + var objInfo ObjectInfo + if hasSuffix(entry.Name, SlashSeparator) { + if !recursive { + loi.Prefixes = append(loi.Prefixes, entry.Name) + continue + } + objInfo = ObjectInfo{ + Bucket: bucket, + Name: entry.Name, + IsDir: true, + } + } else { + objInfo = ObjectInfo{ + IsDir: false, + Bucket: bucket, + Name: entry.Name, + ModTime: entry.ModTime, + Size: entry.Size, + ContentType: entry.Metadata["content-type"], + ContentEncoding: entry.Metadata["content-encoding"], + } + + // Extract etag from metadata. + objInfo.ETag = extractETag(entry.Metadata) + + // All the parts per object. + objInfo.Parts = entry.Parts + + // etag/md5Sum has already been extracted. We need to + // remove to avoid it from appearing as part of + // response headers. e.g, X-Minio-* or X-Amz-*. + objInfo.UserDefined = cleanMetadata(entry.Metadata) + + // Update storage class + if sc, ok := entry.Metadata[xhttp.AmzStorageClass]; ok { + objInfo.StorageClass = sc + } else { + objInfo.StorageClass = globalMinioDefaultStorageClass + } + } + loi.Objects = append(loi.Objects, objInfo) + } + if loi.IsTruncated { + for i, zone := range z.zones { + zone.pool.Set(listParams{bucket, recursive, loi.NextMarker, prefix, heal}, zonesEntryChs[i], + zonesEndWalkCh[i]) + } + } + return loi, nil +} + +// Calculate least entry across zones and across multiple FileInfo +// channels, returns the least common entry and the total number of times +// we found this entry. Additionally also returns a boolean +// to indicate if the caller needs to call this function +// again to list the next entry. It is callers responsibility +// if the caller wishes to list N entries to call leastEntry +// N times until this boolean is 'false'. 
+func leastEntryZone(zoneEntryChs [][]FileInfoCh, zoneEntries [][]FileInfo, zoneEntriesValid [][]bool) (FileInfo, int, int, bool) { + for i, entryChs := range zoneEntryChs { + for j := range entryChs { + zoneEntries[i][j], zoneEntriesValid[i][j] = entryChs[j].Pop() + } + } + + var isTruncated = false + for _, entriesValid := range zoneEntriesValid { + for _, valid := range entriesValid { + if !valid { + continue + } + isTruncated = true + break + } + if isTruncated { + break + } + } + + var lentry FileInfo + var found bool + var zoneIndex = -1 + for i, entriesValid := range zoneEntriesValid { + for j, valid := range entriesValid { + if !valid { + continue + } + if !found { + lentry = zoneEntries[i][j] + found = true + zoneIndex = i + continue + } + if zoneEntries[i][j].Name < lentry.Name { + lentry = zoneEntries[i][j] + zoneIndex = i + } + } + } + + // We haven't been able to find any least entry, + // this would mean that we don't have valid entry. + if !found { + return lentry, 0, zoneIndex, isTruncated + } + + leastEntryCount := 0 + for i, entriesValid := range zoneEntriesValid { + for j, valid := range entriesValid { + if !valid { + continue + } + + // Entries are duplicated across disks, + // we should simply skip such entries. + if lentry.Name == zoneEntries[i][j].Name && lentry.ModTime.Equal(zoneEntries[i][j].ModTime) { + leastEntryCount++ + continue + } + + // Push all entries which are lexically higher + // and will be returned later in Pop() + zoneEntryChs[i][j].Push(zoneEntries[i][j]) + } + } + + return lentry, leastEntryCount, zoneIndex, isTruncated +} + +// mergeZonesEntriesCh - merges FileInfo channel to entries upto maxKeys. +func mergeZonesEntriesCh(zonesEntryChs [][]FileInfoCh, maxKeys int, zoneDrives []int, heal bool) (entries FilesInfo) { + var i = 0 + var zonesEntriesInfos [][]FileInfo + var zonesEntriesValid [][]bool + for _, entryChs := range zonesEntryChs { + zonesEntriesInfos = append(zonesEntriesInfos, make([]FileInfo, len(entryChs))) + zonesEntriesValid = append(zonesEntriesValid, make([]bool, len(entryChs))) + } + for { + fi, quorumCount, zoneIndex, valid := leastEntryZone(zonesEntryChs, zonesEntriesInfos, zonesEntriesValid) + if !valid { + // We have reached EOF across all entryChs, break the loop. + break + } + rquorum := fi.Quorum + // Quorum is zero for all directories. + if rquorum == 0 { + // Choose N/2 quoroum for directory entries. + rquorum = zoneDrives[zoneIndex] / 2 + } + + if heal { + // When healing is enabled, we should + // list only objects which need healing. + if quorumCount == zoneDrives[zoneIndex] { + // Skip good entries. + continue + } + } else { + // Regular listing, we skip entries not in quorum. + if quorumCount < rquorum { + // Skip entries which do not have quorum. 
+ continue + } + } + entries.Files = append(entries.Files, fi) + i++ + if i == maxKeys { + entries.IsTruncated = isTruncatedZones(zonesEntryChs, zonesEntriesInfos, zonesEntriesValid) + break + } + } + return entries +} + +func isTruncatedZones(zoneEntryChs [][]FileInfoCh, zoneEntries [][]FileInfo, zoneEntriesValid [][]bool) bool { + for i, entryChs := range zoneEntryChs { + for j := range entryChs { + zoneEntries[i][j], zoneEntriesValid[i][j] = entryChs[j].Pop() + } + } + + var isTruncated = false + for _, entriesValid := range zoneEntriesValid { + for _, valid := range entriesValid { + if !valid { + continue + } + isTruncated = true + break + } + if isTruncated { + break + } + } + for i, entryChs := range zoneEntryChs { + for j := range entryChs { + if zoneEntriesValid[i][j] { + zoneEntryChs[i][j].Push(zoneEntries[i][j]) + } + } + } + return isTruncated +} + +func (z *xlZones) ListObjects(ctx context.Context, bucket, prefix, marker, delimiter string, maxKeys int) (ListObjectsInfo, error) { + if z.SingleZone() { + return z.zones[0].ListObjects(ctx, bucket, prefix, marker, delimiter, maxKeys) + } + + return z.listObjects(ctx, bucket, prefix, marker, delimiter, maxKeys, false) +} + +func (z *xlZones) ListMultipartUploads(ctx context.Context, bucket, prefix, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (ListMultipartsInfo, error) { + if z.SingleZone() { + return z.zones[0].ListMultipartUploads(ctx, bucket, prefix, keyMarker, uploadIDMarker, delimiter, maxUploads) + } + var zoneResult = ListMultipartsInfo{} + zoneResult.MaxUploads = maxUploads + zoneResult.KeyMarker = keyMarker + zoneResult.Prefix = prefix + zoneResult.Delimiter = delimiter + for _, zone := range z.zones { + result, err := zone.ListMultipartUploads(ctx, bucket, prefix, keyMarker, uploadIDMarker, + delimiter, maxUploads) + if err != nil { + return result, err + } + zoneResult.Uploads = append(zoneResult.Uploads, result.Uploads...) + } + return zoneResult, nil +} + +// Initiate a new multipart upload on a hashedSet based on object name. +func (z *xlZones) NewMultipartUpload(ctx context.Context, bucket, object string, opts ObjectOptions) (string, error) { + if z.SingleZone() { + return z.zones[0].NewMultipartUpload(ctx, bucket, object, opts) + } + return z.zones[z.getAvailableZoneIdx(ctx)].NewMultipartUpload(ctx, bucket, object, opts) +} + +// Copies a part of an object from source hashedSet to destination hashedSet. +func (z *xlZones) CopyObjectPart(ctx context.Context, srcBucket, srcObject, destBucket, destObject string, uploadID string, partID int, startOffset int64, length int64, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (PartInfo, error) { + return z.PutObjectPart(ctx, destBucket, destObject, uploadID, partID, + NewPutObjReader(srcInfo.Reader, nil, nil), dstOpts) +} + +// PutObjectPart - writes part of an object to hashedSet based on the object name. 
+func (z *xlZones) PutObjectPart(ctx context.Context, bucket, object, uploadID string, partID int, data *PutObjReader, opts ObjectOptions) (PartInfo, error) { + uploadIDLock := z.NewNSLock(ctx, bucket, pathJoin(object, uploadID)) + if err := uploadIDLock.GetLock(globalOperationTimeout); err != nil { + return PartInfo{}, err + } + defer uploadIDLock.Unlock() + + if z.SingleZone() { + return z.zones[0].PutObjectPart(ctx, bucket, object, uploadID, partID, data, opts) + } + for _, zone := range z.zones { + result, err := zone.ListMultipartUploads(ctx, bucket, object, "", "", "", maxObjectList) + if err != nil { + return PartInfo{}, err + } + if result.Lookup(uploadID) { + return zone.PutObjectPart(ctx, bucket, object, uploadID, partID, data, opts) + } + } + + return PartInfo{}, InvalidUploadID{ + Bucket: bucket, + Object: object, + UploadID: uploadID, + } +} + +// ListObjectParts - lists all uploaded parts to an object in hashedSet. +func (z *xlZones) ListObjectParts(ctx context.Context, bucket, object, uploadID string, partNumberMarker int, maxParts int, opts ObjectOptions) (ListPartsInfo, error) { + uploadIDLock := z.NewNSLock(ctx, bucket, pathJoin(object, uploadID)) + if err := uploadIDLock.GetRLock(globalOperationTimeout); err != nil { + return ListPartsInfo{}, err + } + defer uploadIDLock.RUnlock() + + if z.SingleZone() { + return z.zones[0].ListObjectParts(ctx, bucket, object, uploadID, partNumberMarker, maxParts, opts) + } + for _, zone := range z.zones { + result, err := zone.ListMultipartUploads(ctx, bucket, object, "", "", "", maxObjectList) + if err != nil { + return ListPartsInfo{}, err + } + if result.Lookup(uploadID) { + return zone.ListObjectParts(ctx, bucket, object, uploadID, partNumberMarker, maxParts, opts) + } + } + return ListPartsInfo{}, InvalidUploadID{ + Bucket: bucket, + Object: object, + UploadID: uploadID, + } +} + +// Aborts an in-progress multipart operation on hashedSet based on the object name. +func (z *xlZones) AbortMultipartUpload(ctx context.Context, bucket, object, uploadID string) error { + uploadIDLock := z.NewNSLock(ctx, bucket, pathJoin(object, uploadID)) + if err := uploadIDLock.GetLock(globalOperationTimeout); err != nil { + return err + } + defer uploadIDLock.Unlock() + + if z.SingleZone() { + return z.zones[0].AbortMultipartUpload(ctx, bucket, object, uploadID) + } + for _, zone := range z.zones { + result, err := zone.ListMultipartUploads(ctx, bucket, object, "", "", "", maxObjectList) + if err != nil { + return err + } + if result.Lookup(uploadID) { + return zone.AbortMultipartUpload(ctx, bucket, object, uploadID) + } + } + return InvalidUploadID{ + Bucket: bucket, + Object: object, + UploadID: uploadID, + } +} + +// CompleteMultipartUpload - completes a pending multipart transaction, on hashedSet based on object name. +func (z *xlZones) CompleteMultipartUpload(ctx context.Context, bucket, object, uploadID string, uploadedParts []CompletePart, opts ObjectOptions) (objInfo ObjectInfo, err error) { + // Hold read-locks to verify uploaded parts, also disallows + // parallel part uploads as well. + uploadIDLock := z.NewNSLock(ctx, bucket, pathJoin(object, uploadID)) + if err = uploadIDLock.GetRLock(globalOperationTimeout); err != nil { + return objInfo, err + } + defer uploadIDLock.RUnlock() + + // Hold namespace to complete the transaction, only hold + // if uploadID can be held exclusively. 
+	objectLock := z.NewNSLock(ctx, bucket, object)
+	if err = objectLock.GetLock(globalOperationTimeout); err != nil {
+		return objInfo, err
+	}
+	defer objectLock.Unlock()
+
+	if z.SingleZone() {
+		return z.zones[0].CompleteMultipartUpload(ctx, bucket, object, uploadID, uploadedParts, opts)
+	}
+
+	// Purge any existing object.
+	for _, zone := range z.zones {
+		zone.DeleteObject(ctx, bucket, object)
+	}
+
+	for _, zone := range z.zones {
+		result, err := zone.ListMultipartUploads(ctx, bucket, object, "", "", "", maxObjectList)
+		if err != nil {
+			return objInfo, err
+		}
+		if result.Lookup(uploadID) {
+			return zone.CompleteMultipartUpload(ctx, bucket, object, uploadID, uploadedParts, opts)
+		}
+	}
+	return objInfo, InvalidUploadID{
+		Bucket:   bucket,
+		Object:   object,
+		UploadID: uploadID,
+	}
+}
+
+// GetBucketInfo - returns bucket info from one of the erasure coded zones.
+func (z *xlZones) GetBucketInfo(ctx context.Context, bucket string) (bucketInfo BucketInfo, err error) {
+	if z.SingleZone() {
+		return z.zones[0].GetBucketInfo(ctx, bucket)
+	}
+	for _, zone := range z.zones {
+		bucketInfo, err = zone.GetBucketInfo(ctx, bucket)
+		if err != nil {
+			if isErrBucketNotFound(err) {
+				continue
+			}
+			return bucketInfo, err
+		}
+		return bucketInfo, nil
+	}
+	return bucketInfo, BucketNotFound{
+		Bucket: bucket,
+	}
+}
+
+// SetBucketPolicy persists the new policy on the bucket.
+func (z *xlZones) SetBucketPolicy(ctx context.Context, bucket string, policy *policy.Policy) error {
+	return savePolicyConfig(ctx, z, bucket, policy)
+}
+
+// GetBucketPolicy will return a policy on a bucket
+func (z *xlZones) GetBucketPolicy(ctx context.Context, bucket string) (*policy.Policy, error) {
+	return getPolicyConfig(z, bucket)
+}
+
+// DeleteBucketPolicy deletes all policies on bucket
+func (z *xlZones) DeleteBucketPolicy(ctx context.Context, bucket string) error {
+	return removePolicyConfig(ctx, z, bucket)
+}
+
+// SetBucketLifecycle sets lifecycle on the bucket
+func (z *xlZones) SetBucketLifecycle(ctx context.Context, bucket string, lifecycle *lifecycle.Lifecycle) error {
+	return saveLifecycleConfig(ctx, z, bucket, lifecycle)
+}
+
+// GetBucketLifecycle will get lifecycle on bucket
+func (z *xlZones) GetBucketLifecycle(ctx context.Context, bucket string) (*lifecycle.Lifecycle, error) {
+	return getLifecycleConfig(z, bucket)
+}
+
+// DeleteBucketLifecycle deletes the lifecycle configuration on the bucket
+func (z *xlZones) DeleteBucketLifecycle(ctx context.Context, bucket string) error {
+	return removeLifecycleConfig(ctx, z, bucket)
+}
+
+// IsNotificationSupported returns whether bucket notification is applicable for this layer.
+func (z *xlZones) IsNotificationSupported() bool {
+	return true
+}
+
+// IsListenBucketSupported returns whether listen bucket notification is applicable for this layer.
+func (z *xlZones) IsListenBucketSupported() bool {
+	return true
+}
+
+// IsEncryptionSupported returns whether server side encryption is implemented for this layer.
+func (z *xlZones) IsEncryptionSupported() bool {
+	return true
+}
+
+// IsCompressionSupported returns whether compression is applicable for this layer.
+func (z *xlZones) IsCompressionSupported() bool {
+	return true
+}
+
+// DeleteBucket - deletes a bucket on all zones simultaneously;
+// if any zone fails to delete the bucket, we undo the deletes
+// that succeeded on the other zones.
+func (z *xlZones) DeleteBucket(ctx context.Context, bucket string) error {
+	if z.SingleZone() {
+		return z.zones[0].DeleteBucket(ctx, bucket)
+	}
+	g := errgroup.WithNErrs(len(z.zones))
+
+	// Delete buckets in parallel across all zones.
+	for index := range z.zones {
+		index := index
+		g.Go(func() error {
+			return z.zones[index].DeleteBucket(ctx, bucket)
+		}, index)
+	}
+
+	errs := g.Wait()
+	// For any write quorum failure, we undo the delete bucket operation
+	// by recreating the bucket on the zones where it succeeded.
+	for _, err := range errs {
+		if err != nil {
+			if _, ok := err.(InsufficientWriteQuorum); ok {
+				undoDeleteBucketZones(bucket, z.zones, errs)
+			}
+			return err
+		}
+	}
+
+	// Success.
+	return nil
+}
+
+// This function is used to undo a successful DeleteBucket operation.
+func undoDeleteBucketZones(bucket string, zones []*xlSets, errs []error) {
+	g := errgroup.WithNErrs(len(zones))
+
+	// Undo previous delete bucket on all underlying zones.
+	for index := range zones {
+		index := index
+		g.Go(func() error {
+			if errs[index] == nil {
+				return zones[index].MakeBucketWithLocation(context.Background(), bucket, "")
+			}
+			return nil
+		}, index)
+	}
+
+	g.Wait()
+}
+
+// ListBuckets - lists all buckets from one of the zones; we do not
+// merge sort here, for simplicity. As per design it is assumed
+// that all buckets are present on all zones.
+func (z *xlZones) ListBuckets(ctx context.Context) (buckets []BucketInfo, err error) {
+	if z.SingleZone() {
+		return z.zones[0].ListBuckets(ctx)
+	}
+	for _, zone := range z.zones {
+		buckets, err := zone.ListBuckets(ctx)
+		if err != nil {
+			logger.LogIf(ctx, err)
+			continue
+		}
+		return buckets, nil
+	}
+	return buckets, InsufficientReadQuorum{}
+}
+
+func (z *xlZones) ReloadFormat(ctx context.Context, dryRun bool) error {
+	// Acquire lock on format.json
+	formatLock := z.NewNSLock(ctx, minioMetaBucket, formatConfigFile)
+	if err := formatLock.GetRLock(globalHealingTimeout); err != nil {
+		return err
+	}
+	defer formatLock.RUnlock()
+
+	for _, zone := range z.zones {
+		if err := zone.ReloadFormat(ctx, dryRun); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (z *xlZones) HealFormat(ctx context.Context, dryRun bool) (madmin.HealResultItem, error) {
+	// Acquire lock on format.json
+	formatLock := z.NewNSLock(ctx, minioMetaBucket, formatConfigFile)
+	if err := formatLock.GetLock(globalHealingTimeout); err != nil {
+		return madmin.HealResultItem{}, err
+	}
+	defer formatLock.Unlock()
+
+	var r = madmin.HealResultItem{
+		Type:   madmin.HealItemMetadata,
+		Detail: "disk-format",
+	}
+	for _, zone := range z.zones {
+		result, err := zone.HealFormat(ctx, dryRun)
+		if err != nil {
+			logger.LogIf(ctx, err)
+			continue
+		}
+		r.DiskCount = result.DiskCount
+		r.SetCount = result.SetCount
+		r.Before.Drives = append(r.Before.Drives, result.Before.Drives...)
+		r.After.Drives = append(r.After.Drives, result.After.Drives...)
+	}
+	return r, nil
+}
+
+func (z *xlZones) HealBucket(ctx context.Context, bucket string, dryRun, remove bool) (madmin.HealResultItem, error) {
+	var r = madmin.HealResultItem{
+		Type:   madmin.HealItemBucket,
+		Bucket: bucket,
+	}
+
+	for _, zone := range z.zones {
+		result, err := zone.HealBucket(ctx, bucket, dryRun, remove)
+		if err != nil {
+			switch err.(type) {
+			case BucketNotFound:
+				continue
+			}
+			return result, err
+		}
+		r.DiskCount = result.DiskCount
+		r.SetCount = result.SetCount
+		r.Before.Drives = append(r.Before.Drives, result.Before.Drives...)
+		r.After.Drives = append(r.After.Drives, result.After.Drives...)
+ } + return r, nil +} + +func (z *xlZones) ListObjectsHeal(ctx context.Context, bucket, prefix, marker, delimiter string, maxKeys int) (ListObjectsInfo, error) { + if z.SingleZone() { + return z.zones[0].ListObjectsHeal(ctx, bucket, prefix, marker, delimiter, maxKeys) + } + return z.listObjects(ctx, bucket, prefix, marker, delimiter, maxKeys, true) +} + +func (z *xlZones) HealObjects(ctx context.Context, bucket, prefix string, healObjectFn func(string, string) error) error { + for _, zone := range z.zones { + if err := zone.HealObjects(ctx, bucket, prefix, healObjectFn); err != nil { + return err + } + } + return nil +} + +func (z *xlZones) HealObject(ctx context.Context, bucket, object string, dryRun, remove bool, scanMode madmin.HealScanMode) (madmin.HealResultItem, error) { + // Lock the object before healing. Use read lock since healing + // will only regenerate parts & xl.json of outdated disks. + objectLock := z.NewNSLock(ctx, bucket, object) + if err := objectLock.GetRLock(globalHealingTimeout); err != nil { + return madmin.HealResultItem{}, err + } + defer objectLock.RUnlock() + + if z.SingleZone() { + return z.zones[0].HealObject(ctx, bucket, object, dryRun, remove, scanMode) + } + for _, zone := range z.zones { + result, err := zone.HealObject(ctx, bucket, object, dryRun, remove, scanMode) + if err != nil { + if isErrObjectNotFound(err) { + continue + } + return result, err + } + return result, nil + } + return madmin.HealResultItem{}, ObjectNotFound{ + Bucket: bucket, + Object: object, + } +} + +func (z *xlZones) ListBucketsHeal(ctx context.Context) ([]BucketInfo, error) { + var healBuckets []BucketInfo + for _, zone := range z.zones { + bucketsInfo, err := zone.ListBucketsHeal(ctx) + if err != nil { + continue + } + healBuckets = append(healBuckets, bucketsInfo...) + } + return healBuckets, nil +} diff --git a/docs/distributed/DESIGN.md b/docs/distributed/DESIGN.md index 8609c64b8..672b33db3 100644 --- a/docs/distributed/DESIGN.md +++ b/docs/distributed/DESIGN.md @@ -1,5 +1,5 @@ # Distributed Server Design Guide [![Slack](https://slack.min.io/slack?type=svg)](https://slack.min.io) -This document explains the design approach, advanced use cases and limits of the MinIO distributed server. +This document explains the design approach and advanced use cases of the MinIO distributed server. ## Command-line ``` @@ -127,10 +127,6 @@ Distributed erasure coded configuration with rack level redundancy 32 sets in to minio server http://rack{1...4}-host{1...8}.example.net/export{1...16} ``` -Distributed erasure coded configuration with no rack level redundancy but redundancy with in the rack we split the arguments, 32 sets in total, 16 disks per set. -``` -minio server http://rack1-host{1...8}.example.net/export{1...16} http://rack2-host{1...8}.example.net/export{1...16} http://rack3-host{1...8}.example.net/export{1...16} http://rack4-host{1...8}.example.net/export{1...16} -``` ## Backend `format.json` changes `format.json` has new fields @@ -208,8 +204,3 @@ type formatXLV2 struct { } `json:"xl"` } ``` - -## Limits - -- Minimum of 4 disks are needed for any erasure coded configuration. -- Maximum of 32 distinct nodes are supported in distributed configuration. 
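The listing path in `cmd/xl-zones.go` above merges per-zone walk results by repeatedly picking the lexically least entry across all zones' channels and consuming duplicates of that entry wherever they appear (see `leastEntryZone` and `mergeZonesEntriesCh`). The sketch below illustrates only that selection step, over plain sorted slices instead of `FileInfoCh` channels, with no push-back and no quorum handling; the types and names here are illustrative, not MinIO's.

```go
package main

import "fmt"

// leastEntry returns the lexically smallest head across all zone listings,
// the zone it was first seen in, and how many listings currently expose that
// same name. Duplicates are consumed from every listing, mirroring the way
// leastEntryZone counts them towards the quorum check.
func leastEntry(zones [][]string) (name string, count, zoneIdx int, ok bool) {
	zoneIdx = -1
	for i, entries := range zones {
		if len(entries) == 0 {
			continue
		}
		if zoneIdx == -1 || entries[0] < name {
			name, zoneIdx = entries[0], i
		}
	}
	if zoneIdx == -1 {
		return "", 0, -1, false // all listings are exhausted
	}
	for i := range zones {
		if len(zones[i]) > 0 && zones[i][0] == name {
			zones[i] = zones[i][1:] // consume the duplicate from this zone
			count++
		}
	}
	return name, count, zoneIdx, true
}

func main() {
	zones := [][]string{
		{"a.txt", "c.txt"}, // zone 0 listing
		{"a.txt", "b.txt"}, // zone 1 listing
	}
	for {
		name, count, zone, ok := leastEntry(zones)
		if !ok {
			break
		}
		fmt.Printf("%s (seen in %d listing(s), least from zone %d)\n", name, count, zone)
	}
}
```

Running this prints `a.txt`, `b.txt`, `c.txt` in order, with `a.txt` reported as seen in two listings, which is the count the real code compares against the per-set quorum before including or skipping an entry.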
diff --git a/docs/distributed/README.md b/docs/distributed/README.md
index 4f6fc320e..84a8ca348 100644
--- a/docs/distributed/README.md
+++ b/docs/distributed/README.md
@@ -12,16 +12,10 @@ Distributed MinIO provides protection against multiple node/drive failures and [
 
 ### High availability
 
-A stand-alone MinIO server would go down if the server hosting the disks goes offline. In contrast, a distributed MinIO setup with _n_ disks will have your data safe as long as _n/2_ or more disks are online. You'll need a minimum of _(n/2 + 1)_ [Quorum](https://github.com/minio/dsync#lock-process) disks to create new objects though.
+A stand-alone MinIO server would go down if the server hosting the disks goes offline. In contrast, a distributed MinIO setup with _n_ disks will have your data safe as long as _n/2_ or more disks are online. You'll need a minimum of _(n/2 + 1)_ disks to create new objects.
 
 For example, a 16-node distributed MinIO setup with 16 disks per node would continue serving files, even if up to 8 servers are offline. But, you'll need at least 9 servers online to create new objects.
 
-### Limits
-
-As with MinIO in stand-alone mode, distributed MinIO has a per tenant limit of minimum of 2 and maximum of 32 servers. There are no limits on number of disks across these servers. If you need a multiple tenant setup, you can easily spin up multiple MinIO instances managed by orchestration tools like Kubernetes, Docker Swarm etc.
-
-Note that with distributed MinIO you can play around with the number of nodes and drives as long as the limits are adhered to. For example, you can have 2 nodes with 4 drives each, 4 nodes with 4 drives each, 8 nodes with 2 drives each, 32 servers with 64 drives each and so on.
-
 You can also use [storage classes](https://github.com/minio/minio/tree/master/docs/erasure/storage-class) to set custom data and parity distribution per object.
 
 ### Consistency Guarantees
@@ -61,7 +55,18 @@ export MINIO_SECRET_KEY=<SECRET_KEY>
 minio server http://host{1...32}/export{1...32}
 ```
 
-__NOTE:__ `{1...n}` shown have 3 dots! Using only 2 dots `{1..32}` will be interpreted by your shell and won't be passed to minio server, affecting the erasure coding order, which may impact performance and high availability. __Always use ellipses syntax `{1...n}` (3 dots!) for optimal erasure-code distribution__
+> __NOTE:__ `{1...n}` shown have 3 dots! Using only 2 dots `{1..32}` will be interpreted by your shell and won't be passed to the MinIO server, affecting the erasure coding order, which may impact performance and high availability. __Always use ellipses syntax `{1...n}` (3 dots!) for optimal erasure-code distribution__
+
+#### Expanding existing distributed setup
+MinIO supports expanding distributed erasure coded clusters by specifying a new set of clusters on the command-line, as shown below:
+
+```sh
+export MINIO_ACCESS_KEY=<ACCESS_KEY>
+export MINIO_SECRET_KEY=<SECRET_KEY>
+minio server http://host{1...32}/export{1...32} http://host{33...64}/export{1...32}
+```
+
+Now the server has been expanded by *1024* more disks, for a total of *2048* disks; new object upload requests automatically start using the least used cluster. This expansion strategy can be repeated as needed, letting you grow your clusters over time.
 
 ## 3. Test your setup
 To test this setup, access the MinIO server via browser or [`mc`](https://docs.min.io/docs/minio-client-quickstart-guide).
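The README hunk above says that after expansion new uploads automatically go to the least used cluster. As a rough illustration of that routing decision only (not MinIO's exact `getAvailableZoneIdx` heuristic), here is a sketch that picks the zone with the most free capacity; the `zoneUsage` type and the numbers are hypothetical.

```go
package main

import "fmt"

// zoneUsage is a hypothetical stand-in for per-zone capacity accounting.
type zoneUsage struct {
	name  string
	total uint64 // bytes
	used  uint64 // bytes
}

// leastUsedZone picks the zone with the most free space. The real selection
// logic may weigh zones differently; this only shows why new objects drift
// towards the newly added, emptier zone.
func leastUsedZone(zones []zoneUsage) int {
	best := 0
	for i, z := range zones {
		if z.total-z.used > zones[best].total-zones[best].used {
			best = i
		}
	}
	return best
}

func main() {
	zones := []zoneUsage{
		{name: "zone-1 (host{1...32})", total: 1 << 40, used: 900 << 30},
		{name: "zone-2 (host{33...64})", total: 1 << 40, used: 100 << 30},
	}
	fmt.Println("new uploads land in:", zones[leastUsedZone(zones)].name)
}
```

With the second zone nearly empty, new objects land there until usage evens out across zones.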
diff --git a/docs/minio-limits.md b/docs/minio-limits.md index a3e65dea7..bec203963 100644 --- a/docs/minio-limits.md +++ b/docs/minio-limits.md @@ -4,10 +4,10 @@ |Item|Specification| |:---|:---| -|Maximum number of servers per cluster| Unlimited| -|Maximum number of federated clusters | Unlimited| +|Maximum number of servers per cluster| no-limit| +|Maximum number of federated clusters | no-limit| |Minimum number of servers| 02| -|Maximum number of drives per server| Unlimited| +|Maximum number of drives per server| no-limit| |Read quorum| N/2| |Write quorum| N/2+1| diff --git a/go.sum b/go.sum index 75dce11d2..8f61bb103 100644 --- a/go.sum +++ b/go.sum @@ -499,6 +499,8 @@ github.com/nats-io/go-nats-streaming v0.0.0-20161216191029-077898146bfb/go.mod h github.com/nats-io/go-nats-streaming v0.4.2/go.mod h1:gfq4R3c9sKAINOpelo0gn/b9QDMBZnmrttcsNF+lqyo= github.com/nats-io/go-nats-streaming v0.4.4 h1:1I3lkZDRdQYXb+holjdqZ2J6xyekrD06o9Fd8rWlgP4= github.com/nats-io/go-nats-streaming v0.4.4/go.mod h1:gfq4R3c9sKAINOpelo0gn/b9QDMBZnmrttcsNF+lqyo= +github.com/nats-io/jwt v0.3.0 h1:xdnzwFETV++jNc4W1mw//qFyJGb2ABOombmZJQS4+Qo= +github.com/nats-io/jwt v0.3.0/go.mod h1:fRYCDE99xlTsqUzISS1Bi75UBJ6ljOJQOAAu5VglpSg= github.com/nats-io/jwt v0.3.0/go.mod h1:fRYCDE99xlTsqUzISS1Bi75UBJ6ljOJQOAAu5VglpSg= github.com/nats-io/jwt v0.3.2 h1:+RB5hMpXUUA2dfxuhBTEkMOrYmM+gKIZYS1KjSostMI= github.com/nats-io/jwt v0.3.2/go.mod h1:/euKqTS1ZD+zzjYrY7pseZrTtWQSjujC7xjPc8wL6eU= @@ -517,7 +519,11 @@ github.com/nats-io/nats.go v1.8.0/go.mod h1:BrFz9vVn0fU3AcH9Vn4Kd7W0NpJ651tD5omQ github.com/nats-io/nats.go v1.9.1 h1:ik3HbLhZ0YABLto7iX80pZLPw/6dx3T+++MZJwLnMrQ= github.com/nats-io/nats.go v1.9.1/go.mod h1:ZjDU1L/7fJ09jvUSRVBR2e7+RnLiiIQyqyzEE/Zbp4w= github.com/nats-io/nkeys v0.0.2 h1:+qM7QpgXnvDDixitZtQUBDY9w/s9mu1ghS+JIbsrx6M= +github.com/nats-io/nkeys v0.0.2 h1:+qM7QpgXnvDDixitZtQUBDY9w/s9mu1ghS+JIbsrx6M= github.com/nats-io/nkeys v0.0.2/go.mod h1:dab7URMsZm6Z/jp9Z5UGa87Uutgc2mVpXLC4B7TDb/4= +github.com/nats-io/nkeys v0.0.2/go.mod h1:dab7URMsZm6Z/jp9Z5UGa87Uutgc2mVpXLC4B7TDb/4= +github.com/nats-io/nkeys v0.1.0 h1:qMd4+pRHgdr1nAClu+2h/2a5F2TmKcCzjCDazVgRoX4= +github.com/nats-io/nkeys v0.1.0/go.mod h1:xpnFELMwJABBLVhffcfd1MZx6VsNRFpEugbxziKVo7w= github.com/nats-io/nkeys v0.1.0/go.mod h1:xpnFELMwJABBLVhffcfd1MZx6VsNRFpEugbxziKVo7w= github.com/nats-io/nkeys v0.1.3 h1:6JrEfig+HzTH85yxzhSVbjHRJv9cn0p6n3IngIcM5/k= github.com/nats-io/nkeys v0.1.3/go.mod h1:xpnFELMwJABBLVhffcfd1MZx6VsNRFpEugbxziKVo7w= diff --git a/pkg/dsync/drwmutex.go b/pkg/dsync/drwmutex.go index e289de4aa..553efd138 100644 --- a/pkg/dsync/drwmutex.go +++ b/pkg/dsync/drwmutex.go @@ -20,6 +20,7 @@ import ( "context" "fmt" golog "log" + "math" "math/rand" "os" "path" @@ -75,7 +76,7 @@ func isLocked(uid string) bool { func NewDRWMutex(ctx context.Context, name string, clnt *Dsync) *DRWMutex { return &DRWMutex{ Name: name, - writeLocks: make([]string, clnt.dNodeCount), + writeLocks: make([]string, len(clnt.GetLockersFn())), clnt: clnt, ctx: ctx, } @@ -133,6 +134,8 @@ func (dm *DRWMutex) lockBlocking(timeout time.Duration, id, source string, isRea doneCh, start := make(chan struct{}), time.Now().UTC() defer close(doneCh) + restClnts := dm.clnt.GetLockersFn() + // Use incremental back-off algorithm for repeated attempts to acquire the lock for range newRetryTimerSimple(doneCh) { select { @@ -142,7 +145,7 @@ func (dm *DRWMutex) lockBlocking(timeout time.Duration, id, source string, isRea } // Create temp array on stack. 
- locks := make([]string, dm.clnt.dNodeCount) + locks := make([]string, len(restClnts)) // Try to acquire the lock. success := lock(dm.clnt, &locks, dm.Name, id, source, isReadLock) @@ -152,7 +155,7 @@ func (dm *DRWMutex) lockBlocking(timeout time.Duration, id, source string, isRea // If success, copy array to object if isReadLock { // Append new array of strings at the end - dm.readersLocks = append(dm.readersLocks, make([]string, dm.clnt.dNodeCount)) + dm.readersLocks = append(dm.readersLocks, make([]string, len(restClnts))) // and copy stack array into last spot copy(dm.readersLocks[len(dm.readersLocks)-1], locks[:]) } else { @@ -174,12 +177,14 @@ func (dm *DRWMutex) lockBlocking(timeout time.Duration, id, source string, isRea // lock tries to acquire the distributed lock, returning true or false. func lock(ds *Dsync, locks *[]string, lockName, id, source string, isReadLock bool) bool { + restClnts := ds.GetLockersFn() + // Create buffered channel of size equal to total number of nodes. - ch := make(chan Granted, ds.dNodeCount) + ch := make(chan Granted, len(restClnts)) defer close(ch) var wg sync.WaitGroup - for index, c := range ds.restClnts { + for index, c := range restClnts { wg.Add(1) // broadcast lock request to all nodes @@ -229,7 +234,10 @@ func lock(ds *Dsync, locks *[]string, lockName, id, source string, isReadLock bo done := false timeout := time.After(DRWMutexAcquireTimeout) - for ; i < ds.dNodeCount; i++ { // Loop until we acquired all locks + dquorum := int(len(restClnts)/2) + 1 + dquorumReads := int(math.Ceil(float64(len(restClnts)) / 2.0)) + + for ; i < len(restClnts); i++ { // Loop until we acquired all locks select { case grant := <-ch: @@ -238,22 +246,22 @@ func lock(ds *Dsync, locks *[]string, lockName, id, source string, isReadLock bo (*locks)[grant.index] = grant.lockUID } else { locksFailed++ - if !isReadLock && locksFailed > ds.dNodeCount-ds.dquorum || - isReadLock && locksFailed > ds.dNodeCount-ds.dquorumReads { + if !isReadLock && locksFailed > len(restClnts)-dquorum || + isReadLock && locksFailed > len(restClnts)-dquorumReads { // We know that we are not going to get the lock anymore, // so exit out and release any locks that did get acquired done = true // Increment the number of grants received from the buffered channel. 
i++ - releaseAll(ds, locks, lockName, isReadLock) + releaseAll(ds, locks, lockName, isReadLock, restClnts) } } case <-timeout: done = true // timeout happened, maybe one of the nodes is slow, count // number of locks to check whether we have quorum or not - if !quorumMet(locks, isReadLock, ds.dquorum, ds.dquorumReads) { - releaseAll(ds, locks, lockName, isReadLock) + if !quorumMet(locks, isReadLock, dquorum, dquorumReads) { + releaseAll(ds, locks, lockName, isReadLock, restClnts) } } @@ -263,7 +271,7 @@ func lock(ds *Dsync, locks *[]string, lockName, id, source string, isReadLock bo } // Count locks in order to determine whether we have quorum or not - quorum = quorumMet(locks, isReadLock, ds.dquorum, ds.dquorumReads) + quorum = quorumMet(locks, isReadLock, dquorum, dquorumReads) // Signal that we have the quorum wg.Done() @@ -271,11 +279,12 @@ func lock(ds *Dsync, locks *[]string, lockName, id, source string, isReadLock bo // Wait for the other responses and immediately release the locks // (do not add them to the locks array because the DRWMutex could // already has been unlocked again by the original calling thread) - for ; i < ds.dNodeCount; i++ { + for ; i < len(restClnts); i++ { grantToBeReleased := <-ch if grantToBeReleased.isLocked() { // release lock - sendRelease(ds, ds.restClnts[grantToBeReleased.index], lockName, grantToBeReleased.lockUID, isReadLock) + sendRelease(ds, restClnts[grantToBeReleased.index], lockName, + grantToBeReleased.lockUID, isReadLock) } } }(isReadLock) @@ -306,10 +315,10 @@ func quorumMet(locks *[]string, isReadLock bool, quorum, quorumReads int) bool { } // releaseAll releases all locks that are marked as locked -func releaseAll(ds *Dsync, locks *[]string, lockName string, isReadLock bool) { - for lock := 0; lock < ds.dNodeCount; lock++ { +func releaseAll(ds *Dsync, locks *[]string, lockName string, isReadLock bool, restClnts []NetLocker) { + for lock := 0; lock < len(restClnts); lock++ { if isLocked((*locks)[lock]) { - sendRelease(ds, ds.restClnts[lock], lockName, (*locks)[lock], isReadLock) + sendRelease(ds, restClnts[lock], lockName, (*locks)[lock], isReadLock) (*locks)[lock] = "" } } @@ -320,8 +329,9 @@ func releaseAll(ds *Dsync, locks *[]string, lockName string, isReadLock bool) { // It is a run-time error if dm is not locked on entry to Unlock. func (dm *DRWMutex) Unlock() { + restClnts := dm.clnt.GetLockersFn() // create temp array on stack - locks := make([]string, dm.clnt.dNodeCount) + locks := make([]string, len(restClnts)) { dm.m.Lock() @@ -342,11 +352,11 @@ func (dm *DRWMutex) Unlock() { // Copy write locks to stack array copy(locks, dm.writeLocks[:]) // Clear write locks array - dm.writeLocks = make([]string, dm.clnt.dNodeCount) + dm.writeLocks = make([]string, len(restClnts)) } isReadLock := false - unlock(dm.clnt, locks, dm.Name, isReadLock) + unlock(dm.clnt, locks, dm.Name, isReadLock, restClnts) } // RUnlock releases a read lock held on dm. 
@@ -355,8 +365,9 @@ func (dm *DRWMutex) Unlock() { func (dm *DRWMutex) RUnlock() { // create temp array on stack - locks := make([]string, dm.clnt.dNodeCount) + restClnts := dm.clnt.GetLockersFn() + locks := make([]string, len(restClnts)) { dm.m.Lock() defer dm.m.Unlock() @@ -370,15 +381,15 @@ func (dm *DRWMutex) RUnlock() { } isReadLock := true - unlock(dm.clnt, locks, dm.Name, isReadLock) + unlock(dm.clnt, locks, dm.Name, isReadLock, restClnts) } -func unlock(ds *Dsync, locks []string, name string, isReadLock bool) { +func unlock(ds *Dsync, locks []string, name string, isReadLock bool, restClnts []NetLocker) { // We don't need to synchronously wait until we have released all the locks (or the quorum) // (a subsequent lock will retry automatically in case it would fail to get quorum) - for index, c := range ds.restClnts { + for index, c := range restClnts { if isLocked(locks[index]) { // broadcast lock release to all nodes that granted the lock diff --git a/pkg/dsync/dsync.go b/pkg/dsync/dsync.go index f370f5659..a4022c7fe 100644 --- a/pkg/dsync/dsync.go +++ b/pkg/dsync/dsync.go @@ -16,45 +16,9 @@ package dsync -import ( - "errors" - "math" -) - // Dsync represents dsync client object which is initialized with // authenticated clients, used to initiate lock REST calls. type Dsync struct { - // Number of nodes participating in the distributed locking. - dNodeCount int - // List of rest client objects, one per lock server. - restClnts []NetLocker - - // Simple majority based quorum, set to dNodeCount/2+1 - dquorum int - - // Simple quorum for read operations, set to dNodeCount/2 - dquorumReads int -} - -// New - initializes a new dsync object with input restClnts. -func New(restClnts []NetLocker) (*Dsync, error) { - if len(restClnts) < 2 { - return nil, errors.New("Dsync is not designed for less than 2 nodes") - } else if len(restClnts) > 32 { - return nil, errors.New("Dsync is not designed for more than 32 nodes") - } - - ds := &Dsync{} - ds.dNodeCount = len(restClnts) - - // With odd number of nodes, write and read quorum is basically the same - ds.dquorum = int(ds.dNodeCount/2) + 1 - ds.dquorumReads = int(math.Ceil(float64(ds.dNodeCount) / 2.0)) - - // Initialize node name and rest path for each NetLocker object. - ds.restClnts = make([]NetLocker, ds.dNodeCount) - copy(ds.restClnts, restClnts) - - return ds, nil + GetLockersFn func() []NetLocker } diff --git a/pkg/dsync/dsync_private_test.go b/pkg/dsync/dsync_private_test.go deleted file mode 100644 index c50bfcdde..000000000 --- a/pkg/dsync/dsync_private_test.go +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Minio Cloud Storage, (C) 2018 Minio, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// GOMAXPROCS=10 go test - -package dsync - -import "testing" - -// Tests dsync.New -func TestNew(t *testing.T) { - nclnts := make([]NetLocker, 33) - if _, err := New(nclnts); err == nil { - t.Fatal("Should have failed") - } - - nclnts = make([]NetLocker, 1) - if _, err := New(nclnts); err == nil { - t.Fatal("Should have failed") - } - - nclnts = make([]NetLocker, 2) - nds, err := New(nclnts) - if err != nil { - t.Fatal("Should pass", err) - } - - if nds.dquorumReads != 1 { - t.Fatalf("Unexpected read quorum values expected 1, got %d", nds.dquorumReads) - } - - if nds.dquorum != 2 { - t.Fatalf("Unexpected quorum values expected 2, got %d", nds.dquorum) - } - - nclnts = make([]NetLocker, 3) - nds, err = New(nclnts) - if err != nil { - t.Fatal("Should pass", err) - } - - if nds.dquorumReads != nds.dquorum { - t.Fatalf("Unexpected quorum values for odd nodes we expect read %d and write %d quorum to be same", nds.dquorumReads, nds.dquorum) - } -} diff --git a/pkg/dsync/dsync_test.go b/pkg/dsync/dsync_test.go index 7c73740bc..d3162f61b 100644 --- a/pkg/dsync/dsync_test.go +++ b/pkg/dsync/dsync_test.go @@ -78,10 +78,8 @@ func TestMain(m *testing.M) { clnts = append(clnts, newClient(nodes[i], rpcPaths[i])) } - var err error - ds, err = New(clnts) - if err != nil { - log.Fatalf("set nodes failed with %v", err) + ds = &Dsync{ + GetLockersFn: func() []NetLocker { return clnts }, } startRPCServers(nodes) @@ -256,11 +254,10 @@ func TestMutex(t *testing.T) { func BenchmarkMutexUncontended(b *testing.B) { type PaddedMutex struct { - DRWMutex - pad [128]uint8 + *DRWMutex } b.RunParallel(func(pb *testing.PB) { - var mu PaddedMutex + var mu = PaddedMutex{NewDRWMutex(context.Background(), "", ds)} for pb.Next() { mu.Lock(id, source) mu.Unlock() diff --git a/pkg/dsync/rpc-client-impl_test.go b/pkg/dsync/rpc-client-impl_test.go index bbdabe209..f9f2065d8 100644 --- a/pkg/dsync/rpc-client-impl_test.go +++ b/pkg/dsync/rpc-client-impl_test.go @@ -41,6 +41,14 @@ func newClient(addr, endpoint string) NetLocker { } } +// Close closes the underlying socket file descriptor. +func (rpcClient *ReconnectRPCClient) IsOnline() bool { + rpcClient.mutex.Lock() + defer rpcClient.mutex.Unlock() + // If rpc client has not connected yet there is nothing to close. + return rpcClient.rpc != nil +} + // Close closes the underlying socket file descriptor. func (rpcClient *ReconnectRPCClient) Close() error { rpcClient.mutex.Lock() diff --git a/pkg/dsync/rpc-client-interface.go b/pkg/dsync/rpc-client-interface.go index b613731bb..ffce5f5c7 100644 --- a/pkg/dsync/rpc-client-interface.go +++ b/pkg/dsync/rpc-client-interface.go @@ -56,4 +56,7 @@ type NetLocker interface { // Close closes any underlying connection to the service endpoint Close() error + + // Is the underlying connection online? (is always true for any local lockers) + IsOnline() bool }
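The dsync changes above drop the fixed `dNodeCount`/`restClnts` fields and the precomputed quorums: `Dsync` now carries only a `GetLockersFn` callback, the write and read quorums are recomputed from the current locker count on every lock attempt, and `NetLocker` gains `IsOnline()`. A minimal sketch of that shape is below, using a hypothetical `Locker` interface and `Coordinator` type in place of `NetLocker` and `Dsync`; it is illustrative only, under the assumption that the caller controls the slice returned by the callback.

```go
package main

import (
	"fmt"
	"math"
)

// Locker is a stand-in for dsync.NetLocker, reduced to the new method.
type Locker interface {
	IsOnline() bool
}

type localLocker struct{}

// IsOnline is always true for a local locker, as the interface comment notes.
func (localLocker) IsOnline() bool { return true }

// Coordinator mirrors the new Dsync shape: instead of a fixed node count and
// precomputed quorums, it asks a callback for the current lockers every time.
type Coordinator struct {
	GetLockersFn func() []Locker
}

// quorums recomputes write/read quorum from however many lockers exist right
// now, matching the n/2+1 and ceil(n/2) formulas used inside lock().
func (c *Coordinator) quorums() (write, read int) {
	n := len(c.GetLockersFn())
	return n/2 + 1, int(math.Ceil(float64(n) / 2.0))
}

func main() {
	lockers := []Locker{localLocker{}, localLocker{}, localLocker{}, localLocker{}}
	c := &Coordinator{GetLockersFn: func() []Locker { return lockers }}

	w, r := c.quorums()
	fmt.Printf("4 lockers: write quorum=%d read quorum=%d\n", w, r) // 3 and 2

	// Expanding the cluster just means the closure returns more lockers;
	// the next lock attempt sees the new quorum without reinitialization.
	lockers = append(lockers, localLocker{}, localLocker{}, localLocker{}, localLocker{})
	w, r = c.quorums()
	fmt.Printf("8 lockers: write quorum=%d read quorum=%d\n", w, r) // 5 and 4
}
```

This per-call recomputation is also why `NewDRWMutex`, `Unlock` and `RUnlock` now size their temporary lock arrays with `len(clnt.GetLockersFn())` rather than the removed `dNodeCount`.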