minio/cmd/erasure-common.go

// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
    "context"
    "fmt"
    "sync"

    "github.com/minio/minio/internal/logger"
    "github.com/minio/minio/internal/sync/errgroup"
)

// getOnlineDisks - returns the disks that are currently reachable and not
// being healed, probed concurrently and returned in randomized order.
func (er erasureObjects) getOnlineDisks() (newDisks []StorageAPI) {
    disks := er.getDisks()
    var wg sync.WaitGroup
    var mu sync.Mutex
    for _, i := range hashOrder(UTCNow().String(), len(disks)) {
        i := i
        wg.Add(1)
        go func() {
            defer wg.Done()
            if disks[i-1] == nil {
                return
            }
            di, err := disks[i-1].DiskInfo(context.Background())
            if err != nil || di.Healing {
                // - Do not consume disks which are not reachable,
                //   unformatted or simply not accessible for some reason.
                //
                // - Do not consume disks which are being healed.
                //
                // - Future: skip busy disks.
                return
            }
            mu.Lock()
            newDisks = append(newDisks, disks[i-1])
            mu.Unlock()
        }()
    }
    wg.Wait()
    return newDisks
}
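
// hasEnoughOnlineDisks is an illustrative sketch (not part of this file as
// shown): it demonstrates how a caller might use getOnlineDisks when it only
// needs to know whether enough disks are reachable before attempting an
// operation. The helper name and the quorum parameter are assumptions for the
// example, not upstream API.
func (er erasureObjects) hasEnoughOnlineDisks(quorum int) bool {
    return len(er.getOnlineDisks()) >= quorum
}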

// getLoadBalancedLocalDisks - returns only the local disks, in randomized order.
func (er erasureObjects) getLoadBalancedLocalDisks() (newDisks []StorageAPI) {
    disks := er.getDisks()
    // Shuffle the disks and keep only the local ones.
    for _, i := range hashOrder(UTCNow().String(), len(disks)) {
        if disks[i-1] != nil && disks[i-1].IsLocal() {
            newDisks = append(newDisks, disks[i-1])
        }
    }
    return newDisks
}
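
// Note on indexing: hashOrder, used by the helpers above and below, yields
// 1-based positions (hence the disks[i-1] accesses), and passing
// UTCNow().String() as the hash key produces a different order on every call,
// so repeated callers do not always probe the same disk first.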

// getLoadBalancedDisks - fetches a load-balanced (sufficiently randomized) disk
// slice. When optimized is set, disks that are unreachable or healing are
// skipped and only the group of disks sharing the highest used-space bucket is
// returned.
func (er erasureObjects) getLoadBalancedDisks(optimized bool) []StorageAPI {
    disks := er.getDisks()

    if !optimized {
        var newDisks []StorageAPI
        for _, i := range hashOrder(UTCNow().String(), len(disks)) {
            newDisks = append(newDisks, disks[i-1])
        }
        return newDisks
    }

    var wg sync.WaitGroup
    var mu sync.Mutex
    newDisks := map[uint64][]StorageAPI{}
    // Probe the disks concurrently in randomized order.
    for _, i := range hashOrder(UTCNow().String(), len(disks)) {
        i := i
        wg.Add(1)
        go func() {
            defer wg.Done()
            if disks[i-1] == nil {
                return
            }
            di, err := disks[i-1].DiskInfo(context.Background())
            if err != nil || di.Healing {
                // - Do not consume disks which are not reachable,
                //   unformatted or simply not accessible for some reason.
                //
                // - Do not consume disks which are being healed.
                //
                // - Future: skip busy disks.
                return
            }
            mu.Lock()
            // Group disks by used space, rounded down to MiB resolution.
            newDisks[di.Used/1024/1024] = append(newDisks[di.Used/1024/1024], disks[i-1])
            mu.Unlock()
        }()
    }
    wg.Wait()

    var max uint64
    for k := range newDisks {
        if k > max {
            max = k
        }
    }

    // Return the group of disks that share the highest used-space bucket.
    return newDisks[max]
}
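
// capLoadBalancedDisks is an illustrative sketch (not part of this file as
// shown): it demonstrates how a caller might cap the optimized load-balanced
// selection at n disks, skipping nil entries; n <= 0 returns the whole
// selection. The helper name and the n parameter are assumptions for the
// example, not upstream API.
func (er erasureObjects) capLoadBalancedDisks(n int) (newDisks []StorageAPI) {
    for _, disk := range er.getLoadBalancedDisks(true) {
        if disk == nil {
            continue
        }
        newDisks = append(newDisks, disk)
        n--
        if n == 0 {
            break
        }
    }
    return newDisks
}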

// readMultipleFiles reads raw data for the specified files from all disks and
// merges the per-disk responses, requiring readQuorum matching responses per file.
func readMultipleFiles(ctx context.Context, disks []StorageAPI, req ReadMultipleReq, readQuorum int) ([]ReadMultipleResp, error) {
    resps := make([]chan ReadMultipleResp, len(disks))
    for i := range resps {
        resps[i] = make(chan ReadMultipleResp, len(req.Files))
    }
    g := errgroup.WithNErrs(len(disks))
    // Read files in parallel across disks.
    for index := range disks {
        index := index
        g.Go(func() (err error) {
            if disks[index] == nil {
                return errDiskNotFound
            }
            return disks[index].ReadMultiple(ctx, req, resps[index])
        }, index)
    }

    dataArray := make([]ReadMultipleResp, 0, len(req.Files))
    // Merge results. Each disk is expected to deliver its responses in request order.
    for _, wantFile := range req.Files {
        quorum := 0
        toAdd := ReadMultipleResp{
            Bucket: req.Bucket,
            Prefix: req.Prefix,
            File:   wantFile,
        }
        for i := range resps {
            if disks[i] == nil {
                continue
            }
            select {
            case <-ctx.Done():
            case gotFile, ok := <-resps[i]:
                if !ok {
                    continue
                }
                if gotFile.Error != "" || !gotFile.Exists {
                    continue
                }
                if gotFile.File != wantFile || gotFile.Bucket != req.Bucket || gotFile.Prefix != req.Prefix {
                    continue
                }
                quorum++
                if toAdd.Modtime.After(gotFile.Modtime) || len(gotFile.Data) < len(toAdd.Data) {
                    // Pick the latest, or the largest, to avoid possibly truncated entries.
                    continue
                }
                toAdd = gotFile
            }
        }
        if quorum < readQuorum {
            toAdd.Exists = false
            toAdd.Error = errErasureReadQuorum.Error()
            toAdd.Data = nil
        }
        dataArray = append(dataArray, toAdd)
    }

    errs := g.Wait()
    for index, err := range errs {
        if err == nil {
            continue
        }
        if !IsErr(err, []error{
            errFileNotFound,
            errVolumeNotFound,
            errFileVersionNotFound,
            errDiskNotFound,
            errUnformattedDisk,
        }...) {
            logger.LogOnceIf(ctx, fmt.Errorf("Drive %s, path (%s/%s) returned an error (%w)",
                disks[index], req.Bucket, req.Prefix, err),
                disks[index].String())
        }
    }

    // Return all the metadata.
    return dataArray, nil
}
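
// exampleReadSameFiles is an illustrative sketch (not part of this file as
// shown): it wires together two helpers from this file, reading the same set
// of files from every currently online disk and requiring readQuorum matching
// responses per file. Only the ReadMultipleReq fields visible in this file
// (Bucket, Prefix, Files) are set; anything else is left at its zero value.
// The helper name is an assumption for the example, not upstream API.
func (er erasureObjects) exampleReadSameFiles(ctx context.Context, bucket, prefix string, files []string, readQuorum int) ([]ReadMultipleResp, error) {
    req := ReadMultipleReq{
        Bucket: bucket,
        Prefix: prefix,
        Files:  files,
    }
    // Restrict the fan-out to reachable, non-healing disks and merge per-file
    // responses with the quorum logic implemented above.
    return readMultipleFiles(ctx, er.getOnlineDisks(), req, readQuorum)
}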