XL: Make listOnlineDisks and outDatedDisks consistent w/ each other. (#3808)

Krishnan Parthasarathi 2017-03-05 04:23:28 +05:30 committed by Harshavardhana
parent b05c1c11d4
commit e3fd4c0dd6
4 changed files with 393 additions and 47 deletions

View file

@@ -16,7 +16,11 @@
package cmd
import "time"
import (
"encoding/hex"
"path/filepath"
"time"
)
// commonTime returns a maximally occurring time from a list of times.
func commonTime(modTimes []time.Time) (modTime time.Time, count int) {
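The body of commonTime falls outside this hunk. A minimal sketch of what such a selection could look like, assuming bootModtimes seeds entries with timeSentinel and that ties favor the later time (as the comment on listOnlineDisks below suggests); this is an illustration, not the committed implementation:

func commonTimeSketch(modTimes []time.Time) (modTime time.Time, count int) {
	// Count how often each modTime occurs, ignoring the sentinel entries
	// that stand in for disks with errors.
	occurrences := make(map[time.Time]int)
	for _, t := range modTimes {
		if t.Equal(timeSentinel) {
			continue
		}
		occurrences[t]++
		// Prefer the higher count; on a tie, prefer the later time.
		if occurrences[t] > count || (occurrences[t] == count && t.After(modTime)) {
			count = occurrences[t]
			modTime = t
		}
	}
	return modTime, count
}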
@@ -59,29 +63,43 @@ func bootModtimes(diskCount int) []time.Time {
}
// Extracts list of times from xlMetaV1 slice and returns, skips
// slice elements which have errors. As a special error
// errFileNotFound is treated as an initial good condition.
// slice elements which have errors.
func listObjectModtimes(partsMetadata []xlMetaV1, errs []error) (modTimes []time.Time) {
modTimes = bootModtimes(len(partsMetadata))
// Set a new time value, specifically set when
// error == errFileNotFound (this is needed when this is a
// fresh PutObject).
timeNow := time.Now().UTC()
for index, metadata := range partsMetadata {
if errs[index] == nil {
// Once the file is found, save the uuid saved on disk.
modTimes[index] = metadata.Stat.ModTime
} else if errs[index] == errFileNotFound {
// Once the file is not found then the epoch is current time.
modTimes[index] = timeNow
if errs[index] != nil {
continue
}
// Once the file is found, save the uuid saved on disk.
modTimes[index] = metadata.Stat.ModTime
}
return modTimes
}
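For illustration only (the helper below is hypothetical and relies on package identifiers visible in this diff): with bootModtimes presumably seeding every slot with timeSentinel, an errored slot keeps its sentinel and only healthy disks contribute a real ModTime:

func exampleObjectModtimes() {
	now := time.Now().UTC()
	partsMetadata := make([]xlMetaV1, 4)
	for i := range partsMetadata {
		partsMetadata[i].Stat.ModTime = now
	}
	// The third disk failed to return its xl.json.
	errs := []error{nil, nil, errFileNotFound, nil}
	modTimes := listObjectModtimes(partsMetadata, errs)
	// modTimes[2] keeps the bootModtimes placeholder; the rest carry now.
	modTime, count := commonTime(modTimes)
	_ = modTime // now
	_ = count   // 3
}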
// Returns slice of online disks needed.
// - slice returning readable disks.
// - modTime of the Object
// Notes:
// There are 5 possible states a disk could be in,
// 1. __online__ - has the latest copy of xl.json - returned by listOnlineDisks
//
// 2. __offline__ - err == errDiskNotFound
//
// 3. __availableWithParts__ - has the latest copy of xl.json and has all
// parts with checksums matching; returned by disksWithAllParts
//
// 4. __outdated__ - returned by outDatedDisks, provided the []StorageAPI
// returned by disksWithAllParts is passed as latestDisks.
// - has an old copy of xl.json
// - doesn't have xl.json (errFileNotFound)
// - has the latest xl.json but one or more parts are corrupt
//
// 5. __missingParts__ - has the latest copy of xl.json but has some parts
// missing. This is identified separately since this may need manual
// inspection to understand the root cause. E.g, this could be due to
// backend filesystem corruption.
// listOnlineDisks - returns
// - a slice of disks where disks having an 'older' xl.json (or none at all)
// are set to nil.
// - latest (in time) of the maximally occurring modTime(s).
func listOnlineDisks(disks []StorageAPI, partsMetadata []xlMetaV1, errs []error) (onlineDisks []StorageAPI, modTime time.Time) {
onlineDisks = make([]StorageAPI, len(disks))
@@ -102,22 +120,23 @@ func listOnlineDisks(disks []StorageAPI, partsMetadata []xlMetaV1, errs []error)
return onlineDisks, modTime
}
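The middle of listOnlineDisks lies between the two hunks above. A plausible reading of it, assuming it simply keeps the disks whose xl.json modTime matches the maximally occurring one and nils out the rest, is:

// Hypothetical reconstruction of the elided portion of listOnlineDisks,
// consistent with the comment above; not taken verbatim from the commit.
modTimes := listObjectModtimes(partsMetadata, errs)
modTime, _ = commonTime(modTimes)
for index, t := range modTimes {
	if t.Equal(modTime) {
		// This disk has the latest xl.json.
		onlineDisks[index] = disks[index]
	} else {
		// Older (or missing) xl.json; treat the disk as not online.
		onlineDisks[index] = nil
	}
}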
// Return disks with the outdated or missing object.
func outDatedDisks(disks []StorageAPI, partsMetadata []xlMetaV1, errs []error) (outDatedDisks []StorageAPI) {
// outDatedDisks - returns disks which don't have the latest object (i.e. xl.json).
// disks that are offline are not 'marked' outdated.
func outDatedDisks(disks, latestDisks []StorageAPI, errs []error, partsMetadata []xlMetaV1,
bucket, object string) (outDatedDisks []StorageAPI) {
outDatedDisks = make([]StorageAPI, len(disks))
latestDisks, _ := listOnlineDisks(disks, partsMetadata, errs)
for index, disk := range latestDisks {
if errorCause(errs[index]) == errFileNotFound {
outDatedDisks[index] = disks[index]
for index, latestDisk := range latestDisks {
if latestDisk != nil {
continue
}
if errs[index] != nil {
continue
}
if disk == nil {
// disk either has an older xl.json or doesn't have one.
switch errorCause(errs[index]) {
case nil, errFileNotFound:
outDatedDisks[index] = disks[index]
}
}
return outDatedDisks
}
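The intended call order, which both healObject and TestListOnlineDisks below follow, is listOnlineDisks, then disksWithAllParts, then outDatedDisks. A condensed sketch of that chain (a fragment with error handling trimmed; identifiers as used elsewhere in this diff):

onlineDisks, modTime := listOnlineDisks(storageDisks, partsMetadata, errs)
availableDisks, errs, err := disksWithAllParts(onlineDisks, partsMetadata, errs, bucket, object)
if err != nil {
	// A disk failed while computing checksums; bail out.
	return err
}
// Disks left nil in availableDisks, but still reachable and readable, are
// the ones that need healing up to modTime's version of the object.
outdatedDisks := outDatedDisks(storageDisks, availableDisks, errs, partsMetadata, bucket, object)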
@@ -189,3 +208,49 @@ func xlHealStat(xl xlObjects, partsMetadata []xlMetaV1, errs []error) HealObject
MissingPartityCount: missingParityCount,
}
}
// disksWithAllParts - This function needs to be called with
// []StorageAPI returned by listOnlineDisks. Returns,
// - disks which have all parts specified in the latest xl.json.
// - errs updated to have errFileNotFound in place of disks that had
// missing parts.
// - non-nil error if any of the online disks failed while
// calculating the blake2b checksum.
func disksWithAllParts(onlineDisks []StorageAPI, partsMetadata []xlMetaV1, errs []error, bucket, object string) ([]StorageAPI, []error, error) {
availableDisks := make([]StorageAPI, len(onlineDisks))
for index, onlineDisk := range onlineDisks {
if onlineDisk == nil {
continue
}
// disk has a valid xl.json but may not have all the
// parts. This is considered an outdated disk, since
// it needs healing too.
for pIndex, part := range partsMetadata[index].Parts {
// compute blake2b sum of part.
partPath := filepath.Join(object, part.Name)
hash := newHash(blake2bAlgo)
blakeBytes, hErr := hashSum(onlineDisk, bucket, partPath, hash)
if hErr == errFileNotFound {
errs[index] = errFileNotFound
continue
}
if hErr != nil && hErr != errFileNotFound {
return nil, nil, traceError(hErr)
}
partChecksum := partsMetadata[index].Erasure.Checksum[pIndex].Hash
blakeSum := hex.EncodeToString(blakeBytes)
// if blake2b sum doesn't match for a part
// then this disk is outdated and needs
// healing.
if blakeSum != partChecksum {
errs[index] = errFileNotFound
break
}
availableDisks[index] = onlineDisk
}
}
return availableDisks, errs, nil
}
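newHash and hashSum are pre-existing helpers that this diff only calls. As a rough standalone equivalent of the per-part verification (the verifyPart helper and the 512-bit digest size are assumptions, not taken from the commit), a blake2b check over a byte slice with golang.org/x/crypto/blake2b looks like this:

package main

import (
	"encoding/hex"
	"fmt"

	"golang.org/x/crypto/blake2b"
)

// verifyPart mirrors the comparison in disksWithAllParts: hash the part's
// bytes and compare the hex digest against the checksum recorded in xl.json.
func verifyPart(partData []byte, expectedHex string) bool {
	sum := blake2b.Sum512(partData)
	return hex.EncodeToString(sum[:]) == expectedHex
}

func main() {
	data := []byte("example part contents")
	sum := blake2b.Sum512(data)
	expected := hex.EncodeToString(sum[:])
	fmt.Println(verifyPart(data, expected))                     // true
	fmt.Println(verifyPart([]byte("corrupted part"), expected)) // false
}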

View file

@@ -17,6 +17,8 @@
package cmd
import (
"bytes"
"path/filepath"
"testing"
"time"
)
@@ -81,3 +83,250 @@ func TestCommonTime(t *testing.T) {
}
}
}
// partsMetaFromModTimes - returns a slice of xlMetaV1 constructed from
// the given modTimes and part checksums.
func partsMetaFromModTimes(modTimes []time.Time, checksums []checkSumInfo) []xlMetaV1 {
var partsMetadata []xlMetaV1
for _, modTime := range modTimes {
partsMetadata = append(partsMetadata, xlMetaV1{
Erasure: erasureInfo{
Checksum: checksums,
},
Stat: statInfo{
ModTime: modTime,
},
Parts: []objectPartInfo{
{
Name: "part.1",
},
},
})
}
return partsMetadata
}
// toPosix - fetches *posix object from StorageAPI.
func toPosix(disk StorageAPI) *posix {
retryDisk, ok := disk.(*retryStorage)
if !ok {
return nil
}
pDisk, ok := retryDisk.remoteStorage.(*posix)
if !ok {
return nil
}
return pDisk
}
// TestListOnlineDisks - checks if listOnlineDisks and outDatedDisks
// are consistent with each other.
func TestListOnlineDisks(t *testing.T) {
rootPath, err := newTestConfig(globalMinioDefaultRegion)
if err != nil {
t.Fatalf("Failed to initialize config - %v", err)
}
defer removeAll(rootPath)
obj, disks, err := prepareXL()
if err != nil {
t.Fatalf("Prepare XL backend failed - %v", err)
}
defer removeRoots(disks)
type tamperKind int
const (
noTamper tamperKind = iota
deletePart tamperKind = iota
corruptPart tamperKind = iota
)
threeNanoSecs := time.Unix(0, 3).UTC()
fourNanoSecs := time.Unix(0, 4).UTC()
modTimesThreeNone := []time.Time{
threeNanoSecs, threeNanoSecs, threeNanoSecs, threeNanoSecs,
threeNanoSecs, threeNanoSecs, threeNanoSecs,
timeSentinel, timeSentinel, timeSentinel, timeSentinel,
timeSentinel, timeSentinel, timeSentinel, timeSentinel,
timeSentinel,
}
modTimesThreeFour := []time.Time{
threeNanoSecs, threeNanoSecs, threeNanoSecs, threeNanoSecs,
threeNanoSecs, threeNanoSecs, threeNanoSecs, threeNanoSecs,
fourNanoSecs, fourNanoSecs, fourNanoSecs, fourNanoSecs,
fourNanoSecs, fourNanoSecs, fourNanoSecs, fourNanoSecs,
}
testCases := []struct {
modTimes []time.Time
expectedTime time.Time
errs []error
_tamperBackend tamperKind
}{
{
modTimes: modTimesThreeFour,
expectedTime: fourNanoSecs,
errs: []error{
nil, nil, nil, nil, nil, nil, nil, nil, nil,
nil, nil, nil, nil, nil, nil, nil,
},
_tamperBackend: noTamper,
},
{
modTimes: modTimesThreeNone,
expectedTime: threeNanoSecs,
errs: []error{
// Disks that have a valid xl.json.
nil, nil, nil, nil, nil, nil, nil,
// Majority of disks don't have xl.json.
errFileNotFound, errFileNotFound,
errFileNotFound, errFileNotFound,
errFileNotFound, errDiskAccessDenied,
errDiskNotFound, errFileNotFound,
errFileNotFound,
},
_tamperBackend: deletePart,
},
{
modTimes: modTimesThreeNone,
expectedTime: threeNanoSecs,
errs: []error{
// Disks that have a valid xl.json.
nil, nil, nil, nil, nil, nil, nil,
// Majority of disks don't have xl.json.
errFileNotFound, errFileNotFound,
errFileNotFound, errFileNotFound,
errFileNotFound, errDiskAccessDenied,
errDiskNotFound, errFileNotFound,
errFileNotFound,
},
_tamperBackend: corruptPart,
},
}
bucket := "bucket"
object := "object"
data := bytes.Repeat([]byte("a"), 1024)
xlDisks := obj.(*xlObjects).storageDisks
for i, test := range testCases {
// Prepare bucket/object backend for the tests below.
// Cleanup from previous test.
obj.DeleteObject(bucket, object)
obj.DeleteBucket(bucket)
err = obj.MakeBucket("bucket")
if err != nil {
t.Fatalf("Failed to make a bucket %v", err)
}
_, err = obj.PutObject(bucket, object, int64(len(data)), bytes.NewReader(data), nil, "")
if err != nil {
t.Fatalf("Failed to putObject %v", err)
}
// Fetch xl.json from first disk to construct partsMetadata for the tests.
xlMeta, err := readXLMeta(xlDisks[0], bucket, object)
if err != nil {
t.Fatalf("Test %d: Failed to read xl.json %v", i+1, err)
}
tamperedIndex := -1
switch test._tamperBackend {
case deletePart:
for index, err := range test.errs {
if err != nil {
continue
}
// Remove a part from a disk
// which has a valid xl.json,
// and check if that disk
// appears in outDatedDisks.
tamperedIndex = index
dErr := xlDisks[index].DeleteFile(bucket, filepath.Join(object, "part.1"))
if dErr != nil {
t.Fatalf("Test %d: Failed to delete %s - %v", i+1,
filepath.Join(object, "part.1"), dErr)
}
break
}
case corruptPart:
for index, err := range test.errs {
if err != nil {
continue
}
// Corrupt a part from a disk
// which has a valid xl.json,
// and check if that disk
// appears in outDatedDisks.
tamperedIndex = index
dErr := xlDisks[index].AppendFile(bucket, filepath.Join(object, "part.1"), []byte("corruption"))
if dErr != nil {
t.Fatalf("Test %d: Failed to append corrupting data at the end of file %s - %v",
i+1, filepath.Join(object, "part.1"), dErr)
}
break
}
}
partsMetadata := partsMetaFromModTimes(test.modTimes, xlMeta.Erasure.Checksum)
onlineDisks, modTime := listOnlineDisks(xlDisks, partsMetadata, test.errs)
availableDisks, newErrs, err := disksWithAllParts(onlineDisks, partsMetadata, test.errs, bucket, object)
test.errs = newErrs
outdatedDisks := outDatedDisks(xlDisks, availableDisks, test.errs, partsMetadata, bucket, object)
if modTime.Equal(timeSentinel) {
t.Fatalf("Test %d: modTime should never be equal to timeSentinel, but found equal",
i+1)
}
if test._tamperBackend != noTamper {
if tamperedIndex != -1 && outdatedDisks[tamperedIndex] == nil {
t.Fatalf("Test %d: disk (%v) with part.1 missing is an outdated disk, but wasn't listed by outDatedDisks",
i+1, xlDisks[tamperedIndex])
}
}
if !modTime.Equal(test.expectedTime) {
t.Fatalf("Test %d: Expected modTime to be equal to %v but was found to be %v",
i+1, test.expectedTime, modTime)
}
// Check if a disk is considered both online and outdated,
// which is a contradiction, except if parts are missing.
overlappingDisks := make(map[string]*posix)
for _, availableDisk := range availableDisks {
if availableDisk == nil {
continue
}
pDisk := toPosix(availableDisk)
overlappingDisks[pDisk.diskPath] = pDisk
}
for index, outdatedDisk := range outdatedDisks {
// Ignore the intentionally tampered disk; it is
// expected to appear as an outdated disk
// since it doesn't have all the parts.
if index == tamperedIndex {
continue
}
if outdatedDisk == nil {
continue
}
pDisk := toPosix(outdatedDisk)
if _, ok := overlappingDisks[pDisk.diskPath]; ok {
t.Errorf("Test %d: Outdated disk %v was also detected as an online disk - %v %v",
i+1, pDisk, availableDisks, outdatedDisks)
}
// Errors other than errFileNotFound don't imply that the disk is outdated.
if test.errs[index] != nil && test.errs[index] != errFileNotFound && outdatedDisk != nil {
t.Errorf("Test %d: error (%v) other than errFileNotFound doesn't imply that the disk (%v) could be outdated",
i+1, test.errs[index], pDisk)
}
}
}
}

View file

@@ -311,6 +311,9 @@ func quickHeal(storageDisks []StorageAPI, writeQuorum int, readQuorum int) error
// Heals an object by healing only the corrupted/missing erasure blocks.
func healObject(storageDisks []StorageAPI, bucket string, object string, quorum int) error {
partsMetadata, errs := readAllXLMetadata(storageDisks, bucket, object)
// readQuorum suffices for xl.json since we use monotonic
// system time to break the tie when a split-brain situation
// arises.
if reducedErr := reduceReadQuorumErrs(errs, nil, quorum); reducedErr != nil {
return toObjectErr(reducedErr, bucket, object)
}
@@ -322,12 +325,35 @@ func healObject(storageDisks []StorageAPI, bucket string, object string, quorum
// List of disks having latest version of the object.
latestDisks, modTime := listOnlineDisks(storageDisks, partsMetadata, errs)
// List of disks having all parts as per latest xl.json.
availableDisks, errs, aErr := disksWithAllParts(latestDisks, partsMetadata, errs, bucket, object)
if aErr != nil {
return toObjectErr(aErr, bucket, object)
}
numAvailableDisks := 0
for _, disk := range availableDisks {
if disk != nil {
numAvailableDisks++
}
}
// If less than read quorum number of disks have all the parts
// of the data, we can't reconstruct the erasure-coded data.
if numAvailableDisks < quorum {
return toObjectErr(errXLReadQuorum, bucket, object)
}
// List of disks having outdated version of the object or missing object.
outDatedDisks := outDatedDisks(storageDisks, partsMetadata, errs)
// Latest xlMetaV1 for reference. If a valid metadata is not present, it is as good as object not found.
outDatedDisks := outDatedDisks(storageDisks, availableDisks, errs, partsMetadata,
bucket, object)
// Latest xlMetaV1 for reference. If a valid metadata is not
// present, it is as good as object not found.
latestMeta, pErr := pickValidXLMeta(partsMetadata, modTime)
if pErr != nil {
return pErr
return toObjectErr(pErr, bucket, object)
}
for index, disk := range outDatedDisks {
@@ -357,16 +383,16 @@ func healObject(storageDisks []StorageAPI, bucket string, object string, quorum
// Delete all the parts. Ignore if parts are not found.
for _, part := range outDatedMeta.Parts {
err := disk.DeleteFile(bucket, pathJoin(object, part.Name))
if err != nil && !isErr(err, errFileNotFound) {
return traceError(err)
dErr := disk.DeleteFile(bucket, pathJoin(object, part.Name))
if dErr != nil && !isErr(dErr, errFileNotFound) {
return toObjectErr(traceError(dErr), bucket, object)
}
}
// Delete xl.json file. Ignore if xl.json not found.
err := disk.DeleteFile(bucket, pathJoin(object, xlMetaJSONFile))
if err != nil && !isErr(err, errFileNotFound) {
return traceError(err)
dErr := disk.DeleteFile(bucket, pathJoin(object, xlMetaJSONFile))
if dErr != nil && !isErr(dErr, errFileNotFound) {
return toObjectErr(traceError(dErr), bucket, object)
}
}
@@ -390,12 +416,12 @@ func healObject(storageDisks []StorageAPI, bucket string, object string, quorum
erasure := latestMeta.Erasure
sumInfo := latestMeta.Erasure.GetCheckSumInfo(partName)
// Heal the part file.
checkSums, err := erasureHealFile(latestDisks, outDatedDisks,
checkSums, hErr := erasureHealFile(latestDisks, outDatedDisks,
bucket, pathJoin(object, partName),
minioMetaTmpBucket, pathJoin(tmpID, partName),
partSize, erasure.BlockSize, erasure.DataBlocks, erasure.ParityBlocks, sumInfo.Algorithm)
if err != nil {
return err
if hErr != nil {
return toObjectErr(hErr, bucket, object)
}
for index, sum := range checkSums {
if outDatedDisks[index] != nil {
@@ -418,9 +444,9 @@ func healObject(storageDisks []StorageAPI, bucket string, object string, quorum
}
// Generate and write `xl.json` generated from other disks.
err := writeUniqueXLMetadata(outDatedDisks, minioMetaTmpBucket, tmpID, partsMetadata, diskCount(outDatedDisks))
if err != nil {
return toObjectErr(err, bucket, object)
aErr = writeUniqueXLMetadata(outDatedDisks, minioMetaTmpBucket, tmpID, partsMetadata, diskCount(outDatedDisks))
if aErr != nil {
return toObjectErr(aErr, bucket, object)
}
// Rename from tmp location to the actual location.
@@ -429,14 +455,14 @@ func healObject(storageDisks []StorageAPI, bucket string, object string, quorum
continue
}
// Remove any lingering partial data from current namespace.
err = disk.DeleteFile(bucket, retainSlash(object))
if err != nil && err != errFileNotFound {
return traceError(err)
aErr = disk.DeleteFile(bucket, retainSlash(object))
if aErr != nil && aErr != errFileNotFound {
return toObjectErr(traceError(aErr), bucket, object)
}
// Attempt a rename now from healed data to final location.
err = disk.RenameFile(minioMetaTmpBucket, retainSlash(tmpID), bucket, retainSlash(object))
if err != nil {
return traceError(err)
aErr = disk.RenameFile(minioMetaTmpBucket, retainSlash(tmpID), bucket, retainSlash(object))
if aErr != nil {
return toObjectErr(traceError(aErr), bucket, object)
}
}
return nil

View file

@@ -265,6 +265,12 @@ func TestPutObjectNoQuorum(t *testing.T) {
// Tests both object and bucket healing.
func TestHealing(t *testing.T) {
rootPath, err := newTestConfig(globalMinioDefaultRegion)
if err != nil {
t.Fatalf("Failed to initialize test config %v", err)
}
defer removeAll(rootPath)
obj, fsDirs, err := prepareXL()
if err != nil {
t.Fatal(err)