From 9693c382a8b5320252076c95269ebdaf75814c59 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Tue, 23 Apr 2024 10:15:52 -0700 Subject: [PATCH] make renameData() more defensive during overwrites (#19548) instead upon any error in renameData(), we still preserve the existing dataDir in some form for recoverability in strange situations such as out of disk space type errors. Bonus: avoid running list and heal() instead allow versions disparity to return the actual versions, uuid to heal. Currently limit this to 100 versions and lesser disparate objects. an undo now reverts back the xl.meta from xl.meta.bkp during overwrites on such flaky setups. Bonus: Save N depth syscalls via skipping the parents upon overwrites and versioned updates. Flaky setup examples are stretch clusters with regular packet drops etc, we need to add some defensive code around to avoid dangling objects. --- .github/workflows/multipart/migrate.sh | 5 +- buildscripts/rewrite-old-new.sh | 5 +- cmd/erasure-metadata-utils.go | 42 ++- cmd/erasure-multipart.go | 22 +- cmd/erasure-object.go | 113 +++--- cmd/global-heal.go | 11 +- cmd/mrf.go | 12 +- cmd/naughty-disk_test.go | 4 +- cmd/storage-datatypes.go | 10 +- cmd/storage-datatypes_gen.go | 129 +++++-- cmd/storage-interface.go | 2 +- cmd/storage-rest-client.go | 8 +- cmd/storage-rest-server.go | 8 +- cmd/xl-storage-disk-id-check.go | 7 +- cmd/xl-storage.go | 325 +++++++++++------- .../replication/setup_3site_replication.sh | 5 +- docs/distributed/decom-compressed-sse-s3.sh | 5 +- docs/distributed/decom-encrypted-sse-s3.sh | 5 +- docs/distributed/decom-encrypted.sh | 5 +- docs/distributed/decom.sh | 5 +- internal/grid/handlers.go | 3 + internal/grid/handlers_string.go | 11 +- 22 files changed, 460 insertions(+), 282 deletions(-) diff --git a/.github/workflows/multipart/migrate.sh b/.github/workflows/multipart/migrate.sh index 49e0d5fec..e1bd484ca 100755 --- a/.github/workflows/multipart/migrate.sh +++ b/.github/workflows/multipart/migrate.sh @@ -24,10 +24,7 @@ if [ ! -f ./mc ]; then chmod +x mc fi -( - cd ./docs/debugging/s3-check-md5 - go install -v -) +go install -v github.com/minio/minio/docs/debugging/s3-check-md5@latest export RELEASE=RELEASE.2023-08-29T23-07-35Z diff --git a/buildscripts/rewrite-old-new.sh b/buildscripts/rewrite-old-new.sh index 0dfef4670..18e0cbddf 100755 --- a/buildscripts/rewrite-old-new.sh +++ b/buildscripts/rewrite-old-new.sh @@ -87,10 +87,7 @@ function verify_rewrite() { exit 1 fi - ( - cd ./docs/debugging/s3-check-md5 - go install -v - ) + go install -v github.com/minio/minio/docs/debugging/s3-check-md5@latest if ! s3-check-md5 \ -debug \ diff --git a/cmd/erasure-metadata-utils.go b/cmd/erasure-metadata-utils.go index 17ceb47da..bc1c29723 100644 --- a/cmd/erasure-metadata-utils.go +++ b/cmd/erasure-metadata-utils.go @@ -19,6 +19,7 @@ package cmd import ( "context" + "encoding/binary" "errors" "hash/crc32" @@ -26,14 +27,17 @@ import ( ) // figure out the most commonVersions across disk that satisfies -// the 'writeQuorum' this function returns '0' if quorum cannot +// the 'writeQuorum' this function returns "" if quorum cannot // be achieved and disks have too many inconsistent versions. -func reduceCommonVersions(diskVersions []uint64, writeQuorum int) (commonVersions uint64) { +func reduceCommonVersions(diskVersions [][]byte, writeQuorum int) (versions []byte) { diskVersionsCount := make(map[uint64]int) for _, versions := range diskVersions { - diskVersionsCount[versions]++ + if len(versions) > 0 { + diskVersionsCount[binary.BigEndian.Uint64(versions)]++ + } } + var commonVersions uint64 max := 0 for versions, count := range diskVersionsCount { if max < count { @@ -43,10 +47,38 @@ func reduceCommonVersions(diskVersions []uint64, writeQuorum int) (commonVersion } if max >= writeQuorum { - return commonVersions + for _, versions := range diskVersions { + if binary.BigEndian.Uint64(versions) == commonVersions { + return versions + } + } } - return 0 + return []byte{} +} + +// figure out the most commonVersions across disk that satisfies +// the 'writeQuorum' this function returns '0' if quorum cannot +// be achieved and disks have too many inconsistent versions. +func reduceCommonDataDir(dataDirs []string, writeQuorum int) (dataDir string) { + dataDirsCount := make(map[string]int) + for _, ddir := range dataDirs { + dataDirsCount[ddir]++ + } + + max := 0 + for ddir, count := range dataDirsCount { + if max < count { + max = count + dataDir = ddir + } + } + + if max >= writeQuorum { + return dataDir + } + + return "" } // Returns number of errors that occurred the most (incl. nil) and the diff --git a/cmd/erasure-multipart.go b/cmd/erasure-multipart.go index 1e4bffd8a..d888f8d92 100644 --- a/cmd/erasure-multipart.go +++ b/cmd/erasure-multipart.go @@ -1311,24 +1311,28 @@ func (er erasureObjects) CompleteMultipartUpload(ctx context.Context, bucket str }() // Rename the multipart object to final location. - onlineDisks, versionsDisparity, err := renameData(ctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath, + onlineDisks, versions, oldDataDir, err := renameData(ctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath, partsMetadata, bucket, object, writeQuorum) if err != nil { return oi, toObjectErr(err, bucket, object) } - if !opts.Speedtest && versionsDisparity { + if err = er.commitRenameDataDir(ctx, bucket, object, oldDataDir, onlineDisks); err != nil { + return ObjectInfo{}, toObjectErr(err, bucket, object) + } + + if !opts.Speedtest && len(versions) > 0 { globalMRFState.addPartialOp(partialOperation{ - bucket: bucket, - object: object, - queued: time.Now(), - allVersions: true, - setIndex: er.setIndex, - poolIndex: er.poolIndex, + bucket: bucket, + object: object, + queued: time.Now(), + versions: versions, + setIndex: er.setIndex, + poolIndex: er.poolIndex, }) } - if !opts.Speedtest && !versionsDisparity { + if !opts.Speedtest && len(versions) == 0 { // Check if there is any offline disk and add it to the MRF list for _, disk := range onlineDisks { if disk != nil && disk.IsOnline() { diff --git a/cmd/erasure-object.go b/cmd/erasure-object.go index 7f7eead84..82681dd0d 100644 --- a/cmd/erasure-object.go +++ b/cmd/erasure-object.go @@ -48,7 +48,6 @@ import ( "github.com/minio/minio/internal/logger" "github.com/minio/pkg/v2/mimedb" "github.com/minio/pkg/v2/sync/errgroup" - "github.com/minio/pkg/v2/wildcard" ) // list all errors which can be ignored in object operations. @@ -1006,7 +1005,7 @@ func (er erasureObjects) getObjectInfoAndQuorum(ctx context.Context, bucket, obj } // Similar to rename but renames data from srcEntry to dstEntry at dataDir -func renameData(ctx context.Context, disks []StorageAPI, srcBucket, srcEntry string, metadata []FileInfo, dstBucket, dstEntry string, writeQuorum int) ([]StorageAPI, bool, error) { +func renameData(ctx context.Context, disks []StorageAPI, srcBucket, srcEntry string, metadata []FileInfo, dstBucket, dstEntry string, writeQuorum int) ([]StorageAPI, []byte, string, error) { g := errgroup.WithNErrs(len(disks)) fvID := mustGetUUID() @@ -1014,7 +1013,8 @@ func renameData(ctx context.Context, disks []StorageAPI, srcBucket, srcEntry str metadata[index].SetTierFreeVersionID(fvID) } - diskVersions := make([]uint64, len(disks)) + diskVersions := make([][]byte, len(disks)) + dataDirs := make([]string, len(disks)) // Rename file on all underlying storage disks. for index := range disks { index := index @@ -1033,11 +1033,12 @@ func renameData(ctx context.Context, disks []StorageAPI, srcBucket, srcEntry str if !fi.IsValid() { return errFileCorrupt } - sign, err := disks[index].RenameData(ctx, srcBucket, srcEntry, fi, dstBucket, dstEntry, RenameOptions{}) + resp, err := disks[index].RenameData(ctx, srcBucket, srcEntry, fi, dstBucket, dstEntry, RenameOptions{}) if err != nil { return err } - diskVersions[index] = sign + diskVersions[index] = resp.Sign + dataDirs[index] = resp.OldDataDir return nil }, index) } @@ -1045,8 +1046,6 @@ func renameData(ctx context.Context, disks []StorageAPI, srcBucket, srcEntry str // Wait for all renames to finish. errs := g.Wait() - var versionsDisparity bool - err := reduceWriteQuorumErrs(ctx, errs, objectOpIgnoredErrs, writeQuorum) if err != nil { dg := errgroup.WithNErrs(len(disks)) @@ -1060,27 +1059,35 @@ func renameData(ctx context.Context, disks []StorageAPI, srcBucket, srcEntry str // caller this dangling object will be now scheduled to be removed // via active healing. dg.Go(func() error { - return disks[index].DeleteVersion(context.Background(), dstBucket, dstEntry, metadata[index], false, DeleteOptions{UndoWrite: true}) + return disks[index].DeleteVersion(context.Background(), dstBucket, dstEntry, metadata[index], false, DeleteOptions{ + UndoWrite: true, + OldDataDir: dataDirs[index], + }) }, index) } dg.Wait() } + var dataDir string + var versions []byte if err == nil { - versions := reduceCommonVersions(diskVersions, writeQuorum) + versions = reduceCommonVersions(diskVersions, writeQuorum) for index, dversions := range diskVersions { if errs[index] != nil { continue } - if versions != dversions { - versionsDisparity = true + if !bytes.Equal(dversions, versions) { + if len(dversions) > len(versions) { + versions = dversions + } break } } + dataDir = reduceCommonDataDir(dataDirs, writeQuorum) } // We can safely allow RenameData errors up to len(er.getDisks()) - writeQuorum // otherwise return failure. - return evalDisks(disks, errs), versionsDisparity, err + return evalDisks(disks, errs), versions, dataDir, err } func (er erasureObjects) putMetacacheObject(ctx context.Context, key string, r *PutObjReader, opts ObjectOptions) (objInfo ObjectInfo, err error) { @@ -1229,44 +1236,6 @@ func (er erasureObjects) PutObject(ctx context.Context, bucket string, object st return er.putObject(ctx, bucket, object, data, opts) } -// Heal up to two versions of one object when there is disparity between disks -func healObjectVersionsDisparity(bucket string, entry metaCacheEntry, scanMode madmin.HealScanMode) error { - if entry.isDir() { - return nil - } - // We might land at .metacache, .trash, .multipart - // no need to heal them skip, only when bucket - // is '.minio.sys' - if bucket == minioMetaBucket { - if wildcard.Match("buckets/*/.metacache/*", entry.name) { - return nil - } - if wildcard.Match("tmp/*", entry.name) { - return nil - } - if wildcard.Match("multipart/*", entry.name) { - return nil - } - if wildcard.Match("tmp-old/*", entry.name) { - return nil - } - } - - fivs, err := entry.fileInfoVersions(bucket) - if err != nil { - healObject(bucket, entry.name, "", madmin.HealDeepScan) - return err - } - - if len(fivs.Versions) <= 2 { - for _, version := range fivs.Versions { - healObject(bucket, entry.name, version.VersionID, scanMode) - } - } - - return nil -} - // putObject wrapper for erasureObjects PutObject func (er erasureObjects) putObject(ctx context.Context, bucket string, object string, r *PutObjReader, opts ObjectOptions) (objInfo ObjectInfo, err error) { if !opts.NoAuditLog { @@ -1547,7 +1516,7 @@ func (er erasureObjects) putObject(ctx context.Context, bucket string, object st } // Rename the successfully written temporary object to final location. - onlineDisks, versionsDisparity, err := renameData(ctx, onlineDisks, minioMetaTmpBucket, tempObj, partsMetadata, bucket, object, writeQuorum) + onlineDisks, versions, oldDataDir, err := renameData(ctx, onlineDisks, minioMetaTmpBucket, tempObj, partsMetadata, bucket, object, writeQuorum) if err != nil { if errors.Is(err, errFileNotFound) { return ObjectInfo{}, toObjectErr(errErasureWriteQuorum, bucket, object) @@ -1555,6 +1524,10 @@ func (er erasureObjects) putObject(ctx context.Context, bucket string, object st return ObjectInfo{}, toObjectErr(err, bucket, object) } + if err = er.commitRenameDataDir(ctx, bucket, object, oldDataDir, onlineDisks); err != nil { + return ObjectInfo{}, toObjectErr(err, bucket, object) + } + for i := 0; i < len(onlineDisks); i++ { if onlineDisks[i] != nil && onlineDisks[i].IsOnline() { // Object info is the same in all disks, so we can pick @@ -1569,7 +1542,7 @@ func (er erasureObjects) putObject(ctx context.Context, bucket string, object st // When there is versions disparity we are healing // the content implicitly for all versions, we can // avoid triggering another MRF heal for offline drives. - if !versionsDisparity { + if len(versions) == 0 { // Whether a disk was initially or becomes offline // during this upload, send it to the MRF list. for i := 0; i < len(onlineDisks); i++ { @@ -1582,12 +1555,12 @@ func (er erasureObjects) putObject(ctx context.Context, bucket string, object st } } else { globalMRFState.addPartialOp(partialOperation{ - bucket: bucket, - object: object, - queued: time.Now(), - allVersions: true, - setIndex: er.setIndex, - poolIndex: er.poolIndex, + bucket: bucket, + object: object, + queued: time.Now(), + versions: versions, + setIndex: er.setIndex, + poolIndex: er.poolIndex, }) } } @@ -1797,6 +1770,30 @@ func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objec return dobjects, errs } +func (er erasureObjects) commitRenameDataDir(ctx context.Context, bucket, object, dataDir string, onlineDisks []StorageAPI) error { + if dataDir == "" { + return nil + } + g := errgroup.WithNErrs(len(onlineDisks)) + for index := range onlineDisks { + index := index + g.Go(func() error { + if onlineDisks[index] == nil { + return nil + } + return onlineDisks[index].Delete(ctx, bucket, pathJoin(object, dataDir), DeleteOptions{ + Recursive: true, + }) + }, index) + } + for _, err := range g.Wait() { + if err != nil { + return err + } + } + return nil +} + func (er erasureObjects) deletePrefix(ctx context.Context, bucket, prefix string) error { disks := er.getDisks() g := errgroup.WithNErrs(len(disks)) diff --git a/cmd/global-heal.go b/cmd/global-heal.go index ea1d1d97f..74dd6d070 100644 --- a/cmd/global-heal.go +++ b/cmd/global-heal.go @@ -530,16 +530,7 @@ func healObject(bucket, object, versionID string, scan madmin.HealScanMode) erro // Get background heal sequence to send elements to heal bgSeq, ok := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID) if ok { - return bgSeq.queueHealTask(healSource{ - bucket: bucket, - object: object, - versionID: versionID, - noWait: true, // do not block callers. - opts: &madmin.HealOpts{ - Remove: healDeleteDangling, // if found dangling purge it. - ScanMode: scan, - }, - }, madmin.HealItemObject) + return bgSeq.healObject(bucket, object, versionID, scan) } return nil } diff --git a/cmd/mrf.go b/cmd/mrf.go index 12da25d50..21fdde819 100644 --- a/cmd/mrf.go +++ b/cmd/mrf.go @@ -21,6 +21,7 @@ import ( "context" "time" + "github.com/google/uuid" "github.com/minio/madmin-go/v3" "github.com/minio/pkg/v2/wildcard" ) @@ -35,7 +36,7 @@ type partialOperation struct { bucket string object string versionID string - allVersions bool + versions []byte setIndex, poolIndex int queued time.Time scanMode madmin.HealScanMode @@ -111,8 +112,13 @@ func (m *mrfState) healRoutine(z *erasureServerPools) { if u.object == "" { healBucket(u.bucket, scan) } else { - if u.allVersions { - z.serverPools[u.poolIndex].sets[u.setIndex].listAndHeal(u.bucket, u.object, u.scanMode, healObjectVersionsDisparity) + if len(u.versions) > 0 { + vers := len(u.versions) / 16 + if vers > 0 { + for i := 0; i < vers; i++ { + healObject(u.bucket, u.object, uuid.UUID(u.versions[16*i:]).String(), scan) + } + } } else { healObject(u.bucket, u.object, u.versionID, scan) } diff --git a/cmd/naughty-disk_test.go b/cmd/naughty-disk_test.go index 6cf271671..86c577eca 100644 --- a/cmd/naughty-disk_test.go +++ b/cmd/naughty-disk_test.go @@ -201,9 +201,9 @@ func (d *naughtyDisk) AppendFile(ctx context.Context, volume string, path string return d.disk.AppendFile(ctx, volume, path, buf) } -func (d *naughtyDisk) RenameData(ctx context.Context, srcVolume, srcPath string, fi FileInfo, dstVolume, dstPath string, opts RenameOptions) (uint64, error) { +func (d *naughtyDisk) RenameData(ctx context.Context, srcVolume, srcPath string, fi FileInfo, dstVolume, dstPath string, opts RenameOptions) (RenameDataResp, error) { if err := d.calcError(); err != nil { - return 0, err + return RenameDataResp{}, err } return d.disk.RenameData(ctx, srcVolume, srcPath, fi, dstVolume, dstPath, opts) } diff --git a/cmd/storage-datatypes.go b/cmd/storage-datatypes.go index b2717b734..4f1accffc 100644 --- a/cmd/storage-datatypes.go +++ b/cmd/storage-datatypes.go @@ -33,6 +33,8 @@ type DeleteOptions struct { Recursive bool `msg:"r"` Immediate bool `msg:"i"` UndoWrite bool `msg:"u"` + // OldDataDir of the previous object + OldDataDir string `msg:"o,omitempty"` // old data dir used only when to revert a rename() } // BaseOptions represents common options for all Storage API calls @@ -490,8 +492,14 @@ type WriteAllHandlerParams struct { } // RenameDataResp - RenameData()'s response. +// Provides information about the final state of Rename() +// - on xl.meta (array of versions) on disk to check for version disparity +// - on rewrite dataDir on disk that must be additionally purged +// only after as a 2-phase call, allowing the older dataDir to +// hang-around in-case we need some form of recovery. type RenameDataResp struct { - Signature uint64 `msg:"sig"` + Sign []byte + OldDataDir string // contains '', it is designed to be passed as value to Delete(bucket, pathJoin(object, dataDir)) } // LocalDiskIDs - GetLocalIDs response. diff --git a/cmd/storage-datatypes_gen.go b/cmd/storage-datatypes_gen.go index 7bbeb3ebd..e9d584f2c 100644 --- a/cmd/storage-datatypes_gen.go +++ b/cmd/storage-datatypes_gen.go @@ -514,6 +514,12 @@ func (z *DeleteOptions) DecodeMsg(dc *msgp.Reader) (err error) { err = msgp.WrapError(err, "UndoWrite") return } + case "o": + z.OldDataDir, err = dc.ReadString() + if err != nil { + err = msgp.WrapError(err, "OldDataDir") + return + } default: err = dc.Skip() if err != nil { @@ -527,9 +533,24 @@ func (z *DeleteOptions) DecodeMsg(dc *msgp.Reader) (err error) { // EncodeMsg implements msgp.Encodable func (z *DeleteOptions) EncodeMsg(en *msgp.Writer) (err error) { - // map header, size 4 + // check for omitted fields + zb0001Len := uint32(5) + var zb0001Mask uint8 /* 5 bits */ + _ = zb0001Mask + if z.OldDataDir == "" { + zb0001Len-- + zb0001Mask |= 0x10 + } + // variable map header, size zb0001Len + err = en.Append(0x80 | uint8(zb0001Len)) + if err != nil { + return + } + if zb0001Len == 0 { + return + } // write "BaseOptions" - err = en.Append(0x84, 0xab, 0x42, 0x61, 0x73, 0x65, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73) + err = en.Append(0xab, 0x42, 0x61, 0x73, 0x65, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73) if err != nil { return } @@ -569,15 +590,39 @@ func (z *DeleteOptions) EncodeMsg(en *msgp.Writer) (err error) { err = msgp.WrapError(err, "UndoWrite") return } + if (zb0001Mask & 0x10) == 0 { // if not omitted + // write "o" + err = en.Append(0xa1, 0x6f) + if err != nil { + return + } + err = en.WriteString(z.OldDataDir) + if err != nil { + err = msgp.WrapError(err, "OldDataDir") + return + } + } return } // MarshalMsg implements msgp.Marshaler func (z *DeleteOptions) MarshalMsg(b []byte) (o []byte, err error) { o = msgp.Require(b, z.Msgsize()) - // map header, size 4 + // check for omitted fields + zb0001Len := uint32(5) + var zb0001Mask uint8 /* 5 bits */ + _ = zb0001Mask + if z.OldDataDir == "" { + zb0001Len-- + zb0001Mask |= 0x10 + } + // variable map header, size zb0001Len + o = append(o, 0x80|uint8(zb0001Len)) + if zb0001Len == 0 { + return + } // string "BaseOptions" - o = append(o, 0x84, 0xab, 0x42, 0x61, 0x73, 0x65, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73) + o = append(o, 0xab, 0x42, 0x61, 0x73, 0x65, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73) // map header, size 0 _ = z.BaseOptions o = append(o, 0x80) @@ -590,6 +635,11 @@ func (z *DeleteOptions) MarshalMsg(b []byte) (o []byte, err error) { // string "u" o = append(o, 0xa1, 0x75) o = msgp.AppendBool(o, z.UndoWrite) + if (zb0001Mask & 0x10) == 0 { // if not omitted + // string "o" + o = append(o, 0xa1, 0x6f) + o = msgp.AppendString(o, z.OldDataDir) + } return } @@ -652,6 +702,12 @@ func (z *DeleteOptions) UnmarshalMsg(bts []byte) (o []byte, err error) { err = msgp.WrapError(err, "UndoWrite") return } + case "o": + z.OldDataDir, bts, err = msgp.ReadStringBytes(bts) + if err != nil { + err = msgp.WrapError(err, "OldDataDir") + return + } default: bts, err = msgp.Skip(bts) if err != nil { @@ -666,7 +722,7 @@ func (z *DeleteOptions) UnmarshalMsg(bts []byte) (o []byte, err error) { // Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message func (z *DeleteOptions) Msgsize() (s int) { - s = 1 + 12 + 1 + 2 + msgp.BoolSize + 2 + msgp.BoolSize + 2 + msgp.BoolSize + s = 1 + 12 + 1 + 2 + msgp.BoolSize + 2 + msgp.BoolSize + 2 + msgp.BoolSize + 2 + msgp.StringPrefixSize + len(z.OldDataDir) return } @@ -4751,10 +4807,16 @@ func (z *RenameDataResp) DecodeMsg(dc *msgp.Reader) (err error) { return } switch msgp.UnsafeString(field) { - case "sig": - z.Signature, err = dc.ReadUint64() + case "Sign": + z.Sign, err = dc.ReadBytes(z.Sign) if err != nil { - err = msgp.WrapError(err, "Signature") + err = msgp.WrapError(err, "Sign") + return + } + case "OldDataDir": + z.OldDataDir, err = dc.ReadString() + if err != nil { + err = msgp.WrapError(err, "OldDataDir") return } default: @@ -4769,28 +4831,41 @@ func (z *RenameDataResp) DecodeMsg(dc *msgp.Reader) (err error) { } // EncodeMsg implements msgp.Encodable -func (z RenameDataResp) EncodeMsg(en *msgp.Writer) (err error) { - // map header, size 1 - // write "sig" - err = en.Append(0x81, 0xa3, 0x73, 0x69, 0x67) +func (z *RenameDataResp) EncodeMsg(en *msgp.Writer) (err error) { + // map header, size 2 + // write "Sign" + err = en.Append(0x82, 0xa4, 0x53, 0x69, 0x67, 0x6e) if err != nil { return } - err = en.WriteUint64(z.Signature) + err = en.WriteBytes(z.Sign) if err != nil { - err = msgp.WrapError(err, "Signature") + err = msgp.WrapError(err, "Sign") + return + } + // write "OldDataDir" + err = en.Append(0xaa, 0x4f, 0x6c, 0x64, 0x44, 0x61, 0x74, 0x61, 0x44, 0x69, 0x72) + if err != nil { + return + } + err = en.WriteString(z.OldDataDir) + if err != nil { + err = msgp.WrapError(err, "OldDataDir") return } return } // MarshalMsg implements msgp.Marshaler -func (z RenameDataResp) MarshalMsg(b []byte) (o []byte, err error) { +func (z *RenameDataResp) MarshalMsg(b []byte) (o []byte, err error) { o = msgp.Require(b, z.Msgsize()) - // map header, size 1 - // string "sig" - o = append(o, 0x81, 0xa3, 0x73, 0x69, 0x67) - o = msgp.AppendUint64(o, z.Signature) + // map header, size 2 + // string "Sign" + o = append(o, 0x82, 0xa4, 0x53, 0x69, 0x67, 0x6e) + o = msgp.AppendBytes(o, z.Sign) + // string "OldDataDir" + o = append(o, 0xaa, 0x4f, 0x6c, 0x64, 0x44, 0x61, 0x74, 0x61, 0x44, 0x69, 0x72) + o = msgp.AppendString(o, z.OldDataDir) return } @@ -4812,10 +4887,16 @@ func (z *RenameDataResp) UnmarshalMsg(bts []byte) (o []byte, err error) { return } switch msgp.UnsafeString(field) { - case "sig": - z.Signature, bts, err = msgp.ReadUint64Bytes(bts) + case "Sign": + z.Sign, bts, err = msgp.ReadBytesBytes(bts, z.Sign) if err != nil { - err = msgp.WrapError(err, "Signature") + err = msgp.WrapError(err, "Sign") + return + } + case "OldDataDir": + z.OldDataDir, bts, err = msgp.ReadStringBytes(bts) + if err != nil { + err = msgp.WrapError(err, "OldDataDir") return } default: @@ -4831,8 +4912,8 @@ func (z *RenameDataResp) UnmarshalMsg(bts []byte) (o []byte, err error) { } // Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message -func (z RenameDataResp) Msgsize() (s int) { - s = 1 + 4 + msgp.Uint64Size +func (z *RenameDataResp) Msgsize() (s int) { + s = 1 + 5 + msgp.BytesPrefixSize + len(z.Sign) + 11 + msgp.StringPrefixSize + len(z.OldDataDir) return } diff --git a/cmd/storage-interface.go b/cmd/storage-interface.go index 5dbfd831f..e8117b99c 100644 --- a/cmd/storage-interface.go +++ b/cmd/storage-interface.go @@ -85,7 +85,7 @@ type StorageAPI interface { UpdateMetadata(ctx context.Context, volume, path string, fi FileInfo, opts UpdateMetadataOpts) error ReadVersion(ctx context.Context, origvolume, volume, path, versionID string, opts ReadOptions) (FileInfo, error) ReadXL(ctx context.Context, volume, path string, readData bool) (RawFileInfo, error) - RenameData(ctx context.Context, srcVolume, srcPath string, fi FileInfo, dstVolume, dstPath string, opts RenameOptions) (uint64, error) + RenameData(ctx context.Context, srcVolume, srcPath string, fi FileInfo, dstVolume, dstPath string, opts RenameOptions) (RenameDataResp, error) // File operations. ListDir(ctx context.Context, origvolume, volume, dirPath string, count int) ([]string, error) diff --git a/cmd/storage-rest-client.go b/cmd/storage-rest-client.go index c6604cb26..3cd4dec9e 100644 --- a/cmd/storage-rest-client.go +++ b/cmd/storage-rest-client.go @@ -440,7 +440,9 @@ func (client *storageRESTClient) CheckParts(ctx context.Context, volume string, } // RenameData - rename source path to destination path atomically, metadata and data file. -func (client *storageRESTClient) RenameData(ctx context.Context, srcVolume, srcPath string, fi FileInfo, dstVolume, dstPath string, opts RenameOptions) (sign uint64, err error) { +func (client *storageRESTClient) RenameData(ctx context.Context, srcVolume, srcPath string, fi FileInfo, + dstVolume, dstPath string, opts RenameOptions, +) (res RenameDataResp, err error) { params := RenameDataHandlerParams{ DiskID: *client.diskID.Load(), SrcVolume: srcVolume, @@ -457,11 +459,11 @@ func (client *storageRESTClient) RenameData(ctx context.Context, srcVolume, srcP resp, err = storageRenameDataInlineRPC.Call(ctx, client.gridConn, &RenameDataInlineHandlerParams{params}) } if err != nil { - return 0, toStorageErr(err) + return res, toStorageErr(err) } defer storageRenameDataRPC.PutResponse(resp) - return resp.Signature, nil + return *resp, nil } // where we keep old *Readers diff --git a/cmd/storage-rest-server.go b/cmd/storage-rest-server.go index 0cebce74d..ba829fc1b 100644 --- a/cmd/storage-rest-server.go +++ b/cmd/storage-rest-server.go @@ -67,7 +67,7 @@ var ( storageWriteAllRPC = grid.NewSingleHandler[*WriteAllHandlerParams, grid.NoPayload](grid.HandlerWriteAll, func() *WriteAllHandlerParams { return &WriteAllHandlerParams{} }, grid.NewNoPayload) storageReadVersionRPC = grid.NewSingleHandler[*grid.MSS, *FileInfo](grid.HandlerReadVersion, grid.NewMSS, func() *FileInfo { return &FileInfo{} }) storageReadXLRPC = grid.NewSingleHandler[*grid.MSS, *RawFileInfo](grid.HandlerReadXL, grid.NewMSS, func() *RawFileInfo { return &RawFileInfo{} }) - storageRenameDataRPC = grid.NewSingleHandler[*RenameDataHandlerParams, *RenameDataResp](grid.HandlerRenameData, func() *RenameDataHandlerParams { return &RenameDataHandlerParams{} }, func() *RenameDataResp { return &RenameDataResp{} }) + storageRenameDataRPC = grid.NewSingleHandler[*RenameDataHandlerParams, *RenameDataResp](grid.HandlerRenameData2, func() *RenameDataHandlerParams { return &RenameDataHandlerParams{} }, func() *RenameDataResp { return &RenameDataResp{} }) storageRenameDataInlineRPC = grid.NewSingleHandler[*RenameDataInlineHandlerParams, *RenameDataResp](grid.HandlerRenameDataInline, newRenameDataInlineHandlerParams, func() *RenameDataResp { return &RenameDataResp{} }).AllowCallRequestPool(false) storageRenameFileRPC = grid.NewSingleHandler[*RenameFileHandlerParams, grid.NoPayload](grid.HandlerRenameFile, func() *RenameFileHandlerParams { return &RenameFileHandlerParams{} }, grid.NewNoPayload).AllowCallRequestPool(true) storageStatVolRPC = grid.NewSingleHandler[*grid.MSS, *VolInfo](grid.HandlerStatVol, grid.NewMSS, func() *VolInfo { return &VolInfo{} }) @@ -695,10 +695,8 @@ func (s *storageRESTServer) RenameDataHandler(p *RenameDataHandlerParams) (*Rena return nil, grid.NewRemoteErr(errDiskNotFound) } - sign, err := s.getStorage().RenameData(context.Background(), p.SrcVolume, p.SrcPath, p.FI, p.DstVolume, p.DstPath, p.Opts) - return &RenameDataResp{ - Signature: sign, - }, grid.NewRemoteErr(err) + resp, err := s.getStorage().RenameData(context.Background(), p.SrcVolume, p.SrcPath, p.FI, p.DstVolume, p.DstPath, p.Opts) + return &resp, grid.NewRemoteErr(err) } // RenameDataInlineHandler - renames a meta object and data dir to destination. diff --git a/cmd/xl-storage-disk-id-check.go b/cmd/xl-storage-disk-id-check.go index 8aa2cb9ec..0e9d7aab9 100644 --- a/cmd/xl-storage-disk-id-check.go +++ b/cmd/xl-storage-disk-id-check.go @@ -461,10 +461,10 @@ func (p *xlStorageDiskIDCheck) RenameFile(ctx context.Context, srcVolume, srcPat return w.Run(func() error { return p.storage.RenameFile(ctx, srcVolume, srcPath, dstVolume, dstPath) }) } -func (p *xlStorageDiskIDCheck) RenameData(ctx context.Context, srcVolume, srcPath string, fi FileInfo, dstVolume, dstPath string, opts RenameOptions) (sign uint64, err error) { +func (p *xlStorageDiskIDCheck) RenameData(ctx context.Context, srcVolume, srcPath string, fi FileInfo, dstVolume, dstPath string, opts RenameOptions) (res RenameDataResp, err error) { ctx, done, err := p.TrackDiskHealth(ctx, storageMetricRenameData, srcPath, fi.DataDir, dstVolume, dstPath) if err != nil { - return 0, err + return res, err } defer func() { if err == nil && !skipAccessChecks(dstVolume) { @@ -472,11 +472,12 @@ func (p *xlStorageDiskIDCheck) RenameData(ctx context.Context, srcVolume, srcPat } done(&err) }() + // Copy inline data to a new buffer to function with deadlines. if len(fi.Data) > 0 { fi.Data = append(grid.GetByteBufferCap(len(fi.Data))[:0], fi.Data...) } - return xioutil.WithDeadline[uint64](ctx, globalDriveConfig.GetMaxTimeout(), func(ctx context.Context) (result uint64, err error) { + return xioutil.WithDeadline[RenameDataResp](ctx, globalDriveConfig.GetMaxTimeout(), func(ctx context.Context) (res RenameDataResp, err error) { if len(fi.Data) > 0 { defer grid.PutByteBuffer(fi.Data) } diff --git a/cmd/xl-storage.go b/cmd/xl-storage.go index 044ea2832..1f03b2adf 100644 --- a/cmd/xl-storage.go +++ b/cmd/xl-storage.go @@ -29,6 +29,7 @@ import ( pathutil "path" "path/filepath" "runtime" + "slices" "strconv" "strings" "sync" @@ -48,7 +49,6 @@ import ( xioutil "github.com/minio/minio/internal/ioutil" "github.com/minio/minio/internal/logger" "github.com/pkg/xattr" - "github.com/zeebo/xxh3" ) const ( @@ -65,6 +65,9 @@ const ( // XL metadata file carries per object metadata. xlStorageFormatFile = "xl.meta" + + // XL metadata file backup file carries previous per object metadata. + xlStorageFormatFileBackup = "xl.meta.bkp" ) var alignedBuf []byte @@ -1366,6 +1369,10 @@ func (s *xlStorage) DeleteVersion(ctx context.Context, volume, path string, fi F return s.WriteAll(ctx, volume, pathJoin(path, xlStorageFormatFile), buf) } + if opts.UndoWrite && opts.OldDataDir != "" { + return renameAll(pathJoin(filePath, opts.OldDataDir, xlStorageFormatFileBackup), pathJoin(filePath, xlStorageFormatFile), filePath) + } + return s.deleteFile(volumeDir, pathJoin(volumeDir, path, xlStorageFormatFile), true, false) } @@ -1375,12 +1382,21 @@ func (s *xlStorage) UpdateMetadata(ctx context.Context, volume, path string, fi return errInvalidArgument } + volumeDir, err := s.getVolDir(volume) + if err != nil { + return err + } + + // Validate file path length, before reading. + filePath := pathJoin(volumeDir, path) + if err = checkPathLength(filePath); err != nil { + return err + } + buf, err := s.ReadAll(ctx, volume, pathJoin(path, xlStorageFormatFile)) if err != nil { - if err == errFileNotFound { - if fi.VersionID != "" { - return errFileVersionNotFound - } + if err == errFileNotFound && fi.VersionID != "" { + return errFileVersionNotFound } return err } @@ -1405,7 +1421,7 @@ func (s *xlStorage) UpdateMetadata(ctx context.Context, volume, path string, fi } defer metaDataPoolPut(wbuf) - return s.writeAll(ctx, volume, pathJoin(path, xlStorageFormatFile), wbuf, !opts.NoPersistence) + return s.writeAll(ctx, volume, pathJoin(path, xlStorageFormatFile), wbuf, !opts.NoPersistence, volumeDir) } // WriteMetadata - writes FileInfo metadata for path at `xl.meta` @@ -1438,7 +1454,7 @@ func (s *xlStorage) WriteMetadata(ctx context.Context, origvolume, volume, path // this is currently used by // - emphemeral objects such as objects created during listObjects() calls // - newMultipartUpload() call.. - return s.writeAll(ctx, volume, pathJoin(path, xlStorageFormatFile), buf, false) + return s.writeAll(ctx, volume, pathJoin(path, xlStorageFormatFile), buf, false, "") } buf, err := s.ReadAll(ctx, volume, pathJoin(path, xlStorageFormatFile)) @@ -1936,14 +1952,17 @@ func (s *xlStorage) openFileDirect(path string, mode int) (f *os.File, err error return w, nil } -func (s *xlStorage) openFileSync(filePath string, mode int) (f *os.File, err error) { - return s.openFile(filePath, mode|writeMode) +func (s *xlStorage) openFileSync(filePath string, mode int, skipParent string) (f *os.File, err error) { + return s.openFile(filePath, mode|writeMode, skipParent) } -func (s *xlStorage) openFile(filePath string, mode int) (f *os.File, err error) { +func (s *xlStorage) openFile(filePath string, mode int, skipParent string) (f *os.File, err error) { + if skipParent == "" { + skipParent = s.drivePath + } // Create top level directories if they don't exist. // with mode 0777 mkdir honors system umask. - if err = mkdirAll(pathutil.Dir(filePath), 0o777, s.drivePath); err != nil { + if err = mkdirAll(pathutil.Dir(filePath), 0o777, skipParent); err != nil { return nil, osErrToFileErr(err) } @@ -2095,18 +2114,22 @@ func (s *xlStorage) CreateFile(ctx context.Context, origvolume, volume, path str } }() - return s.writeAllDirect(ctx, filePath, fileSize, r, os.O_CREATE|os.O_WRONLY|os.O_EXCL) + return s.writeAllDirect(ctx, filePath, fileSize, r, os.O_CREATE|os.O_WRONLY|os.O_EXCL, volumeDir) } -func (s *xlStorage) writeAllDirect(ctx context.Context, filePath string, fileSize int64, r io.Reader, flags int) (err error) { +func (s *xlStorage) writeAllDirect(ctx context.Context, filePath string, fileSize int64, r io.Reader, flags int, skipParent string) (err error) { if contextCanceled(ctx) { return ctx.Err() } + if skipParent == "" { + skipParent = s.drivePath + } + // Create top level directories if they don't exist. // with mode 0777 mkdir honors system umask. parentFilePath := pathutil.Dir(filePath) - if err = mkdirAll(parentFilePath, 0o777, s.drivePath); err != nil { + if err = mkdirAll(parentFilePath, 0o777, skipParent); err != nil { return osErrToFileErr(err) } @@ -2168,7 +2191,7 @@ func (s *xlStorage) writeAllDirect(ctx context.Context, filePath string, fileSiz return w.Close() } -func (s *xlStorage) writeAll(ctx context.Context, volume string, path string, b []byte, sync bool) (err error) { +func (s *xlStorage) writeAll(ctx context.Context, volume string, path string, b []byte, sync bool, skipParent string) (err error) { if contextCanceled(ctx) { return ctx.Err() } @@ -2194,11 +2217,11 @@ func (s *xlStorage) writeAll(ctx context.Context, volume string, path string, b // This is an optimization mainly to ensure faster I/O. if len(b) > xioutil.DirectioAlignSize { r := bytes.NewReader(b) - return s.writeAllDirect(ctx, filePath, r.Size(), r, flags) + return s.writeAllDirect(ctx, filePath, r.Size(), r, flags, skipParent) } - w, err = s.openFileSync(filePath, flags) + w, err = s.openFileSync(filePath, flags, skipParent) } else { - w, err = s.openFile(filePath, flags) + w, err = s.openFile(filePath, flags, skipParent) } if err != nil { return err @@ -2235,7 +2258,12 @@ func (s *xlStorage) WriteAll(ctx context.Context, volume string, path string, b s.Unlock() } - return s.writeAll(ctx, volume, path, b, true) + volumeDir, err := s.getVolDir(volume) + if err != nil { + return err + } + + return s.writeAll(ctx, volume, path, b, true, volumeDir) } // AppendFile - append a byte array at path, if file doesn't exist at @@ -2261,7 +2289,7 @@ func (s *xlStorage) AppendFile(ctx context.Context, volume string, path string, var w *os.File // Create file if not found. Not doing O_DIRECT here to avoid the code that does buffer aligned writes. // AppendFile() is only used by healing code to heal objects written in old format. - w, err = s.openFileSync(filePath, os.O_CREATE|os.O_APPEND|os.O_WRONLY) + w, err = s.openFileSync(filePath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, volumeDir) if err != nil { return err } @@ -2425,7 +2453,7 @@ func skipAccessChecks(volume string) (ok bool) { } // RenameData - rename source path to destination path atomically, metadata and data directory. -func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath string, fi FileInfo, dstVolume, dstPath string, opts RenameOptions) (sign uint64, err error) { +func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath string, fi FileInfo, dstVolume, dstPath string, opts RenameOptions) (res RenameDataResp, err error) { defer func() { ignoredErrs := []error{ errFileNotFound, @@ -2451,24 +2479,24 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath string, f srcVolumeDir, err := s.getVolDir(srcVolume) if err != nil { - return 0, err + return res, err } dstVolumeDir, err := s.getVolDir(dstVolume) if err != nil { - return 0, err + return res, err } if !skipAccessChecks(srcVolume) { // Stat a volume entry. if err = Access(srcVolumeDir); err != nil { - return 0, convertAccessError(err, errVolumeAccessDenied) + return res, convertAccessError(err, errVolumeAccessDenied) } } if !skipAccessChecks(dstVolume) { if err = Access(dstVolumeDir); err != nil { - return 0, convertAccessError(err, errVolumeAccessDenied) + return res, convertAccessError(err, errVolumeAccessDenied) } } @@ -2490,13 +2518,17 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath string, f } if err = checkPathLength(srcFilePath); err != nil { - return 0, err + return res, err } if err = checkPathLength(dstFilePath); err != nil { - return 0, err + return res, err } + s.RLock() + formatLegacy := s.formatLegacy + s.RUnlock() + dstBuf, err := xioutil.ReadFile(dstFilePath) if err != nil { // handle situations when dstFilePath is 'file' @@ -2506,20 +2538,22 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath string, f if isSysErrNotDir(err) && runtime.GOOS != globalWindowsOSName { // NOTE: On windows the error happens at // next line and returns appropriate error. - return 0, errFileAccessDenied + return res, errFileAccessDenied } if !osIsNotExist(err) { - return 0, osErrToFileErr(err) + return res, osErrToFileErr(err) } - // errFileNotFound comes here. - err = s.renameLegacyMetadata(dstVolumeDir, dstPath) - if err != nil && err != errFileNotFound { - return 0, err - } - if err == nil { - dstBuf, err = xioutil.ReadFile(dstFilePath) - if err != nil && !osIsNotExist(err) { - return 0, osErrToFileErr(err) + if formatLegacy { + // errFileNotFound comes here. + err = s.renameLegacyMetadata(dstVolumeDir, dstPath) + if err != nil && err != errFileNotFound { + return res, err + } + if err == nil { + dstBuf, err = xioutil.ReadFile(dstFilePath) + if err != nil && !osIsNotExist(err) { + return res, osErrToFileErr(err) + } } } } @@ -2548,9 +2582,6 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath string, f } } } else { - s.RLock() - formatLegacy := s.formatLegacy - s.RUnlock() // It is possible that some drives may not have `xl.meta` file // in such scenarios verify if at least `part.1` files exist // to verify for legacy version. @@ -2562,7 +2593,7 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath string, f currentDataPath := pathJoin(dstVolumeDir, dstPath) entries, err := readDirN(currentDataPath, 1) if err != nil && err != errFileNotFound { - return 0, osErrToFileErr(err) + return res, osErrToFileErr(err) } for _, entry := range entries { if entry == xlStorageFormatFile || strings.HasSuffix(entry, slashSeparator) { @@ -2576,64 +2607,58 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath string, f } } - legacyDataPath := pathJoin(dstVolumeDir, dstPath, legacyDataDir) - if legacyPreserved { - // Preserve all the legacy data, could be slow, but at max there can be 10,000 parts. - currentDataPath := pathJoin(dstVolumeDir, dstPath) - entries, err := readDir(currentDataPath) - if err != nil { - return 0, osErrToFileErr(err) - } - - // legacy data dir means its old content, honor system umask. - if err = mkdirAll(legacyDataPath, 0o777, dstVolumeDir); err != nil { - // any failed mkdir-calls delete them. - s.deleteFile(dstVolumeDir, legacyDataPath, true, false) - return 0, osErrToFileErr(err) - } - - for _, entry := range entries { - // Skip xl.meta renames further, also ignore any directories such as `legacyDataDir` - if entry == xlStorageFormatFile || strings.HasSuffix(entry, slashSeparator) { - continue + var legacyDataPath string + if formatLegacy { + legacyDataPath = pathJoin(dstVolumeDir, dstPath, legacyDataDir) + if legacyPreserved { + // Preserve all the legacy data, could be slow, but at max there can be 1res,000 parts. + currentDataPath := pathJoin(dstVolumeDir, dstPath) + entries, err := readDir(currentDataPath) + if err != nil { + return res, osErrToFileErr(err) } - if err = Rename(pathJoin(currentDataPath, entry), pathJoin(legacyDataPath, entry)); err != nil { - // Any failed rename calls un-roll previous transaction. + // legacy data dir means its old content, honor system umask. + if err = mkdirAll(legacyDataPath, 0o777, dstVolumeDir); err != nil { + // any failed mkdir-calls delete them. s.deleteFile(dstVolumeDir, legacyDataPath, true, false) + return res, osErrToFileErr(err) + } - return 0, osErrToFileErr(err) + for _, entry := range entries { + // Skip xl.meta renames further, also ignore any directories such as `legacyDataDir` + if entry == xlStorageFormatFile || strings.HasSuffix(entry, slashSeparator) { + continue + } + + if err = Rename(pathJoin(currentDataPath, entry), pathJoin(legacyDataPath, entry)); err != nil { + // Any failed rename calls un-roll previous transaction. + s.deleteFile(dstVolumeDir, legacyDataPath, true, false) + + return res, osErrToFileErr(err) + } } } } - var oldDstDataPath, reqVID string + // Set skipParent to skip mkdirAll() calls for deeply nested objects + // - if its an overwrite + // - if its a versioned object + // + // This can potentiall reduce syscalls by strings.Split(path, "/") + // times relative to the object name. + skipParent := dstVolumeDir + if len(dstBuf) > 0 { + skipParent = pathutil.Dir(dstFilePath) + } + var reqVID string if fi.VersionID == "" { reqVID = nullVersionID } else { reqVID = fi.VersionID } - // Replace the data of null version or any other existing version-id - _, ver, err := xlMeta.findVersionStr(reqVID) - if err == nil { - dataDir := ver.getDataDir() - if dataDir != "" && (xlMeta.SharedDataDirCountStr(reqVID, dataDir) == 0) { - // Purge the destination path as we are not preserving anything - // versioned object was not requested. - oldDstDataPath = pathJoin(dstVolumeDir, dstPath, dataDir) - // if old destination path is same as new destination path - // there is nothing to purge, this is true in case of healing - // avoid setting oldDstDataPath at that point. - if oldDstDataPath == dstDataPath { - oldDstDataPath = "" - } else { - xlMeta.data.remove(reqVID, dataDir) - } - } - } - // Empty fi.VersionID indicates that versioning is either // suspended or disabled on this bucket. RenameData will replace // the 'null' version. We add a free-version to track its tiered @@ -2651,74 +2676,115 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath string, f } // indicates if RenameData() is called by healing. - // healing doesn't preserve the dataDir as 'legacy' - healing := fi.XLV1 && fi.DataDir != legacyDataDir + healing := fi.Healing() + + // Replace the data of null version or any other existing version-id + _, ver, err := xlMeta.findVersionStr(reqVID) + if err == nil { + dataDir := ver.getDataDir() + if dataDir != "" && (xlMeta.SharedDataDirCountStr(reqVID, dataDir) == 0) { + // Purge the destination path as we are not preserving anything + // versioned object was not requested. + res.OldDataDir = dataDir + if healing { + // if old destination path is same as new destination path + // there is nothing to purge, this is true in case of healing + // avoid setting OldDataDir at that point. + res.OldDataDir = "" + } else { + xlMeta.data.remove(reqVID, dataDir) + } + } + } if err = xlMeta.AddVersion(fi); err != nil { if legacyPreserved { // Any failed rename calls un-roll previous transaction. s.deleteFile(dstVolumeDir, legacyDataPath, true, false) } - return 0, err + return res, err } - var sbuf bytes.Buffer - for _, ver := range xlMeta.versions { - sbuf.Write(ver.header.Signature[:]) + if len(xlMeta.versions) <= 10 { + // any number of versions beyond this is excessive + // avoid healing such objects in this manner, let + // it heal during the regular scanner cycle. + dst := []byte{} + for _, ver := range xlMeta.versions { + dst = slices.Grow(dst, 16) + copy(dst[len(dst):], ver.header.VersionID[:]) + } + res.Sign = dst } - sign = xxh3.Hash(sbuf.Bytes()) - dstBuf, err = xlMeta.AppendTo(metaDataPoolGet()) - defer metaDataPoolPut(dstBuf) + newDstBuf, err := xlMeta.AppendTo(metaDataPoolGet()) + defer metaDataPoolPut(newDstBuf) if err != nil { if legacyPreserved { s.deleteFile(dstVolumeDir, legacyDataPath, true, false) } - return 0, errFileCorrupt + return res, errFileCorrupt } - if err = s.WriteAll(ctx, srcVolume, pathJoin(srcPath, xlStorageFormatFile), dstBuf); err != nil { + if err = s.WriteAll(ctx, srcVolume, pathJoin(srcPath, xlStorageFormatFile), newDstBuf); err != nil { if legacyPreserved { s.deleteFile(dstVolumeDir, legacyDataPath, true, false) } - return 0, osErrToFileErr(err) + return res, osErrToFileErr(err) } diskHealthCheckOK(ctx, err) - if srcDataPath != "" && len(fi.Data) == 0 && fi.Size > 0 { - // renameAll only for objects that have xl.meta not saved inline. - s.moveToTrash(dstDataPath, true, false) + notInline := srcDataPath != "" && len(fi.Data) == 0 && fi.Size > 0 + if notInline { if healing { + // renameAll only for objects that have xl.meta not saved inline. + // this must be done in healing only, otherwise it is expected + // that for fresh PutObject() call dstDataPath can never exist. + // if its an overwrite then the caller deletes the DataDir + // in a separate RPC call. + s.moveToTrash(dstDataPath, true, false) + // If we are healing we should purge any legacyDataPath content, // that was previously preserved during PutObject() call // on a versioned bucket. s.moveToTrash(legacyDataPath, true, false) } - if err = renameAll(srcDataPath, dstDataPath, dstVolumeDir); err != nil { + if err = renameAll(srcDataPath, dstDataPath, skipParent); err != nil { if legacyPreserved { // Any failed rename calls un-roll previous transaction. s.deleteFile(dstVolumeDir, legacyDataPath, true, false) } + // if its a partial rename() do not attempt to delete recursively. s.deleteFile(dstVolumeDir, dstDataPath, false, false) - return 0, osErrToFileErr(err) + return res, osErrToFileErr(err) } } + // When we are not inlined and there is no oldDataDir present + // we backup existing xl.meta -> xl.meta.bkp - this is done to + // ensure for some reason we didn't get enough quorum we can + // revert this back to original xl.meta and preserve the older dataDir. + if notInline && res.OldDataDir != "" { + // preserve current xl.meta inside the oldDataDir. + if err = s.writeAll(ctx, dstVolume, pathJoin(dstPath, res.OldDataDir, xlStorageFormatFileBackup), dstBuf, true, skipParent); err != nil { + if legacyPreserved { + s.deleteFile(dstVolumeDir, legacyDataPath, true, false) + } + return res, osErrToFileErr(err) + } + diskHealthCheckOK(ctx, err) + } + // Commit meta-file - if err = renameAll(srcFilePath, dstFilePath, dstVolumeDir); err != nil { + if err = renameAll(srcFilePath, dstFilePath, skipParent); err != nil { if legacyPreserved { // Any failed rename calls un-roll previous transaction. s.deleteFile(dstVolumeDir, legacyDataPath, true, false) } + // if its a partial rename() do not attempt to delete recursively. + // this can be healed since all parts are available. s.deleteFile(dstVolumeDir, dstDataPath, false, false) - return 0, osErrToFileErr(err) - } - - // additionally only purge older data at the end of the transaction of new data-dir - // movement, this is to ensure that previous data references can co-exist for - // any recoverability. - if oldDstDataPath != "" { - s.moveToTrash(oldDstDataPath, true, false) + return res, osErrToFileErr(err) } if srcVolume != minioMetaMultipartBucket { @@ -2729,7 +2795,7 @@ func (s *xlStorage) RenameData(ctx context.Context, srcVolume, srcPath string, f } else { s.deleteFile(srcVolumeDir, pathutil.Dir(srcFilePath), true, false) } - return sign, nil + return res, nil } // RenameFile - rename source path to destination path atomically. @@ -3047,8 +3113,11 @@ func (s *xlStorage) CleanAbandonedData(ctx context.Context, volume string, path // Do not abort on context errors. for dir := range foundDirs { toRemove := pathJoin(volumeDir, path, dir+SlashSeparator) - err := s.deleteFile(volumeDir, toRemove, true, true) + err = s.deleteFile(volumeDir, toRemove, true, true) diskHealthCheckOK(ctx, err) + if err != nil { + return err + } } // Do the same for inline data @@ -3056,32 +3125,38 @@ func (s *xlStorage) CleanAbandonedData(ctx context.Context, volume string, path if err != nil { return err } + // Clear and repopulate for k := range foundDirs { delete(foundDirs, k) } + // Populate into map for _, k := range dirs { foundDirs[k] = struct{}{} } + // Delete all directories we expect to be there. for _, dir := range wantDirs { delete(foundDirs, dir) } + // Nothing to delete + if len(foundDirs) == 0 { + return nil + } + // Delete excessive inline entries. - if len(foundDirs) > 0 { - // Convert to slice. - dirs = dirs[:0] - for dir := range foundDirs { - dirs = append(dirs, dir) - } - if xl.data.remove(dirs...) { - newBuf, err := xl.AppendTo(metaDataPoolGet()) - if err == nil { - defer metaDataPoolPut(newBuf) - return s.WriteAll(ctx, volume, pathJoin(path, xlStorageFormatFile), buf) - } + // Convert to slice. + dirs = dirs[:0] + for dir := range foundDirs { + dirs = append(dirs, dir) + } + if xl.data.remove(dirs...) { + newBuf, err := xl.AppendTo(metaDataPoolGet()) + if err == nil { + defer metaDataPoolPut(newBuf) + return s.WriteAll(ctx, volume, pathJoin(path, xlStorageFormatFile), buf) } } return nil diff --git a/docs/bucket/replication/setup_3site_replication.sh b/docs/bucket/replication/setup_3site_replication.sh index aead6288e..c348a63ad 100755 --- a/docs/bucket/replication/setup_3site_replication.sh +++ b/docs/bucket/replication/setup_3site_replication.sh @@ -43,10 +43,7 @@ unset MINIO_KMS_KES_KEY_FILE unset MINIO_KMS_KES_ENDPOINT unset MINIO_KMS_KES_KEY_NAME -( - cd ./docs/debugging/s3-check-md5 - go install -v -) +go install -v github.com/minio/minio/docs/debugging/s3-check-md5@latest wget -q -O mc https://dl.minio.io/client/mc/release/linux-amd64/mc && chmod +x mc diff --git a/docs/distributed/decom-compressed-sse-s3.sh b/docs/distributed/decom-compressed-sse-s3.sh index 2977a798a..48191980f 100755 --- a/docs/distributed/decom-compressed-sse-s3.sh +++ b/docs/distributed/decom-compressed-sse-s3.sh @@ -147,10 +147,7 @@ if [ $ret -ne 0 ]; then exit 1 fi -( - cd ./docs/debugging/s3-check-md5 - go install -v -) +go install -v github.com/minio/minio/docs/debugging/s3-check-md5@latest s3-check-md5 -versions -access-key minioadmin -secret-key minioadmin -endpoint http://127.0.0.1:9001/ -bucket versioned diff --git a/docs/distributed/decom-encrypted-sse-s3.sh b/docs/distributed/decom-encrypted-sse-s3.sh index ffc226178..272278255 100755 --- a/docs/distributed/decom-encrypted-sse-s3.sh +++ b/docs/distributed/decom-encrypted-sse-s3.sh @@ -158,10 +158,7 @@ if [ $ret -ne 0 ]; then exit 1 fi -( - cd ./docs/debugging/s3-check-md5 - go install -v -) +go install -v github.com/minio/minio/docs/debugging/s3-check-md5@latest s3-check-md5 -versions -access-key minioadmin -secret-key minioadmin -endpoint http://127.0.0.1:9001/ -bucket versioned diff --git a/docs/distributed/decom-encrypted.sh b/docs/distributed/decom-encrypted.sh index 0256ffd6d..b1169ceea 100755 --- a/docs/distributed/decom-encrypted.sh +++ b/docs/distributed/decom-encrypted.sh @@ -144,10 +144,7 @@ if [ "${expected_checksum}" != "${got_checksum}" ]; then exit 1 fi -( - cd ./docs/debugging/s3-check-md5 - go install -v -) +go install -v github.com/minio/minio/docs/debugging/s3-check-md5@latest s3-check-md5 -versions -access-key minioadmin -secret-key minioadmin -endpoint http://127.0.0.1:9001/ -bucket versioned diff --git a/docs/distributed/decom.sh b/docs/distributed/decom.sh index 80c2b4e41..ee3182ee5 100755 --- a/docs/distributed/decom.sh +++ b/docs/distributed/decom.sh @@ -210,10 +210,7 @@ if [ "${expected_checksum}" != "${got_checksum}" ]; then exit 1 fi -( - cd ./docs/debugging/s3-check-md5 - go install -v -) +go install -v github.com/minio/minio/docs/debugging/s3-check-md5@latest s3-check-md5 -versions -access-key minioadmin -secret-key minioadmin -endpoint http://127.0.0.1:9001/ -bucket bucket2 s3-check-md5 -versions -access-key minioadmin -secret-key minioadmin -endpoint http://127.0.0.1:9001/ -bucket versioned diff --git a/internal/grid/handlers.go b/internal/grid/handlers.go index 358bb8a2a..0f5411ee1 100644 --- a/internal/grid/handlers.go +++ b/internal/grid/handlers.go @@ -111,6 +111,7 @@ const ( HandlerWriteAll HandlerListBuckets HandlerRenameDataInline + HandlerRenameData2 // Add more above here ^^^ // If all handlers are used, the type of Handler can be changed. @@ -189,6 +190,8 @@ var handlerPrefixes = [handlerLast]string{ HandlerConsoleLog: peerPrefix, HandlerListDir: storagePrefix, HandlerListBuckets: peerPrefixS3, + HandlerRenameDataInline: storagePrefix, + HandlerRenameData2: storagePrefix, } const ( diff --git a/internal/grid/handlers_string.go b/internal/grid/handlers_string.go index fa712990b..07c8d810c 100644 --- a/internal/grid/handlers_string.go +++ b/internal/grid/handlers_string.go @@ -81,14 +81,15 @@ func _() { _ = x[HandlerWriteAll-70] _ = x[HandlerListBuckets-71] _ = x[HandlerRenameDataInline-72] - _ = x[handlerTest-73] - _ = x[handlerTest2-74] - _ = x[handlerLast-75] + _ = x[HandlerRenameData2-73] + _ = x[handlerTest-74] + _ = x[handlerTest2-75] + _ = x[handlerLast-76] } -const _HandlerID_name = "handlerInvalidLockLockLockRLockLockUnlockLockRUnlockLockRefreshLockForceUnlockWalkDirStatVolDiskInfoNSScannerReadXLReadVersionDeleteFileDeleteVersionUpdateMetadataWriteMetadataCheckPartsRenameDataRenameFileReadAllServerVerifyTraceListenDeleteBucketMetadataLoadBucketMetadataReloadSiteReplicationConfigReloadPoolMetaStopRebalanceLoadRebalanceMetaLoadTransitionTierConfigDeletePolicyLoadPolicyLoadPolicyMappingDeleteServiceAccountLoadServiceAccountDeleteUserLoadUserLoadGroupHealBucketMakeBucketHeadBucketDeleteBucketGetMetricsGetResourceMetricsGetMemInfoGetProcInfoGetOSInfoGetPartitionsGetNetInfoGetCPUsServerInfoGetSysConfigGetSysServicesGetSysErrorsGetAllBucketStatsGetBucketStatsGetSRMetricsGetPeerMetricsGetMetacacheListingUpdateMetacacheListingGetPeerBucketMetricsStorageInfoConsoleLogListDirGetLocksBackgroundHealStatusGetLastDayTierStatsSignalServiceGetBandwidthWriteAllListBucketsRenameDataInlinehandlerTesthandlerTest2handlerLast" +const _HandlerID_name = "handlerInvalidLockLockLockRLockLockUnlockLockRUnlockLockRefreshLockForceUnlockWalkDirStatVolDiskInfoNSScannerReadXLReadVersionDeleteFileDeleteVersionUpdateMetadataWriteMetadataCheckPartsRenameDataRenameFileReadAllServerVerifyTraceListenDeleteBucketMetadataLoadBucketMetadataReloadSiteReplicationConfigReloadPoolMetaStopRebalanceLoadRebalanceMetaLoadTransitionTierConfigDeletePolicyLoadPolicyLoadPolicyMappingDeleteServiceAccountLoadServiceAccountDeleteUserLoadUserLoadGroupHealBucketMakeBucketHeadBucketDeleteBucketGetMetricsGetResourceMetricsGetMemInfoGetProcInfoGetOSInfoGetPartitionsGetNetInfoGetCPUsServerInfoGetSysConfigGetSysServicesGetSysErrorsGetAllBucketStatsGetBucketStatsGetSRMetricsGetPeerMetricsGetMetacacheListingUpdateMetacacheListingGetPeerBucketMetricsStorageInfoConsoleLogListDirGetLocksBackgroundHealStatusGetLastDayTierStatsSignalServiceGetBandwidthWriteAllListBucketsRenameDataInlineRenameData2handlerTesthandlerTest2handlerLast" -var _HandlerID_index = [...]uint16{0, 14, 22, 31, 41, 52, 63, 78, 85, 92, 100, 109, 115, 126, 136, 149, 163, 176, 186, 196, 206, 213, 225, 230, 236, 256, 274, 301, 315, 328, 345, 369, 381, 391, 408, 428, 446, 456, 464, 473, 483, 493, 503, 515, 525, 543, 553, 564, 573, 586, 596, 603, 613, 625, 639, 651, 668, 682, 694, 708, 727, 749, 769, 780, 790, 797, 805, 825, 844, 857, 869, 877, 888, 904, 915, 927, 938} +var _HandlerID_index = [...]uint16{0, 14, 22, 31, 41, 52, 63, 78, 85, 92, 100, 109, 115, 126, 136, 149, 163, 176, 186, 196, 206, 213, 225, 230, 236, 256, 274, 301, 315, 328, 345, 369, 381, 391, 408, 428, 446, 456, 464, 473, 483, 493, 503, 515, 525, 543, 553, 564, 573, 586, 596, 603, 613, 625, 639, 651, 668, 682, 694, 708, 727, 749, 769, 780, 790, 797, 805, 825, 844, 857, 869, 877, 888, 904, 915, 926, 938, 949} func (i HandlerID) String() string { if i >= HandlerID(len(_HandlerID_index)-1) {