minio/cmd/mrf.go

// Copyright (c) 2015-2024 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

//go:generate msgp -file=$GOFILE

package cmd

import (
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"sync"
	"sync/atomic"
	"time"

	"github.com/google/uuid"
	"github.com/minio/madmin-go/v3"
	"github.com/minio/pkg/v3/wildcard"
	"github.com/tinylib/msgp/msgp"
)

const (
	mrfOpsQueueSize = 100000
)

const (
	healDir              = ".heal"
	healMRFDir           = bucketMetaPrefix + SlashSeparator + healDir + SlashSeparator + "mrf"
	healMRFMetaFormat    = 1
	healMRFMetaVersionV1 = 1
)
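
// The MRF queue is persisted across restarts as a small binary file stored
// under minioMetaBucket at pathJoin(healMRFDir, "list.bin"). Its layout, as
// written by shutdown() and read back by startMRFPersistence() below, is:
//
//	[2 bytes] little-endian uint16: healMRFMetaFormat
//	[2 bytes] little-endian uint16: healMRFMetaVersionV1
//	[rest]    a stream of msgp-encoded PartialOperation records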

// PartialOperation is a successful upload/delete of an object that achieved
// quorum but was not written to all disks.
type PartialOperation struct {
	Bucket    string
	Object    string
	VersionID string
	// Versions, when set, holds concatenated 16-byte version UUIDs
	// (see healRoutine, which heals them one by one).
	Versions            []byte
	SetIndex, PoolIndex int
	Queued              time.Time
	BitrotScan          bool
}

// mrfState encapsulates all the information
// related to the global background MRF.
type mrfState struct {
	opCh chan PartialOperation

	closed  int32
	closing int32
	wg      sync.WaitGroup
}

func newMRFState() mrfState {
	return mrfState{
		opCh: make(chan PartialOperation, mrfOpsQueueSize),
	}
}
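
// A rough lifecycle sketch. Only the methods below are defined in this file;
// how (and in which order) the rest of the server wires them up is assumed
// here for illustration:
//
//	mrf := newMRFState()
//	mrf.startMRFPersistence()  // re-queue entries saved by a previous shutdown
//	go mrf.healRoutine(pools)  // drain opCh and issue heal requests
//	...
//	mrf.addPartialOp(PartialOperation{Bucket: bucket, Object: object, Queued: time.Now()})
//	...
//	mrf.shutdown()             // stop accepting ops and persist whatever is left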

// Add a partial S3 operation (put/delete) when one or more disks are offline.
func (m *mrfState) addPartialOp(op PartialOperation) {
	if m == nil {
		return
	}

	if atomic.LoadInt32(&m.closed) == 1 {
		return
	}

	m.wg.Add(1)
	defer m.wg.Done()

	if atomic.LoadInt32(&m.closing) == 1 {
		return
	}

	select {
	case m.opCh <- op:
	default:
		// Queue is full; drop the operation rather than block the caller.
	}
}
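
// Note on the shutdown handshake, as implemented by addPartialOp above and
// shutdown below: a sender registers with m.wg before re-checking m.closing,
// while shutdown sets closing first, waits on wg, and only then closes opCh.
// This ordering is what keeps addPartialOp from sending on a closed channel.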

// shutdown stops accepting new MRF operations and persists the pending heal
// queue to the first available local drive.
func (m *mrfState) shutdown() {
	atomic.StoreInt32(&m.closing, 1)
	m.wg.Wait()
	close(m.opCh)
	atomic.StoreInt32(&m.closed, 1)

	if len(m.opCh) > 0 {
		healingLogEvent(context.Background(), "Saving MRF healing data (%d entries)", len(m.opCh))
	}

	newReader := func() io.ReadCloser {
		r, w := io.Pipe()
		go func() {
			// Initialize MRF meta header.
			var data [4]byte
			binary.LittleEndian.PutUint16(data[0:2], healMRFMetaFormat)
			binary.LittleEndian.PutUint16(data[2:4], healMRFMetaVersionV1)
			mw := msgp.NewWriter(w)
			n, err := mw.Write(data[:])
			if err != nil {
				w.CloseWithError(err)
				return
			}
			if n != len(data) {
				w.CloseWithError(io.ErrShortWrite)
				return
			}
			for item := range m.opCh {
				err = item.EncodeMsg(mw)
				if err != nil {
					break
				}
			}
			mw.Flush()
			w.CloseWithError(err)
		}()
		return r
	}

	globalLocalDrivesMu.RLock()
	localDrives := cloneDrives(globalLocalDrivesMap)
	globalLocalDrivesMu.RUnlock()

	for _, localDrive := range localDrives {
		r := newReader()
		err := localDrive.CreateFile(context.Background(), "", minioMetaBucket, pathJoin(healMRFDir, "list.bin"), -1, r)
		r.Close()
		if err == nil {
			// Persisted successfully on one drive; that is enough.
			break
		}
	}
}
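
// startMRFPersistence restores the MRF queue saved by a previous shutdown: it
// reads pathJoin(healMRFDir, "list.bin") from the first local drive that has
// it, re-queues every decoded PartialOperation on opCh, and deletes the file
// once the entries have been handed off.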
func (m *mrfState) startMRFPersistence() {
	loadMRF := func(rc io.ReadCloser, opCh chan PartialOperation) error {
		defer rc.Close()

		var data [4]byte
		n, err := rc.Read(data[:])
		if err != nil {
			return err
		}
		if n != len(data) {
			return errors.New("heal mrf: no data")
		}

		// Read the MRF meta header.
		switch binary.LittleEndian.Uint16(data[0:2]) {
		case healMRFMetaFormat:
		default:
			return fmt.Errorf("heal mrf: unknown format: %d", binary.LittleEndian.Uint16(data[0:2]))
		}
		switch binary.LittleEndian.Uint16(data[2:4]) {
		case healMRFMetaVersionV1:
		default:
			return fmt.Errorf("heal mrf: unknown version: %d", binary.LittleEndian.Uint16(data[2:4]))
		}

		mr := msgp.NewReader(rc)
		for {
			op := PartialOperation{}
			err = op.DecodeMsg(mr)
			if err != nil {
				break
			}
			opCh <- op
		}
		return nil
	}

	globalLocalDrivesMu.RLock()
	localDrives := cloneDrives(globalLocalDrivesMap)
	globalLocalDrivesMu.RUnlock()

	for _, localDrive := range localDrives {
		if localDrive == nil {
			continue
		}
		rc, err := localDrive.ReadFileStream(context.Background(), minioMetaBucket, pathJoin(healMRFDir, "list.bin"), 0, -1)
		if err != nil {
			continue
		}
		err = loadMRF(rc, m.opCh)
		if err != nil {
			continue
		}
		// Finally, delete the file after processing the MRF entries.
		localDrive.Delete(GlobalContext, minioMetaBucket, pathJoin(healMRFDir, "list.bin"), DeleteOptions{})
		break
	}
}
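
// healSleeper paces the MRF heal loop below: healRoutine arms a timer before
// each heal and waits on it afterwards, so queued heals trickle out instead
// of hitting the drives all at once.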
var healSleeper = newDynamicSleeper(5, time.Second, false)

// healRoutine listens for new disk reconnection events and issues healing
// requests for queued objects belonging to the corresponding erasure set.
func (m *mrfState) healRoutine(z *erasureServerPools) {
	for {
		select {
		case <-GlobalContext.Done():
			return
		case u, ok := <-m.opCh:
			if !ok {
				return
			}

			// We might land at .metacache, .trash, .multipart;
			// there is no need to heal them, so skip. This only
			// applies when the bucket is '.minio.sys'.
			if u.Bucket == minioMetaBucket {
				// No MRF needed for temporary objects.
				if wildcard.Match("buckets/*/.metacache/*", u.Object) {
					continue
				}
				if wildcard.Match("tmp/*", u.Object) {
					continue
				}
				if wildcard.Match("multipart/*", u.Object) {
					continue
				}
				if wildcard.Match("tmp-old/*", u.Object) {
					continue
				}
			}

			now := time.Now()
			if now.Sub(u.Queued) < time.Second {
				// Let recently failed networks reconnect by making
				// MRF wait for 1s before retrying, i.e. roughly 4
				// reconnect attempts.
				time.Sleep(time.Second)
			}

			// Wait on the pacing timer for each heal.
			wait := healSleeper.Timer(context.Background())

			scan := madmin.HealNormalScan
			if u.BitrotScan {
				scan = madmin.HealDeepScan
			}

			if u.Object == "" {
				healBucket(u.Bucket, scan)
			} else {
				if len(u.Versions) > 0 {
					vers := len(u.Versions) / 16
					if vers > 0 {
						for i := 0; i < vers; i++ {
							healObject(u.Bucket, u.Object, uuid.UUID(u.Versions[16*i:]).String(), scan)
						}
					}
				} else {
					healObject(u.Bucket, u.Object, u.VersionID, scan)
				}
			}

			wait()
		}
	}
}