perform healthchecks before initializing everything fully (#19953)

adds more informative logs that provide details on which
erasure set is losing quorum etc.
This commit is contained in:
Harshavardhana 2024-06-19 07:33:40 -07:00 committed by GitHub
parent 9ba39d7fad
commit ee48f9f206
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 49 additions and 8 deletions

View file

@ -2425,6 +2425,7 @@ const (
// HealthOptions carries the caller-supplied options for a Health query.
type HealthOptions struct {
// NOTE(review): not referenced in the visible code; presumably asks
// whether health would survive this node going into maintenance — confirm.
Maintenance bool
// NOTE(review): not referenced in the visible code; presumably names the
// deployment flavour the check should account for — confirm.
DeploymentType string
// Startup marks a health check issued while the server is starting.
// When set, Health logs a missing quorum as "was not established"
// instead of "may be lost".
Startup bool
}
// HealthResult returns the current state of the system, also
@ -2449,6 +2450,24 @@ type HealthResult struct {
UsingDefaults bool
}
// String renders the per-erasure-set entries in hr.ESHealth as a
// comma-separated list of "(Pool: P Set: S Healthy: B)" items, suitable
// for embedding in a log line. It returns the empty string when
// hr.ESHealth has no entries.
func (hr HealthResult) String() string {
	var str strings.Builder
	for i, es := range hr.ESHealth {
		// The separator belongs between entries only: emitting it before
		// every entry except the first avoids both a missing separator
		// after the first entry and a dangling one after the last.
		if i > 0 {
			str.WriteString(", ")
		}
		str.WriteString("(Pool: ")
		str.WriteString(strconv.Itoa(es.PoolID))
		str.WriteString(" Set: ")
		str.WriteString(strconv.Itoa(es.SetID))
		str.WriteString(" Healthy: ")
		str.WriteString(strconv.FormatBool(es.Healthy))
		str.WriteString(")")
	}
	return str.String()
}
// Health - returns current status of the object layer health,
// provides if write access exists across sets, additionally
// can be used to query scenarios if health may be lost
@ -2567,17 +2586,29 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
healthy := erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx]
if !healthy {
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind)
if opts.Startup {
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
fmt.Errorf("Write quorum was not established on pool: %d, set: %d, expected write quorum: %d",
poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind)
} else {
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind)
}
}
result.Healthy = result.Healthy && healthy
healthyRead := erasureSetUpCount[poolIdx][setIdx].online >= poolReadQuorums[poolIdx]
if !healthyRead {
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
fmt.Errorf("Read quorum may be lost on pool: %d, set: %d, expected read quorum: %d",
poolIdx, setIdx, poolReadQuorums[poolIdx]))
if opts.Startup {
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
fmt.Errorf("Read quorum was not established on pool: %d, set: %d, expected read quorum: %d",
poolIdx, setIdx, poolReadQuorums[poolIdx]))
} else {
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
fmt.Errorf("Read quorum may be lost on pool: %d, set: %d, expected read quorum: %d",
poolIdx, setIdx, poolReadQuorums[poolIdx]))
}
}
result.HealthyRead = result.HealthyRead && healthyRead
}

View file

@ -740,6 +740,8 @@ func initializeLogRotate(ctx *cli.Context) (io.WriteCloser, error) {
// serverMain handler called for 'minio server' command.
func serverMain(ctx *cli.Context) {
r := rand.New(rand.NewSource(time.Now().UnixNano()))
var warnings []string
signal.Notify(globalOSSignalCh, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT)
@ -920,6 +922,16 @@ func serverMain(ctx *cli.Context) {
globalNodeNamesHex[hex.EncodeToString(nodeNameSum[:])] = struct{}{}
}
bootstrapTrace("waitForQuorum", func() {
result := newObject.Health(context.Background(), HealthOptions{Startup: true})
for !result.Healthy {
d := time.Duration(r.Float64() * float64(time.Second))
logger.Info("Waiting for quorum healthcheck to succeed.. possible cause unhealthy sets (%s), retrying in %s", result, d)
time.Sleep(d)
result = newObject.Health(context.Background(), HealthOptions{})
}
})
var err error
bootstrapTrace("initServerConfig", func() {
if err = initServerConfig(GlobalContext, newObject); err != nil {
@ -986,8 +998,6 @@ func serverMain(ctx *cli.Context) {
}()
go func() {
r := rand.New(rand.NewSource(time.Now().UnixNano()))
if !globalDisableFreezeOnBoot {
defer bootstrapTrace("unfreezeServices", unfreezeServices)
t := time.AfterFunc(5*time.Minute, func() {