converge clusterRead health into cluster health (#19063)

Harshavardhana 2024-02-15 16:48:36 -08:00 committed by GitHub
parent 68dde2359f
commit 607cafadbc
9 changed files with 69 additions and 149 deletions

View file

@@ -2280,55 +2280,23 @@ type HealthOptions struct {
// was queried
type HealthResult struct {
Healthy bool
HealthyRead bool
HealingDrives int
ESHealth []struct {
Maintenance bool
PoolID, SetID int
Healthy bool
HealthyRead bool
HealthyDrives int
HealingDrives int
ReadQuorum int
WriteQuorum int
}
WriteQuorum int
ReadQuorum int
UsingDefaults bool
}
// ReadHealth returns if the cluster can serve read requests
func (z *erasureServerPools) ReadHealth(ctx context.Context) bool {
erasureSetUpCount := make([][]int, len(z.serverPools))
for i := range z.serverPools {
erasureSetUpCount[i] = make([]int, len(z.serverPools[i].sets))
}
diskIDs := globalNotificationSys.GetLocalDiskIDs(ctx)
diskIDs = append(diskIDs, getLocalDiskIDs(z))
for _, localDiskIDs := range diskIDs {
for _, id := range localDiskIDs {
poolIdx, setIdx, _, err := z.getPoolAndSet(id)
if err != nil {
logger.LogIf(ctx, err)
continue
}
erasureSetUpCount[poolIdx][setIdx]++
}
}
b := z.BackendInfo()
poolReadQuorums := make([]int, len(b.StandardSCData))
copy(poolReadQuorums, b.StandardSCData)
for poolIdx := range erasureSetUpCount {
for setIdx := range erasureSetUpCount[poolIdx] {
if erasureSetUpCount[poolIdx][setIdx] < poolReadQuorums[poolIdx] {
return false
}
}
}
return true
}
// Health - returns current status of the object layer health,
// provides if write access exists across sets, additionally
// can be used to query scenarios if health may be lost
@@ -2397,9 +2365,20 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
}
}
var maximumReadQuorum int
for _, readQuorum := range poolReadQuorums {
if maximumReadQuorum == 0 {
maximumReadQuorum = readQuorum
}
if readQuorum > maximumReadQuorum {
maximumReadQuorum = readQuorum
}
}
result := HealthResult{
Healthy: true,
WriteQuorum: maximumWriteQuorum,
ReadQuorum: maximumReadQuorum,
UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
}
@@ -2409,6 +2388,7 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
Maintenance bool
PoolID, SetID int
Healthy bool
HealthyRead bool
HealthyDrives, HealingDrives int
ReadQuorum, WriteQuorum int
}{
@@ -2416,6 +2396,7 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
SetID: setIdx,
PoolID: poolIdx,
Healthy: erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx],
HealthyRead: erasureSetUpCount[poolIdx][setIdx].online >= poolReadQuorums[poolIdx],
HealthyDrives: erasureSetUpCount[poolIdx][setIdx].online,
HealingDrives: erasureSetUpCount[poolIdx][setIdx].healing,
ReadQuorum: poolReadQuorums[poolIdx],
@@ -2428,6 +2409,12 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
poolIdx, setIdx, poolWriteQuorums[poolIdx]))
}
result.HealthyRead = erasureSetUpCount[poolIdx][setIdx].online >= poolReadQuorums[poolIdx]
if !result.HealthyRead {
logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
fmt.Errorf("Read quorum may be lost on pool: %d, set: %d, expected read quorum: %d",
poolIdx, setIdx, poolReadQuorums[poolIdx]))
}
}
}
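
With ReadHealth folded into Health, a single call now reports both read and write quorum state, cluster-wide and per erasure set. A minimal sketch of how a caller in the cmd package might consume the converged result (the function name and the fmt-based logging are illustrative, not part of this commit; assumes the standard context and fmt packages):

func logReadHealth(ctx context.Context, objAPI ObjectLayer) {
	h := objAPI.Health(ctx, HealthOptions{})
	if !h.HealthyRead {
		fmt.Printf("cluster read quorum may be lost, expected read quorum: %d\n", h.ReadQuorum)
	}
	for _, es := range h.ESHealth {
		if !es.HealthyRead {
			fmt.Printf("pool %d, set %d: %d drives online, read quorum: %d\n",
				es.PoolID, es.SetID, es.HealthyDrives, es.ReadQuorum)
		}
	}
}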

View file

@@ -81,12 +81,28 @@ func ClusterReadCheckHandler(w http.ResponseWriter, r *http.Request) {
ctx, cancel := context.WithTimeout(ctx, globalAPIConfig.getClusterDeadline())
defer cancel()
result := objLayer.ReadHealth(ctx)
if !result {
writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
opts := HealthOptions{
Maintenance: r.Form.Get("maintenance") == "true",
DeploymentType: r.Form.Get("deployment-type"),
}
result := objLayer.Health(ctx, opts)
w.Header().Set(xhttp.MinIOReadQuorum, strconv.Itoa(result.ReadQuorum))
w.Header().Set(xhttp.MinIOStorageClassDefaults, strconv.FormatBool(result.UsingDefaults))
// return how many drives are being healed if any
if result.HealingDrives > 0 {
w.Header().Set(xhttp.MinIOHealingDrives, strconv.Itoa(result.HealingDrives))
}
if !result.HealthyRead {
// As a maintenance call we are purposefully asked to be taken
// down, this is for orchestrators to know if we can safely
// take this server down, return appropriate error.
if opts.Maintenance {
writeResponse(w, http.StatusPreconditionFailed, nil, mimeNone)
} else {
writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
}
return
}
writeResponse(w, http.StatusOK, nil, mimeNone)
}
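
For reference, a hedged client-side view of the endpoint this handler backs. The path /minio/health/cluster/read and the maintenance query parameter follow MinIO's documented healthcheck API; the host, port, and printed output are illustrative:

package main

import (
	"fmt"
	"net/http"
)

func main() {
	// Plain read-health probe; append ?maintenance=true to ask whether this
	// node can safely be taken down, in which case a lost read quorum
	// comes back as 412 instead of 503.
	resp, err := http.Get("http://localhost:9000/minio/health/cluster/read")
	if err != nil {
		fmt.Println("probe failed:", err)
		return
	}
	defer resp.Body.Close()

	fmt.Println("read quorum:", resp.Header.Get("x-minio-read-quorum"))
	fmt.Println("storage-class defaults in use:", resp.Header.Get("x-minio-storage-class-defaults"))

	switch resp.StatusCode {
	case http.StatusOK:
		fmt.Println("cluster can serve reads")
	case http.StatusServiceUnavailable:
		fmt.Println("read quorum not available")
	case http.StatusPreconditionFailed:
		fmt.Println("maintenance probe: taking this node down would lose read quorum")
	}
}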

View file

@@ -1104,24 +1104,6 @@ func (sys *NotificationSys) ServerInfo(metrics bool) []madmin.ServerProperties {
return reply
}
// GetLocalDiskIDs - return disk ids of the local disks of the peers.
func (sys *NotificationSys) GetLocalDiskIDs(ctx context.Context) (localDiskIDs [][]string) {
localDiskIDs = make([][]string, len(sys.peerClients))
var wg sync.WaitGroup
for idx, client := range sys.peerClients {
if client == nil {
continue
}
wg.Add(1)
go func(idx int, client *peerRESTClient) {
defer wg.Done()
localDiskIDs[idx] = client.GetLocalDiskIDs(ctx)
}(idx, client)
}
wg.Wait()
return localDiskIDs
}
// returns all the peers that are currently online.
func (sys *NotificationSys) getOnlinePeers() []*peerRESTClient {
var peerClients []*peerRESTClient

View file

@@ -287,7 +287,6 @@ type ObjectLayer interface {
// Returns health of the backend
Health(ctx context.Context, opts HealthOptions) HealthResult
ReadHealth(ctx context.Context) bool
// Metadata operations
PutObjectMetadata(context.Context, string, string, ObjectOptions) (ObjectInfo, error)
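
With ReadHealth dropped from the interface, a caller that previously used objAPI.ReadHealth(ctx) would presumably switch to Health and check HealthyRead; a one-line sketch under that assumption (the helper name is made up for illustration):

// Hypothetical replacement for a former objAPI.ReadHealth(ctx) call.
func canServeReads(ctx context.Context, objAPI ObjectLayer) bool {
	return objAPI.Health(ctx, HealthOptions{}).HealthyRead
}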

View file

@@ -601,21 +601,6 @@ func (client *peerRESTClient) BackgroundHealStatus() (madmin.BgHealState, error)
return state, err
}
// GetLocalDiskIDs - get a peer's local disks' IDs.
func (client *peerRESTClient) GetLocalDiskIDs(ctx context.Context) (diskIDs []string) {
conn := client.gridConn()
if conn == nil {
return
}
resp, err := getLocalDiskIDsHandler.Call(ctx, conn, grid.NewMSS())
if err != nil {
return
}
return resp.IDs
}
// GetMetacacheListing - get a new or existing metacache.
func (client *peerRESTClient) GetMetacacheListing(ctx context.Context, o listPathOptions) (*metacache, error) {
if client == nil {

View file

@@ -715,59 +715,11 @@ func (s *peerRESTServer) PutBucketNotificationHandler(w http.ResponseWriter, r *
globalEventNotifier.AddRulesMap(bucketName, rulesMap)
}
// Return disk IDs of all the local disks.
func getLocalDiskIDs(z *erasureServerPools) []string {
var ids []string
for poolIdx := range z.serverPools {
for _, set := range z.serverPools[poolIdx].sets {
disks := set.getDisks()
for _, disk := range disks {
if disk == nil {
continue
}
if disk.IsLocal() {
id, err := disk.GetDiskID()
if err != nil {
continue
}
if id == "" {
continue
}
ids = append(ids, id)
}
}
}
}
return ids
}
// HealthHandler - returns true if healthy
func (s *peerRESTServer) HealthHandler(w http.ResponseWriter, r *http.Request) {
s.IsValid(w, r)
}
var getLocalDiskIDsHandler = grid.NewSingleHandler[*grid.MSS, *LocalDiskIDs](grid.HandlerGetLocalDiskIDs, grid.NewMSS, func() *LocalDiskIDs {
return &LocalDiskIDs{}
})
// GetLocalDiskIDs - Return disk IDs of all the local disks.
func (s *peerRESTServer) GetLocalDiskIDs(mss *grid.MSS) (*LocalDiskIDs, *grid.RemoteErr) {
objLayer := newObjectLayerFn()
// Service not initialized yet
if objLayer == nil {
return nil, grid.NewRemoteErr(errServerNotInitialized)
}
z, ok := objLayer.(*erasureServerPools)
if !ok {
return nil, grid.NewRemoteErr(errServerNotInitialized)
}
return &LocalDiskIDs{IDs: getLocalDiskIDs(z)}, nil
}
// VerifyBinary - verifies the downloaded binary is intact
func (s *peerRESTServer) VerifyBinaryHandler(w http.ResponseWriter, r *http.Request) {
if !s.IsValid(w, r) {
@@ -1591,7 +1543,6 @@ func registerPeerRESTHandlers(router *mux.Router, gm *grid.Manager) {
logger.FatalIf(reloadSiteReplicationConfigHandler.Register(gm, server.ReloadSiteReplicationConfigHandler), "unable to register handler")
logger.FatalIf(loadBucketMetadataHandler.Register(gm, server.LoadBucketMetadataHandler), "unable to register handler")
logger.FatalIf(deleteBucketMetadataHandler.Register(gm, server.DeleteBucketMetadataHandler), "unable to register handler")
logger.FatalIf(getLocalDiskIDsHandler.Register(gm, server.GetLocalDiskIDs), "unable to register handler")
logger.FatalIf(listenHandler.RegisterNoInput(gm, server.ListenHandler), "unable to register handler")
logger.FatalIf(gm.RegisterStreamingHandler(grid.HandlerTrace, grid.StreamHandler{
Handle: server.TraceHandler,

View file

@@ -62,7 +62,6 @@ const (
HandlerServerVerify
HandlerTrace
HandlerListen
HandlerGetLocalDiskIDs
HandlerDeleteBucketMetadata
HandlerLoadBucketMetadata
HandlerReloadSiteReplicationConfig
@@ -119,7 +118,6 @@ var handlerPrefixes = [handlerLast]string{
HandlerServerVerify: bootstrapPrefix,
HandlerTrace: peerPrefix,
HandlerListen: peerPrefix,
HandlerGetLocalDiskIDs: peerPrefix,
HandlerDeleteBucketMetadata: peerPrefix,
HandlerLoadBucketMetadata: peerPrefix,
HandlerReloadSiteReplicationConfig: peerPrefix,

View file

@@ -32,34 +32,33 @@ func _() {
_ = x[HandlerServerVerify-21]
_ = x[HandlerTrace-22]
_ = x[HandlerListen-23]
_ = x[HandlerGetLocalDiskIDs-24]
_ = x[HandlerDeleteBucketMetadata-25]
_ = x[HandlerLoadBucketMetadata-26]
_ = x[HandlerReloadSiteReplicationConfig-27]
_ = x[HandlerReloadPoolMeta-28]
_ = x[HandlerStopRebalance-29]
_ = x[HandlerLoadRebalanceMeta-30]
_ = x[HandlerLoadTransitionTierConfig-31]
_ = x[HandlerDeletePolicy-32]
_ = x[HandlerLoadPolicy-33]
_ = x[HandlerLoadPolicyMapping-34]
_ = x[HandlerDeleteServiceAccount-35]
_ = x[HandlerLoadServiceAccount-36]
_ = x[HandlerDeleteUser-37]
_ = x[HandlerLoadUser-38]
_ = x[HandlerLoadGroup-39]
_ = x[HandlerHealBucket-40]
_ = x[HandlerMakeBucket-41]
_ = x[HandlerHeadBucket-42]
_ = x[HandlerDeleteBucket-43]
_ = x[handlerTest-44]
_ = x[handlerTest2-45]
_ = x[handlerLast-46]
_ = x[HandlerDeleteBucketMetadata-24]
_ = x[HandlerLoadBucketMetadata-25]
_ = x[HandlerReloadSiteReplicationConfig-26]
_ = x[HandlerReloadPoolMeta-27]
_ = x[HandlerStopRebalance-28]
_ = x[HandlerLoadRebalanceMeta-29]
_ = x[HandlerLoadTransitionTierConfig-30]
_ = x[HandlerDeletePolicy-31]
_ = x[HandlerLoadPolicy-32]
_ = x[HandlerLoadPolicyMapping-33]
_ = x[HandlerDeleteServiceAccount-34]
_ = x[HandlerLoadServiceAccount-35]
_ = x[HandlerDeleteUser-36]
_ = x[HandlerLoadUser-37]
_ = x[HandlerLoadGroup-38]
_ = x[HandlerHealBucket-39]
_ = x[HandlerMakeBucket-40]
_ = x[HandlerHeadBucket-41]
_ = x[HandlerDeleteBucket-42]
_ = x[handlerTest-43]
_ = x[handlerTest2-44]
_ = x[handlerLast-45]
}
const _HandlerID_name = "handlerInvalidLockLockLockRLockLockUnlockLockRUnlockLockRefreshLockForceUnlockWalkDirStatVolDiskInfoNSScannerReadXLReadVersionDeleteFileDeleteVersionUpdateMetadataWriteMetadataCheckPartsRenameDataRenameFileReadAllServerVerifyTraceListenGetLocalDiskIDsDeleteBucketMetadataLoadBucketMetadataReloadSiteReplicationConfigReloadPoolMetaStopRebalanceLoadRebalanceMetaLoadTransitionTierConfigDeletePolicyLoadPolicyLoadPolicyMappingDeleteServiceAccountLoadServiceAccountDeleteUserLoadUserLoadGroupHealBucketMakeBucketHeadBucketDeleteBuckethandlerTesthandlerTest2handlerLast"
const _HandlerID_name = "handlerInvalidLockLockLockRLockLockUnlockLockRUnlockLockRefreshLockForceUnlockWalkDirStatVolDiskInfoNSScannerReadXLReadVersionDeleteFileDeleteVersionUpdateMetadataWriteMetadataCheckPartsRenameDataRenameFileReadAllServerVerifyTraceListenDeleteBucketMetadataLoadBucketMetadataReloadSiteReplicationConfigReloadPoolMetaStopRebalanceLoadRebalanceMetaLoadTransitionTierConfigDeletePolicyLoadPolicyLoadPolicyMappingDeleteServiceAccountLoadServiceAccountDeleteUserLoadUserLoadGroupHealBucketMakeBucketHeadBucketDeleteBuckethandlerTesthandlerTest2handlerLast"
var _HandlerID_index = [...]uint16{0, 14, 22, 31, 41, 52, 63, 78, 85, 92, 100, 109, 115, 126, 136, 149, 163, 176, 186, 196, 206, 213, 225, 230, 236, 251, 271, 289, 316, 330, 343, 360, 384, 396, 406, 423, 443, 461, 471, 479, 488, 498, 508, 518, 530, 541, 553, 564}
var _HandlerID_index = [...]uint16{0, 14, 22, 31, 41, 52, 63, 78, 85, 92, 100, 109, 115, 126, 136, 149, 163, 176, 186, 196, 206, 213, 225, 230, 236, 256, 274, 301, 315, 328, 345, 369, 381, 391, 408, 428, 446, 456, 464, 473, 483, 493, 503, 515, 526, 538, 549}
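
The _HandlerID_name string and _HandlerID_index table above (shown in their before and after forms) are stringer output: every handler name is concatenated into one string, and the index table records where each name starts. Removing "GetLocalDiskIDs" (15 bytes) is why every later offset drops by 15. A hedged sketch of the lookup, mirroring the generated String method that follows:

// Illustrative only: how the generated tables resolve a handler name.
func handlerName(i HandlerID) string {
	return _HandlerID_name[_HandlerID_index[i]:_HandlerID_index[i+1]]
}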
func (i HandlerID) String() string {
if i >= HandlerID(len(_HandlerID_index)-1) {

View file

@@ -192,6 +192,9 @@ const (
// Writes expected write quorum
MinIOWriteQuorum = "x-minio-write-quorum"
// Reads expected read quorum
MinIOReadQuorum = "x-minio-read-quorum"
// Indicates if we are using the default storage class and there was a problem loading the config
// if this header is set to "true"
MinIOStorageClassDefaults = "x-minio-storage-class-defaults"
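
On the client side, these headers can be decoded back into typed values with the same strconv conventions the handlers use to set them; a small illustrative helper (the function name is an assumption, and it expects the net/http and strconv imports):

// Illustrative: decode the quorum headers from a health endpoint response.
func decodeQuorumHeaders(resp *http.Response) (readQuorum, writeQuorum int, usingDefaults bool) {
	readQuorum, _ = strconv.Atoi(resp.Header.Get("x-minio-read-quorum"))
	writeQuorum, _ = strconv.Atoi(resp.Header.Get("x-minio-write-quorum"))
	usingDefaults, _ = strconv.ParseBool(resp.Header.Get("x-minio-storage-class-defaults"))
	return readQuorum, writeQuorum, usingDefaults
}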