teleport/lib/service/state.go
Zac Bergquist 072956e4a0
docs: clarify /healthz and /readyz (#11085)
- Rename the page, since it's about diagnostics rather than metrics
  alone
- Change major section headings to H2s so they appear in the table of
  contents
- Move information about heartbeats and recovery to an H3 so it's
  more visible

Updates #10799

Co-authored-by: Paul Gottschling <paul.gottschling@goteleport.com>
2022-03-17 16:46:12 +00:00

171 lines
5 KiB
Go

/*
Copyright 2018 Gravitational, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package service
import (
"fmt"
"sync"
"time"
"github.com/gravitational/teleport"
"github.com/gravitational/teleport/lib/defaults"
"github.com/gravitational/teleport/lib/utils"
"github.com/gravitational/trace"
"github.com/prometheus/client_golang/prometheus"
)
// componentStateEnum identifies the state of a Teleport component (and, by
// aggregation, of the whole process).
type componentStateEnum byte

// The numeric values below are written out explicitly instead of being
// generated with iota: they are exposed through a Prometheus metric, and
// literal values cannot be changed by accident when entries are added or
// reordered.
const (
	// stateOK means Teleport is operating normally.
	stateOK componentStateEnum = 0
	// stateRecovering means Teleport has begun recovering from a degraded
	// state.
	stateRecovering componentStateEnum = 1
	// stateDegraded means some kind of connection error has occurred and put
	// Teleport into a degraded state.
	stateDegraded componentStateEnum = 2
	// stateStarting means the process is starting but hasn't joined the
	// cluster yet.
	stateStarting componentStateEnum = 3
)
// stateGauge is the Prometheus gauge that carries the overall process state
// as the numeric value of a componentStateEnum. Its help text lists the
// meaning of each value.
var stateGauge = prometheus.NewGauge(prometheus.GaugeOpts{
	Name: teleport.MetricState,
	Help: fmt.Sprintf(
		"State of the teleport process: %d - ok, %d - recovering, %d - degraded, %d - starting",
		stateOK, stateRecovering, stateDegraded, stateStarting,
	),
})
func init() {
stateGauge.Set(float64(stateStarting))
}
// processState tracks the state of the Teleport process by recording the
// state of every component that has reported an event.
type processState struct {
// process is the Teleport process being tracked; its logger and clock are
// used while handling events.
process *TeleportProcess
// mu guards states.
mu sync.Mutex
// states maps a component name (the event payload) to that component's
// current state.
states map[string]*componentState
}
// componentState is the tracked state of a single Teleport component.
type componentState struct {
// recoveryTime is set when the component is first registered and again
// when it moves from degraded to recovering; it is the reference point for
// deciding when a recovering component may be considered OK.
recoveryTime time.Time
// state is the component's current state.
state componentStateEnum
}
// newProcessState registers the state gauge with Prometheus and returns an
// empty state tracker for the given Teleport process. If registering the
// collector fails, the wrapped error is returned and no tracker is created.
func newProcessState(process *TeleportProcess) (*processState, error) {
	if err := utils.RegisterPrometheusCollectors(stateGauge); err != nil {
		return nil, trace.Wrap(err)
	}

	tracker := &processState{
		process: process,
		states:  make(map[string]*componentState),
	}
	return tracker, nil
}
// update applies one event to the state of the component named in the event
// payload, then refreshes the process-wide state gauge.
//
// A component that has not been seen before is registered in stateStarting.
// Transitions:
//   - TeleportDegradedEvent: any state -> stateDegraded.
//   - TeleportOKEvent: stateStarting -> stateOK; stateDegraded ->
//     stateRecovering (the recovery timer is restarted); stateRecovering ->
//     stateOK, but only once more than 2*defaults.HeartbeatCheckPeriod has
//     passed since recovery began.
//
// An event whose payload is not a string component name is logged as an
// error and changes no component state.
func (f *processState) update(event Event) {
	f.mu.Lock()
	defer f.mu.Unlock()
	// Deferred calls run last-in-first-out, so the gauge is refreshed while
	// f.mu is still held.
	defer f.updateGauge()

	name, isString := event.Payload.(string)
	if !isString {
		f.process.log.Errorf("%v broadcasted without component name, this is a bug!", event.Name)
		return
	}

	entry, known := f.states[name]
	if !known {
		// First event from this component: register it as starting.
		entry = &componentState{recoveryTime: f.process.Clock.Now(), state: stateStarting}
		f.states[name] = entry
	}

	// A degraded event always wins, regardless of the current state.
	if event.Name == TeleportDegradedEvent {
		entry.state = stateDegraded
		f.process.log.Infof("Detected Teleport component %q is running in a degraded state.", name)
		return
	}

	// Only OK events are handled below; anything else leaves the state as is.
	if event.Name != TeleportOKEvent {
		return
	}

	switch entry.state {
	case stateStarting:
		entry.state = stateOK
		f.process.log.Debugf("Teleport component %q has started.", name)
	case stateDegraded:
		entry.state = stateRecovering
		entry.recoveryTime = f.process.Clock.Now()
		f.process.log.Infof("Teleport component %q is recovering from a degraded state.", name)
	case stateRecovering:
		// Stay in recovering until the recovery window has fully elapsed.
		if f.process.Clock.Since(entry.recoveryTime) > defaults.HeartbeatCheckPeriod*2 {
			entry.state = stateOK
			f.process.log.Infof("Teleport component %q has recovered from a degraded state.", name)
		}
	}
}
// getStateLocked derives the overall process state from the states of the
// individual components. With no components registered it reports
// stateStarting.
//
// Precedence, highest first:
//  1. degraded   - any component is degraded
//  2. recovering - any component is recovering
//  3. starting   - at least one component is not yet OK
//  4. ok         - every registered component is OK
//
// Note: f.mu must be locked by the caller!
func (f *processState) getStateLocked() componentStateEnum {
	okCount := 0
	sawRecovering := false

	for _, component := range f.states {
		switch component.state {
		case stateDegraded:
			// Nothing outranks degraded, so stop looking.
			return stateDegraded
		case stateRecovering:
			sawRecovering = true
		case stateOK:
			okCount++
		}
	}

	total := len(f.states)
	switch {
	case total > 0 && okCount == total:
		// stateOK is reported only when *all* components are OK.
		return stateOK
	case sawRecovering:
		return stateRecovering
	default:
		return stateStarting
	}
}
// updateGauge publishes the current overall process state to the state
// gauge.
//
// Note: f.mu must be locked by the caller!
func (f *processState) updateGauge() {
	current := f.getStateLocked()
	stateGauge.Set(float64(current))
}
// getState returns the current overall state of the process. It takes f.mu
// itself, so callers must not already hold it.
func (f *processState) getState() componentStateEnum {
	f.mu.Lock()
	defer f.mu.Unlock()

	current := f.getStateLocked()
	return current
}