teleport/lib/service/state.go

171 lines
5 KiB
Go
Raw Normal View History

2018-10-26 22:20:02 +00:00
/*
Copyright 2018 Gravitational, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package service
import (
"fmt"
"sync"
2018-10-26 22:20:02 +00:00
"time"
"github.com/gravitational/teleport"
2018-10-26 22:20:02 +00:00
"github.com/gravitational/teleport/lib/defaults"
"github.com/gravitational/teleport/lib/utils"
"github.com/gravitational/trace"
"github.com/prometheus/client_golang/prometheus"
2018-10-26 22:20:02 +00:00
)
// componentStateEnum enumerates the health states that a single Teleport
// component, and the process as a whole, can be in.
type componentStateEnum byte

// The numeric values are written out explicitly rather than generated with
// iota because they are exposed through a Prometheus metric; with iota it
// would be easy to renumber them by accident.
const (
	// stateOK means Teleport is operating normally.
	stateOK componentStateEnum = 0
	// stateRecovering means Teleport has begun recovering from a degraded
	// state.
	stateRecovering componentStateEnum = 1
	// stateDegraded means some kind of connection error has occurred and put
	// Teleport into a degraded state.
	stateDegraded componentStateEnum = 2
	// stateStarting means the process is starting but hasn't joined the
	// cluster yet.
	stateStarting componentStateEnum = 3
)
// stateGauge is the Prometheus gauge that publishes the overall process
// state as the numeric value of a componentStateEnum. The help text lists the
// meaning of every value.
var stateGauge = func() prometheus.Gauge {
	help := fmt.Sprintf(
		"State of the teleport process: %d - ok, %d - recovering, %d - degraded, %d - starting",
		stateOK, stateRecovering, stateDegraded, stateStarting,
	)
	return prometheus.NewGauge(prometheus.GaugeOpts{
		Name: teleport.MetricState,
		Help: help,
	})
}()
func init() {
stateGauge.Set(float64(stateStarting))
}
2018-10-26 22:20:02 +00:00
// processState tracks the state of the Teleport process by recording the
// state of each component that has reported an event.
type processState struct {
	// process is the owning Teleport process; its logger and clock are used
	// when handling events.
	process *TeleportProcess

	// mu guards states.
	mu sync.Mutex
	// states holds the last known state of every component, keyed by the
	// component name carried in the event payload.
	states map[string]*componentState
}
type componentState struct {
2018-10-26 22:20:02 +00:00
recoveryTime time.Time
state componentStateEnum
2018-10-26 22:20:02 +00:00
}
// newProcessState returns a new FSM that tracks the state of the Teleport process.
func newProcessState(process *TeleportProcess) (*processState, error) {
err := utils.RegisterPrometheusCollectors(stateGauge)
if err != nil {
return nil, trace.Wrap(err)
}
2018-10-26 22:20:02 +00:00
return &processState{
process: process,
states: make(map[string]*componentState),
}, nil
2018-10-26 22:20:02 +00:00
}
// update the state of a Teleport component.
func (f *processState) update(event Event) {
f.mu.Lock()
defer f.mu.Unlock()
defer f.updateGauge()
component, ok := event.Payload.(string)
if !ok {
f.process.log.Errorf("%v broadcasted without component name, this is a bug!", event.Name)
return
}
s, ok := f.states[component]
if !ok {
// Register a new component.
s = &componentState{recoveryTime: f.process.Clock.Now(), state: stateStarting}
f.states[component] = s
}
2018-10-26 22:20:02 +00:00
switch event.Name {
// If a degraded event was received, always change the state to degraded.
case TeleportDegradedEvent:
s.state = stateDegraded
f.process.log.Infof("Detected Teleport component %q is running in a degraded state.", component)
2018-10-26 22:20:02 +00:00
// If the current state is degraded, and a OK event has been
// received, change the state to recovering. If the current state is
// recovering and a OK events is received, if it's been longer
Events and GRPC API This commit introduces several key changes to Teleport backend and API infrastructure in order to achieve scalability improvements on 10K+ node deployments. Events and plain keyspace -------------------------- New backend interface supports events, pagination and range queries and moves away from buckets to plain keyspace, what better aligns with DynamoDB and Etcd featuring similar interfaces. All backend implementations are exposing Events API, allowing multiple subscribers to consume the same event stream and avoid polling database. Replacing BoltDB, Dir with SQLite ------------------------------- BoltDB backend does not support having two processes access the database at the same time. This prevented Teleport using BoltDB backend to be live reloaded. SQLite supports reads/writes by multiple processes and makes Dir backend obsolete as SQLite is more efficient on larger collections, supports transactions and can detect data corruption. Teleport automatically migrates data from Bolt and Dir backends into SQLite. GRPC API and protobuf resources ------------------------------- GRPC API has been introduced for the auth server. The auth server now serves both GRPC and JSON-HTTP API on the same TLS socket and uses the same client certificate authentication. All future API methods should use GRPC and HTTP-JSON API is considered obsolete. In addition to that some resources like Server and CertificateAuthority are now generated from protobuf service specifications in a way that is fully backward compatible with original JSON spec and schema, so the same resource can be encoded and decoded from JSON, YAML and protobuf. All models should be refactored into new proto specification over time. 
Streaming presence service -------------------------- In order to cut bandwidth, nodes are sending full updates only when changes to labels or spec have occured, otherwise new light-weight GRPC keep alive updates are sent over to the presence service, reducing bandwidth usage on multi-node deployments. In addition to that nodes are no longer polling auth server for certificate authority rotation updates, instead they subscribe to event updates to detect updates as soon as they happen. This is a new API, so the errors are inevitable, that's why polling is still done, but on a way slower rate.
2018-11-07 23:33:38 +00:00
// than the recovery time (2 time the server keep alive ttl), change
2018-10-26 22:20:02 +00:00
// state to OK.
case TeleportOKEvent:
switch s.state {
case stateStarting:
s.state = stateOK
f.process.log.Debugf("Teleport component %q has started.", component)
2018-10-26 22:20:02 +00:00
case stateDegraded:
s.state = stateRecovering
s.recoveryTime = f.process.Clock.Now()
f.process.log.Infof("Teleport component %q is recovering from a degraded state.", component)
2018-10-26 22:20:02 +00:00
case stateRecovering:
if f.process.Clock.Since(s.recoveryTime) > defaults.HeartbeatCheckPeriod*2 {
s.state = stateOK
f.process.log.Infof("Teleport component %q has recovered from a degraded state.", component)
2018-10-26 22:20:02 +00:00
}
}
}
}
// getStateLocked derives the overall process state from the states of the
// individual components. With no components recorded it returns
// stateStarting.
//
// Precedence, highest first:
// 1. degraded   - any component is degraded
// 2. recovering - any component is recovering
// 3. starting   - at least one component is not yet OK
// 4. ok         - every recorded component is OK
//
// Note: f.mu must be locked by the caller!
func (f *processState) getStateLocked() componentStateEnum {
	total := len(f.states)
	okCount := 0
	anyRecovering := false

	for _, component := range f.states {
		if component.state == stateDegraded {
			// A single degraded component decides the outcome.
			return stateDegraded
		}
		if component.state == stateRecovering {
			anyRecovering = true
		} else if component.state == stateOK {
			okCount++
		}
	}

	switch {
	case total > 0 && okCount == total:
		// Only report stateOK if *all* components are in stateOK.
		return stateOK
	case anyRecovering:
		return stateRecovering
	default:
		return stateStarting
	}
}
// updateGauge publishes the current overall process state to the Prometheus
// state gauge.
//
// Note: f.mu must be locked by the caller!
func (f *processState) updateGauge() {
	overall := f.getStateLocked()
	stateGauge.Set(float64(overall))
}
2018-10-26 22:20:02 +00:00
// GetState returns the current state of the system.
func (f *processState) getState() componentStateEnum {
f.mu.Lock()
defer f.mu.Unlock()
return f.getStateLocked()
2018-10-26 22:20:02 +00:00
}