2018-10-26 22:20:02 +00:00
|
|
|
/*
|
|
|
|
Copyright 2018 Gravitational, Inc.
|
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
you may not use this file except in compliance with the License.
|
|
|
|
You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
package service
|
|
|
|
|
|
|
|
import (
|
2020-05-14 20:53:54 +00:00
|
|
|
"fmt"
|
2020-08-18 18:23:13 +00:00
|
|
|
"sync"
|
2018-10-26 22:20:02 +00:00
|
|
|
"time"
|
|
|
|
|
2020-05-12 22:50:52 +00:00
|
|
|
"github.com/gravitational/teleport"
|
2018-10-26 22:20:02 +00:00
|
|
|
"github.com/gravitational/teleport/lib/defaults"
|
2021-05-19 15:53:36 +00:00
|
|
|
"github.com/gravitational/teleport/lib/utils"
|
|
|
|
"github.com/gravitational/trace"
|
2020-05-12 22:50:52 +00:00
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
2018-10-26 22:20:02 +00:00
|
|
|
)
|
|
|
|
|
2020-08-18 18:23:13 +00:00
|
|
|
// componentStateEnum enumerates the health states a Teleport component can be
// in; the same type also carries the aggregated state of the whole process.
type componentStateEnum byte
|
|
|
|
|
2020-05-14 20:53:54 +00:00
|
|
|
// Note: these consts are not using iota because they get exposed via a
|
|
|
|
// Prometheus metric. Using iota makes it possible to accidentally change the
|
|
|
|
// values.
|
2018-10-26 22:20:02 +00:00
|
|
|
const (
|
|
|
|
// stateOK means Teleport is operating normally.
|
2020-08-18 18:23:13 +00:00
|
|
|
stateOK = componentStateEnum(0)
|
2018-10-26 22:20:02 +00:00
|
|
|
// stateRecovering means Teleport has begun recovering from a degraded state.
|
2020-08-18 18:23:13 +00:00
|
|
|
stateRecovering = componentStateEnum(1)
|
2018-11-15 19:14:20 +00:00
|
|
|
// stateDegraded means some kind of connection error has occurred to put
|
2018-10-26 22:20:02 +00:00
|
|
|
// Teleport into a degraded state.
|
2020-08-18 18:23:13 +00:00
|
|
|
stateDegraded = componentStateEnum(2)
|
2020-05-14 20:53:54 +00:00
|
|
|
// stateStarting means the process is starting but hasn't joined the
|
|
|
|
// cluster yet.
|
2020-08-18 18:23:13 +00:00
|
|
|
stateStarting = componentStateEnum(3)
|
2018-10-26 22:20:02 +00:00
|
|
|
)
|
|
|
|
|
2020-05-12 22:50:52 +00:00
|
|
|
var stateGauge = prometheus.NewGauge(prometheus.GaugeOpts{
|
|
|
|
Name: teleport.MetricState,
|
2020-05-14 20:53:54 +00:00
|
|
|
Help: fmt.Sprintf("State of the teleport process: %d - ok, %d - recovering, %d - degraded, %d - starting", stateOK, stateRecovering, stateDegraded, stateStarting),
|
2020-05-12 22:50:52 +00:00
|
|
|
})
|
|
|
|
|
|
|
|
func init() {
|
2020-08-18 18:23:13 +00:00
|
|
|
stateGauge.Set(float64(stateStarting))
|
2020-05-12 22:50:52 +00:00
|
|
|
}
|
|
|
|
|
2018-10-26 22:20:02 +00:00
|
|
|
// processState tracks the state of the Teleport process.
|
|
|
|
type processState struct {
|
2020-08-18 18:23:13 +00:00
|
|
|
process *TeleportProcess
|
|
|
|
mu sync.Mutex
|
|
|
|
states map[string]*componentState
|
|
|
|
}
|
|
|
|
|
|
|
|
type componentState struct {
|
2018-10-26 22:20:02 +00:00
|
|
|
recoveryTime time.Time
|
2020-08-18 18:23:13 +00:00
|
|
|
state componentStateEnum
|
2018-10-26 22:20:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// newProcessState returns a new FSM that tracks the state of the Teleport process.
|
2021-05-19 15:53:36 +00:00
|
|
|
func newProcessState(process *TeleportProcess) (*processState, error) {
|
|
|
|
err := utils.RegisterPrometheusCollectors(stateGauge)
|
|
|
|
if err != nil {
|
|
|
|
return nil, trace.Wrap(err)
|
|
|
|
}
|
|
|
|
|
2018-10-26 22:20:02 +00:00
|
|
|
return &processState{
|
2020-08-18 18:23:13 +00:00
|
|
|
process: process,
|
|
|
|
states: make(map[string]*componentState),
|
2021-05-19 15:53:36 +00:00
|
|
|
}, nil
|
2018-10-26 22:20:02 +00:00
|
|
|
}
|
|
|
|
|
2020-08-18 18:23:13 +00:00
|
|
|
// update the state of a Teleport component.
|
|
|
|
func (f *processState) update(event Event) {
|
|
|
|
f.mu.Lock()
|
|
|
|
defer f.mu.Unlock()
|
|
|
|
defer f.updateGauge()
|
|
|
|
|
|
|
|
component, ok := event.Payload.(string)
|
|
|
|
if !ok {
|
2022-03-17 16:46:12 +00:00
|
|
|
f.process.log.Errorf("%v broadcasted without component name, this is a bug!", event.Name)
|
2020-08-18 18:23:13 +00:00
|
|
|
return
|
|
|
|
}
|
|
|
|
s, ok := f.states[component]
|
|
|
|
if !ok {
|
|
|
|
// Register a new component.
|
2021-01-12 11:10:00 +00:00
|
|
|
s = &componentState{recoveryTime: f.process.Clock.Now(), state: stateStarting}
|
2020-08-18 18:23:13 +00:00
|
|
|
f.states[component] = s
|
|
|
|
}
|
|
|
|
|
2018-10-26 22:20:02 +00:00
|
|
|
switch event.Name {
|
|
|
|
// If a degraded event was received, always change the state to degraded.
|
|
|
|
case TeleportDegradedEvent:
|
2020-08-18 18:23:13 +00:00
|
|
|
s.state = stateDegraded
|
2020-12-07 14:35:15 +00:00
|
|
|
f.process.log.Infof("Detected Teleport component %q is running in a degraded state.", component)
|
2018-10-26 22:20:02 +00:00
|
|
|
// If the current state is degraded, and a OK event has been
|
|
|
|
// received, change the state to recovering. If the current state is
|
|
|
|
// recovering and a OK events is received, if it's been longer
|
Events and GRPC API
This commit introduces several key changes to
Teleport backend and API infrastructure
in order to achieve scalability improvements
on 10K+ node deployments.
Events and plain keyspace
--------------------------
New backend interface supports events,
pagination and range queries
and moves away from buckets to
plain keyspace, what better aligns
with DynamoDB and Etcd featuring similar
interfaces.
All backend implementations are
exposing Events API, allowing
multiple subscribers to consume the same
event stream and avoid polling database.
Replacing BoltDB, Dir with SQLite
-------------------------------
BoltDB backend does not support
having two processes access the database at the
same time. This prevented Teleport
using BoltDB backend to be live reloaded.
SQLite supports reads/writes by multiple
processes and makes Dir backend obsolete
as SQLite is more efficient on larger collections,
supports transactions and can detect data
corruption.
Teleport automatically migrates data from
Bolt and Dir backends into SQLite.
GRPC API and protobuf resources
-------------------------------
GRPC API has been introduced for
the auth server. The auth server now serves both GRPC
and JSON-HTTP API on the same TLS socket and uses
the same client certificate authentication.
All future API methods should use GRPC and HTTP-JSON
API is considered obsolete.
In addition to that some resources like
Server and CertificateAuthority are now
generated from protobuf service specifications in
a way that is fully backward compatible with
original JSON spec and schema, so the same resource
can be encoded and decoded from JSON, YAML
and protobuf.
All models should be refactored
into new proto specification over time.
Streaming presence service
--------------------------
In order to cut bandwidth, nodes
are sending full updates only when changes
to labels or spec have occured, otherwise
new light-weight GRPC keep alive updates are sent
over to the presence service, reducing
bandwidth usage on multi-node deployments.
In addition to that nodes are no longer polling
auth server for certificate authority rotation
updates, instead they subscribe to event updates
to detect updates as soon as they happen.
This is a new API, so the errors are inevitable,
that's why polling is still done, but
on a way slower rate.
2018-11-07 23:33:38 +00:00
|
|
|
// than the recovery time (2 time the server keep alive ttl), change
|
2018-10-26 22:20:02 +00:00
|
|
|
// state to OK.
|
|
|
|
case TeleportOKEvent:
|
2020-08-18 18:23:13 +00:00
|
|
|
switch s.state {
|
|
|
|
case stateStarting:
|
|
|
|
s.state = stateOK
|
2020-12-07 14:35:15 +00:00
|
|
|
f.process.log.Debugf("Teleport component %q has started.", component)
|
2018-10-26 22:20:02 +00:00
|
|
|
case stateDegraded:
|
2020-08-18 18:23:13 +00:00
|
|
|
s.state = stateRecovering
|
2021-01-12 11:10:00 +00:00
|
|
|
s.recoveryTime = f.process.Clock.Now()
|
2020-12-07 14:35:15 +00:00
|
|
|
f.process.log.Infof("Teleport component %q is recovering from a degraded state.", component)
|
2018-10-26 22:20:02 +00:00
|
|
|
case stateRecovering:
|
2022-03-17 16:46:12 +00:00
|
|
|
if f.process.Clock.Since(s.recoveryTime) > defaults.HeartbeatCheckPeriod*2 {
|
2020-08-18 18:23:13 +00:00
|
|
|
s.state = stateOK
|
2020-12-07 14:35:15 +00:00
|
|
|
f.process.log.Infof("Teleport component %q has recovered from a degraded state.", component)
|
2018-10-26 22:20:02 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-08-18 18:23:13 +00:00
|
|
|
// getStateLocked returns the overall process state based on the state of
|
|
|
|
// individual components. If no components sent updates yet, returns
|
|
|
|
// stateStarting.
|
|
|
|
//
|
|
|
|
// Order of importance:
|
|
|
|
// 1. degraded
|
|
|
|
// 2. recovering
|
|
|
|
// 3. starting
|
|
|
|
// 4. ok
|
|
|
|
//
|
|
|
|
// Note: f.mu must be locked by the caller!
|
|
|
|
func (f *processState) getStateLocked() componentStateEnum {
|
|
|
|
state := stateStarting
|
|
|
|
numNotOK := len(f.states)
|
|
|
|
for _, s := range f.states {
|
|
|
|
switch s.state {
|
|
|
|
case stateDegraded:
|
|
|
|
return stateDegraded
|
|
|
|
case stateRecovering:
|
|
|
|
state = stateRecovering
|
|
|
|
case stateOK:
|
|
|
|
numNotOK--
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Only return stateOK if *all* components are in stateOK.
|
|
|
|
if numNotOK == 0 && len(f.states) > 0 {
|
|
|
|
state = stateOK
|
|
|
|
}
|
|
|
|
return state
|
|
|
|
}
|
|
|
|
|
|
|
|
// Note: f.mu must be locked by the caller!
|
|
|
|
func (f *processState) updateGauge() {
|
|
|
|
stateGauge.Set(float64(f.getStateLocked()))
|
|
|
|
}
|
|
|
|
|
2018-10-26 22:20:02 +00:00
|
|
|
// GetState returns the current state of the system.
|
2020-08-18 18:23:13 +00:00
|
|
|
func (f *processState) getState() componentStateEnum {
|
|
|
|
f.mu.Lock()
|
|
|
|
defer f.mu.Unlock()
|
|
|
|
return f.getStateLocked()
|
2018-10-26 22:20:02 +00:00
|
|
|
}
|