Metrics: add running services metric (#34999)

* Metrics: add running services metric

This PR adds a new gauge metric: `teleport_services`.
This metric has a label identifying the service; its value indicates
whether or not that service is running.
The services in question are the ones started by the supervisor.
E.g., proxy.web, discovery.init, ssh.node, auth.tls.

When the service stops, the gauge is decremented.

This gives us an overview of the currently running services in the
process.

* Consider only a subset of services

* use friendly name for service names

* improve metric's help message

* Update lib/service/supervisor.go

Co-authored-by: rosstimothy <39066650+rosstimothy@users.noreply.github.com>

* Fix service names

---------

Co-authored-by: rosstimothy <39066650+rosstimothy@users.noreply.github.com>
This commit is contained in:
Marco André Dinis 2023-12-05 09:11:30 +00:00 committed by GitHub
parent 88fa225dd4
commit ab6cf95459
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 42 additions and 0 deletions

View file

@ -25,9 +25,11 @@ import (
"time"
"github.com/gravitational/trace"
"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
"github.com/gravitational/teleport"
"github.com/gravitational/teleport/lib/observability/metrics"
)
// Supervisor implements the simple service logic - registering
@ -273,11 +275,38 @@ type ExitEventPayload struct {
Error error
}
// metricsServicesRunning is a gauge vector with a single label,
// teleport.TagServiceName. serve increments the gauge for a tracked service
// when that service's goroutine starts and decrements it when the goroutine
// returns. The collector is registered in Start before any service is served.
var metricsServicesRunning = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: teleport.MetricNamespace,
Name: teleport.MetricTeleportServices,
Help: "Teleport services currently enabled and running",
},
[]string{teleport.TagServiceName},
)
// metricsServicesRunningMap maps a supervisor service name (the value returned
// by Service.Name) to the friendly label value used for the
// metricsServicesRunning gauge. serve only updates the gauge for services
// whose name is a key in this map; all other services are not counted.
var metricsServicesRunningMap = map[string]string{
"discovery.init": "discovery_service",
"ssh.node": "ssh_service",
"auth.tls": "auth_service",
"proxy.web": "proxy_service",
"kube.init": "kubernetes_service",
"apps.start": "application_service",
"db.init": "database_service",
"windows_desktop.init": "windows_desktop_service",
"okta.init": "okta_service",
"jamf.init": "jamf_service",
}
func (s *LocalSupervisor) serve(srv Service) {
s.wg.Add(1)
go func() {
defer s.wg.Done()
defer s.RemoveService(srv)
if label, ok := metricsServicesRunningMap[srv.Name()]; ok {
metricsServicesRunning.WithLabelValues(label).Inc()
defer metricsServicesRunning.WithLabelValues(label).Dec()
}
l := s.log.WithField("service", srv.Name())
l.Debug("Service has started.")
err := srv.Serve()
@ -307,6 +336,10 @@ func (s *LocalSupervisor) Start() error {
return nil
}
if err := metrics.RegisterPrometheusCollectors(metricsServicesRunning); err != nil {
return trace.Wrap(err)
}
for _, srv := range s.services {
s.serve(srv)
}

View file

@ -244,6 +244,9 @@ const (
// (as defined by types.PluginStatus) for a plugin instance
MetricHostedPluginStatus = "hosted_plugin_status"
// MetricTeleportServices tracks which services are currently running in the current Teleport Process.
MetricTeleportServices = "services"
// TagRange is a tag specifying backend requests
TagRange = "range"
@ -284,6 +287,12 @@ const (
// were used for the agent.
// This value comes from UpstreamInventoryAgentMetadata (sourced in lib/inventory/metadata.fetchInstallMethods).
TagInstallMethods = "install_methods"
// TagServiceName is the prometheus label to indicate what services are running in the current process.
// Those services are monitored using the Supervisor.
// Only a subset of services are monitored. See [lib/service.metricsServicesRunningMap].
// E.g., discovery_service.
TagServiceName = "service_name"
)
const (