mirror of
https://github.com/gravitational/teleport
synced 2024-10-21 01:34:01 +00:00
97c18fa1a9
Fixes #7606, where a node doesn't notice when the tunnel port changes. Imagine you have a cluster with a node connected in via a tunnel through a proxy `proxy.example.com` on port `3024`. Now change the proxy config so that `tunnel_public_address` is `proxy.example.com:4024`. You either restart the proxy, or reload the proxy config with a `SIGHUP`. ...and then the node a) loses its connection to auth (because the tunnel is gone), and b) _doesn't reconnect_, because even though the proxy address hasn't changed, the node has cached the old `tunnel_public_address` and keeps trying to connect to that. You can always manually restart the node to have it reconnect, but that would be a pain if you have thousands of nodes. In order to not have to manually restart all nodes, this change implements a check for connection failures to the auth server, and restarts the node if there are multiple connection failures in a given period of time. The check as implemented piggybacks on the node's "common.rotate" service, which can already restart the node in certain circumstances, and uses the success of the periodic rotation sync as a proxy for the health of the node's connection to the auth server. See-Also: #7606
112 lines
3.7 KiB
Go
112 lines
3.7 KiB
Go
/*
|
|
Copyright 2021 Gravitational, Inc.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package integration
|
|
|
|
import (
|
|
"context"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/gravitational/teleport/lib"
|
|
"github.com/gravitational/teleport/lib/auth/testauthority"
|
|
"github.com/gravitational/teleport/lib/service"
|
|
log "github.com/sirupsen/logrus"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
// TestLostConnectionToAuthCausesReload tests that a lost connection to the auth server
|
|
// will eventually restart a node
|
|
func TestLostConnectionToAuthCausesReload(t *testing.T) {
|
|
// Because testing that the node does a full restart is a bit tricky when
|
|
// running a cluster from inside a test runner (i.e. we don't want to
|
|
// SIGTERM the test runner), we will watch for the node emitting a
|
|
// `TeleportReload` even instead. In a proper Teleport instance, this
|
|
// event would be picked up at the Supervisor level and would eventually
|
|
// cause the instance to gracefully restart.
|
|
|
|
require := require.New(t)
|
|
log := log.StandardLogger()
|
|
|
|
log.Info(">>> Entering Test")
|
|
|
|
// InsecureDevMode needed for SSH connections
|
|
// TODO(tcsc): surface this as per-server config (see also issue #8913)
|
|
lib.SetInsecureDevMode(true)
|
|
defer lib.SetInsecureDevMode(false)
|
|
|
|
// GIVEN a cluster with a running auth+proxy instance....
|
|
log.Info(">>> Creating cluster")
|
|
keygen := testauthority.New()
|
|
privateKey, publicKey, err := keygen.GenerateKeyPair("")
|
|
require.NoError(err)
|
|
auth := NewInstance(InstanceConfig{
|
|
ClusterName: "test-tunnel-collapse",
|
|
HostID: "auth",
|
|
Priv: privateKey,
|
|
Pub: publicKey,
|
|
Ports: standardPortSetup(),
|
|
log: log,
|
|
})
|
|
|
|
log.Info(">>> Creating auth-proxy...")
|
|
authConf := service.MakeDefaultConfig()
|
|
authConf.Hostname = Host
|
|
authConf.Auth.Enabled = true
|
|
authConf.Proxy.Enabled = true
|
|
authConf.SSH.Enabled = false
|
|
authConf.Proxy.DisableWebInterface = true
|
|
authConf.Proxy.DisableDatabaseProxy = true
|
|
require.NoError(auth.CreateEx(t, nil, authConf))
|
|
t.Cleanup(func() { require.NoError(auth.StopAll()) })
|
|
|
|
log.Info(">>> Start auth-proxy...")
|
|
require.NoError(auth.Start())
|
|
|
|
// ... and an SSH node connected via a reverse tunnel configured to
|
|
// reload after only a few failed connection attempts per minute
|
|
log.Info(">>> Creating and starting node...")
|
|
nodeCfg := service.MakeDefaultConfig()
|
|
nodeCfg.Hostname = Host
|
|
nodeCfg.SSH.Enabled = true
|
|
nodeCfg.RotationConnectionInterval = 1 * time.Second
|
|
nodeCfg.RestartThreshold = service.Rate{Amount: 3, Time: 1 * time.Minute}
|
|
node, err := auth.StartReverseTunnelNode(nodeCfg)
|
|
require.NoError(err)
|
|
|
|
// WHEN I stop the auth node (and, by implication, disrupt the ssh node's
|
|
// connection to it)
|
|
log.Info(">>> Stopping auth node")
|
|
auth.StopAuth(false)
|
|
|
|
// EXPECT THAT the ssh node will eventually issue a reload request
|
|
log.Info(">>> Waiting for node restart request.")
|
|
waitCtx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
|
|
defer cancel()
|
|
|
|
eventCh := make(chan service.Event)
|
|
node.WaitForEvent(waitCtx, service.TeleportReloadEvent, eventCh)
|
|
select {
|
|
case e := <-eventCh:
|
|
log.Infof(">>> Received Reload event: %v. Test passed.", e)
|
|
|
|
case <-waitCtx.Done():
|
|
require.FailNow("Timed out", "Timed out waiting for reload event")
|
|
}
|
|
|
|
log.Info(">>> TEST COMPLETE")
|
|
}
|