Merge pull request #2697 from baude/healtcheckphase3

podman health check phase3
This commit is contained in:
OpenShift Merge Robot 2019-03-24 04:16:43 -07:00 committed by GitHub
commit d0c6a35c05
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 272 additions and 20 deletions

View file

@ -293,7 +293,7 @@ func getCreateFlags(c *cliconfig.PodmanCommand) {
)
createFlags.String(
"healthcheck-interval", "30s",
"set an interval for the healthchecks",
"set an interval for the healthchecks (a value of disable results in no automatic timer setup)",
)
createFlags.Uint(
"healthcheck-retries", 3,

View file

@ -494,6 +494,14 @@ func generateContainerFilterFuncs(filter, filterValue string, runtime *libpod.Ru
}
return false
}, nil
case "health":
return func(c *libpod.Container) bool {
hcStatus, err := c.HealthCheckStatus()
if err != nil {
return false
}
return hcStatus == filterValue
}, nil
}
return nil, errors.Errorf("%s is an invalid filter", filter)
}

View file

@ -868,21 +868,21 @@ func makeHealthCheckFromCli(c *cliconfig.PodmanCommand) (*manifest.Schema2Health
hc := manifest.Schema2HealthConfig{
Test: cmd,
}
if inInterval == "disable" {
inInterval = "0"
}
intervalDuration, err := time.ParseDuration(inInterval)
if err != nil {
return nil, errors.Wrapf(err, "invalid healthcheck-interval %s ", inInterval)
}
if intervalDuration < time.Duration(time.Second*1) {
return nil, errors.New("healthcheck-interval must be at least 1 second")
}
hc.Interval = intervalDuration
if inRetries < 1 {
return nil, errors.New("healthcheck-retries must be greater than 0.")
}
hc.Retries = int(inRetries)
timeoutDuration, err := time.ParseDuration(inTimeout)
if err != nil {
return nil, errors.Wrapf(err, "invalid healthcheck-timeout %s", inTimeout)

View file

@ -100,6 +100,7 @@ Valid filters are listed below:
| before | [ID] or [Name] Containers created before this container |
| since | [ID] or [Name] Containers created since this container |
| volume | [VolumeName] or [MountpointDestination] Volume mounted in container |
| health | [Status] healthy or unhealthy |
**--help**, **-h**

View file

@ -833,6 +833,12 @@ func (c *Container) init(ctx context.Context) error {
if err := c.save(); err != nil {
return err
}
if c.config.HealthCheckConfig != nil {
if err := c.createTimer(); err != nil {
logrus.Error(err)
}
}
defer c.newContainerEvent(events.Init)
return c.completeNetworkSetup()
}
@ -956,6 +962,15 @@ func (c *Container) start() error {
c.state.State = ContainerStateRunning
if c.config.HealthCheckConfig != nil {
if err := c.updateHealthStatus(HealthCheckStarting); err != nil {
logrus.Error(err)
}
if err := c.startTimer(); err != nil {
logrus.Error(err)
}
}
defer c.newContainerEvent(events.Start)
return c.save()
@ -1123,6 +1138,13 @@ func (c *Container) cleanup(ctx context.Context) error {
logrus.Debugf("Cleaning up container %s", c.ID())
// Remove healthcheck unit/timer file if it exists
if c.config.HealthCheckConfig != nil {
if err := c.removeTimer(); err != nil {
logrus.Error(err)
}
}
// Clean up network namespace, if present
if err := c.cleanupNetwork(); err != nil {
lastError = err

View file

@ -3,13 +3,16 @@ package libpod
import (
"bufio"
"bytes"
"fmt"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"github.com/containers/libpod/pkg/inspect"
"github.com/coreos/go-systemd/dbus"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
@ -47,6 +50,10 @@ const (
HealthCheckHealthy string = "healthy"
// HealthCheckUnhealthy describes an unhealthy container
HealthCheckUnhealthy string = "unhealthy"
// HealthCheckStarting describes the time between when the container starts
// and the start-period (time allowed for the container to start and application
// to be running) expires.
HealthCheckStarting string = "starting"
)
// hcWriteCloser allows us to use bufio as a WriteCloser
@ -68,17 +75,18 @@ func (r *Runtime) HealthCheck(name string) (HealthCheckStatus, error) {
}
hcStatus, err := checkHealthCheckCanBeRun(container)
if err == nil {
return container.RunHealthCheck()
return container.runHealthCheck()
}
return hcStatus, err
}
// RunHealthCheck runs the health check as defined by the container
func (c *Container) RunHealthCheck() (HealthCheckStatus, error) {
// runHealthCheck runs the health check as defined by the container
func (c *Container) runHealthCheck() (HealthCheckStatus, error) {
var (
newCommand []string
returnCode int
capture bytes.Buffer
newCommand []string
returnCode int
capture bytes.Buffer
inStartPeriod bool
)
hcStatus, err := checkHealthCheckCanBeRun(c)
if err != nil {
@ -111,12 +119,28 @@ func (c *Container) RunHealthCheck() (HealthCheckStatus, error) {
returnCode = 1
}
timeEnd := time.Now()
if c.HealthCheckConfig().StartPeriod > 0 {
// there is a start-period we need to honor; we add startPeriod to container start time
startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
if timeStart.Before(startPeriodTime) {
// we are still in the start period, flip the inStartPeriod bool
inStartPeriod = true
logrus.Debugf("healthcheck for %s being run in start-period", c.ID())
}
}
eventLog := capture.String()
if len(eventLog) > MaxHealthCheckLogLength {
eventLog = eventLog[:MaxHealthCheckLogLength]
}
if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
returnCode = -1
hcResult = HealthCheckFailure
hcErr = errors.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
}
hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
if err := c.updateHealthCheckLog(hcl); err != nil {
if err := c.updateHealthCheckLog(hcl, inStartPeriod); err != nil {
return hcResult, errors.Wrapf(err, "unable to update health check log %s for %s", c.healthCheckLogPath(), c.ID())
}
return hcResult, hcErr
@ -145,8 +169,23 @@ func newHealthCheckLog(start, end time.Time, exitCode int, log string) inspect.H
}
}
// updateHealthStatus records the given status in the container's healthcheck
// log. It loads the current results via GetHealthCheckLog, overwrites the
// Status field, and writes the JSON-encoded results back to the path returned
// by healthCheckLogPath.
func (c *Container) updateHealthStatus(status string) error {
	results, err := c.GetHealthCheckLog()
	if err != nil {
		return err
	}
	results.Status = status

	encoded, marshalErr := json.Marshal(results)
	if marshalErr != nil {
		return errors.Wrap(marshalErr, "unable to marshall healthchecks for writing status")
	}
	return ioutil.WriteFile(c.healthCheckLogPath(), encoded, 0700)
}
// updateHealthCheckLog parses the health check results and writes the log
func (c *Container) updateHealthCheckLog(hcl inspect.HealthCheckLog) error {
func (c *Container) updateHealthCheckLog(hcl inspect.HealthCheckLog, inStartPeriod bool) error {
healthCheck, err := c.GetHealthCheckLog()
if err != nil {
return err
@ -159,11 +198,13 @@ func (c *Container) updateHealthCheckLog(hcl inspect.HealthCheckLog) error {
if len(healthCheck.Status) < 1 {
healthCheck.Status = HealthCheckHealthy
}
// increment failing streak
healthCheck.FailingStreak = healthCheck.FailingStreak + 1
// if failing streak > retries, then status to unhealthy
if int(healthCheck.FailingStreak) > c.HealthCheckConfig().Retries {
healthCheck.Status = HealthCheckUnhealthy
if !inStartPeriod {
// increment failing streak
healthCheck.FailingStreak = healthCheck.FailingStreak + 1
// if failing streak >= retries, then status to unhealthy
if int(healthCheck.FailingStreak) >= c.HealthCheckConfig().Retries {
healthCheck.Status = HealthCheckUnhealthy
}
}
}
healthCheck.Log = append(healthCheck.Log, hcl)
@ -199,3 +240,81 @@ func (c *Container) GetHealthCheckLog() (inspect.HealthCheckResults, error) {
}
return healthCheck, nil
}
// createTimer creates the transient systemd units that periodically run
// "podman healthcheck run <container ID>" for this container by invoking
// systemd-run. The unit is named after the container ID and is re-triggered
// each time it has been inactive for the configured healthcheck interval.
//
// It is a no-op (returns nil) when disableHealthCheckSystemd reports true.
// On failure, the returned error includes whatever systemd-run printed so the
// cause is visible to the caller.
func (c *Container) createTimer() error {
	if c.disableHealthCheckSystemd() {
		return nil
	}
	podman, err := os.Executable()
	if err != nil {
		return errors.Wrapf(err, "failed to get path for podman for a health check timer")
	}

	cmd := []string{
		"--unit", c.ID(),
		fmt.Sprintf("--on-unit-inactive=%s", c.HealthCheckConfig().Interval.String()),
		"--timer-property=AccuracySec=1s",
		podman, "healthcheck", "run", c.ID(),
	}

	// The connection is opened only to confirm that systemd is reachable; it
	// is closed right away because the units are created by systemd-run below.
	conn, err := dbus.NewSystemdConnection()
	if err != nil {
		return errors.Wrapf(err, "unable to get systemd connection to add healthchecks")
	}
	conn.Close()

	logrus.Debugf("creating systemd-transient files: %s %s", "systemd-run", cmd)
	out, err := exec.Command("systemd-run", cmd...).CombinedOutput()
	if err != nil {
		// Without the captured output the error is only "exit status N".
		return errors.Wrapf(err, "systemd-run failed to create healthcheck timer for container %s: %s", c.ID(), strings.TrimSpace(string(out)))
	}
	return nil
}
// startTimer asks systemd, over a dbus connection, to start the unit named
// "<container ID>.service" using the "fail" job mode. It returns nil without
// contacting systemd when disableHealthCheckSystemd reports true.
func (c *Container) startTimer() error {
	if c.disableHealthCheckSystemd() {
		return nil
	}

	systemd, err := dbus.NewSystemdConnection()
	if err != nil {
		return errors.Wrapf(err, "unable to get systemd connection to start healthchecks")
	}
	defer systemd.Close()

	unit := fmt.Sprintf("%s.service", c.ID())
	if _, err := systemd.StartUnit(unit, "fail", nil); err != nil {
		return err
	}
	return nil
}
// removeTimer asks systemd, over a dbus connection, to stop the unit named
// "<container ID>.timer" using the "fail" job mode. It returns nil without
// contacting systemd when disableHealthCheckSystemd reports true.
func (c *Container) removeTimer() error {
	if c.disableHealthCheckSystemd() {
		return nil
	}

	systemd, err := dbus.NewSystemdConnection()
	if err != nil {
		return errors.Wrapf(err, "unable to get systemd connection to remove healthchecks")
	}
	defer systemd.Close()

	timerUnit := c.ID() + ".timer"
	if _, err := systemd.StopUnit(timerUnit, "fail", nil); err != nil {
		return err
	}
	return nil
}
// HealthCheckStatus returns the Status field of the container's healthcheck
// log. An error is returned if the container has no healthcheck defined or if
// the healthcheck log cannot be read.
func (c *Container) HealthCheckStatus() (string, error) {
	if hasCheck := c.HasHealthCheck(); !hasCheck {
		return "", errors.Errorf("container %s has no defined healthcheck", c.ID())
	}

	hcLog, logErr := c.GetHealthCheckLog()
	if logErr != nil {
		return "", errors.Wrapf(logErr, "unable to get healthcheck log for %s", c.ID())
	}
	return hcLog.Status, nil
}
// disableHealthCheckSystemd reports whether systemd timers should be skipped
// for this container's healthcheck. That is the case when the environment
// variable DISABLE_HC_SYSTEMD is set to "true", or when the configured
// healthcheck interval is zero.
//
// It dereferences c.config.HealthCheckConfig, so callers must only invoke it
// for containers that have a healthcheck configured.
func (c *Container) disableHealthCheckSystemd() bool {
	envDisabled := os.Getenv("DISABLE_HC_SYSTEMD") == "true"
	return envDisabled || c.config.HealthCheckConfig.Interval == 0
}

View file

@ -239,7 +239,7 @@ func PodmanTestCreateUtil(tempDir string, remote bool) *PodmanTestIntegration {
ociRuntime = "/usr/bin/runc"
}
}
os.Setenv("DISABLE_HC_SYSTEMD", "true")
CNIConfigDir := "/etc/cni/net.d"
p := &PodmanTestIntegration{
@ -314,6 +314,14 @@ func (s *PodmanSessionIntegration) InspectImageJSON() []inspect.ImageData {
return i
}
// InspectContainer runs "podman inspect <name>", waits for the session to
// finish with the default timeout, and returns the session output decoded by
// InspectContainerToJSON.
func (p *PodmanTestIntegration) InspectContainer(name string) []inspect.ContainerData {
	inspectSession := p.Podman([]string{"inspect", name})
	inspectSession.WaitWithDefaultTimeout()
	return inspectSession.InspectContainerToJSON()
}
func processTestResult(f GinkgoTestDescription) {
tr := testResult{length: f.Duration.Seconds(), name: f.TestText}
testResults = append(testResults, tr)

View file

@ -83,4 +83,98 @@ var _ = Describe("Podman healthcheck run", func() {
hc.WaitWithDefaultTimeout()
Expect(hc.ExitCode()).To(Equal(125))
})
// A container started with a healthcheck is expected to report the
// "starting" status before any healthcheck has been run against it.
It("podman healthcheck should be starting", func() {
session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL ls /foo || exit 1\"", ALPINE, "top"})
session.WaitWithDefaultTimeout()
Expect(session.ExitCode()).To(Equal(0))
// No "healthcheck run" has been issued yet, so only the initial status is visible.
inspect := podmanTest.InspectContainer("hc")
Expect(inspect[0].State.Healthcheck.Status).To(Equal("starting"))
})
// Failing healthchecks that run inside the start period must not move the
// container out of the "starting" status, even when there are more failures
// than --healthcheck-retries allows.
It("podman healthcheck failed checks in start-period should not change status", func() {
// A 2m start period keeps every check below inside the start period.
session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", "--healthcheck-start-period", "2m", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL ls /foo || exit 1\"", ALPINE, "top"})
session.WaitWithDefaultTimeout()
Expect(session.ExitCode()).To(Equal(0))
// Three consecutive failures (/foo does not exist), one more than the retries value of 2.
hc := podmanTest.Podman([]string{"healthcheck", "run", "hc"})
hc.WaitWithDefaultTimeout()
Expect(hc.ExitCode()).To(Equal(1))
hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"})
hc.WaitWithDefaultTimeout()
Expect(hc.ExitCode()).To(Equal(1))
hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"})
hc.WaitWithDefaultTimeout()
Expect(hc.ExitCode()).To(Equal(1))
// Status is still the initial one despite the failures.
inspect := podmanTest.InspectContainer("hc")
Expect(inspect[0].State.Healthcheck.Status).To(Equal("starting"))
})
// Without a start period, the status only flips to "unhealthy" once the
// number of consecutive failures reaches --healthcheck-retries (2 here).
It("podman healthcheck failed checks must reach retries before unhealthy ", func() {
session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL ls /foo || exit 1\"", ALPINE, "top"})
session.WaitWithDefaultTimeout()
Expect(session.ExitCode()).To(Equal(0))
// First failure: below the retries threshold, status stays "starting".
hc := podmanTest.Podman([]string{"healthcheck", "run", "hc"})
hc.WaitWithDefaultTimeout()
Expect(hc.ExitCode()).To(Equal(1))
inspect := podmanTest.InspectContainer("hc")
Expect(inspect[0].State.Healthcheck.Status).To(Equal("starting"))
// Second failure: threshold reached, status becomes "unhealthy".
hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"})
hc.WaitWithDefaultTimeout()
Expect(hc.ExitCode()).To(Equal(1))
inspect = podmanTest.InspectContainer("hc")
Expect(inspect[0].State.Healthcheck.Status).To(Equal("unhealthy"))
})
// A passing healthcheck is expected to mark the container "healthy" even
// while the start period (2m here) has not yet expired.
It("podman healthcheck good check results in healthy even in start-period", func() {
// The healthcheck command lists the working directory, so it is expected to succeed.
session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", "--healthcheck-start-period", "2m", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL\" \"ls\" \"||\" \"exit\" \"1\"", ALPINE, "top"})
session.WaitWithDefaultTimeout()
Expect(session.ExitCode()).To(Equal(0))
hc := podmanTest.Podman([]string{"healthcheck", "run", "hc"})
hc.WaitWithDefaultTimeout()
Expect(hc.ExitCode()).To(Equal(0))
inspect := podmanTest.InspectContainer("hc")
Expect(inspect[0].State.Healthcheck.Status).To(Equal("healthy"))
})
// After a container has become "unhealthy", a single passing healthcheck is
// expected to bring it back to "healthy".
It("podman healthcheck single healthy result changes failed to healthy", func() {
session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL\" \"ls\" \"/foo\" \"||\" \"exit\" \"1\"", ALPINE, "top"})
session.WaitWithDefaultTimeout()
Expect(session.ExitCode()).To(Equal(0))
// First failure (/foo is missing): status stays "starting".
hc := podmanTest.Podman([]string{"healthcheck", "run", "hc"})
hc.WaitWithDefaultTimeout()
Expect(hc.ExitCode()).To(Equal(1))
inspect := podmanTest.InspectContainer("hc")
Expect(inspect[0].State.Healthcheck.Status).To(Equal("starting"))
// Second failure reaches the retries value of 2: status becomes "unhealthy".
hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"})
hc.WaitWithDefaultTimeout()
Expect(hc.ExitCode()).To(Equal(1))
inspect = podmanTest.InspectContainer("hc")
Expect(inspect[0].State.Healthcheck.Status).To(Equal("unhealthy"))
// Create /foo inside the container so the healthcheck command starts passing.
foo := podmanTest.Podman([]string{"exec", "hc", "touch", "/foo"})
foo.WaitWithDefaultTimeout()
Expect(foo.ExitCode()).To(BeZero())
// One passing check is enough to report "healthy" again.
hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"})
hc.WaitWithDefaultTimeout()
Expect(hc.ExitCode()).To(Equal(0))
inspect = podmanTest.InspectContainer("hc")
Expect(inspect[0].State.Healthcheck.Status).To(Equal("healthy"))
})
})