mirror of
https://github.com/containers/podman
synced 2024-10-20 17:23:30 +00:00
660a72993c
podman gating tests are hanging in the new Fedora CI setup; long and tedious investigation suggests that 'socat' processes are being left unkilled, which then causes BATS to hang when it (presumably) runs a final 'wait' in its end cleanup. The two principal changes are to exec socat in a subshell with fd3 closed, and to pkill its child processes before killing the process itself. I don't know if both are needed. The pkill definitely is; the exec may just be superstition. Since I've wasted more than a day of PTO time on this, I'm okay with a little superstition. What I do know is that with these two changes, my reproducer fails to reproduce in over one hour of trying (normally it fails within 5 minutes). AND, update: only rawhide (f35) leaves stray socat processes behind. f33 and ubuntu do not, so 'pkill -P' fails. I really have no idea what's going on. Signed-off-by: Ed Santiago <santiago@redhat.com>
171 lines
5.4 KiB
Bash
171 lines
5.4 KiB
Bash
#!/usr/bin/env bats -*- bats -*-
|
|
#
|
|
# Tests for systemd sdnotify
|
|
#
|
|
|
|
load helpers
|
|
|
|
# Shared throughout this module: PID of socat process, and path to its log
|
|
_SOCAT_PID=
|
|
_SOCAT_LOG=
|
|
|
|
function setup() {
|
|
skip_if_remote "systemd tests are meaningless over remote"
|
|
|
|
# Skip if systemd is not running
|
|
systemctl list-units &>/dev/null || skip "systemd not available"
|
|
|
|
# sdnotify fails with runc 1.0.0-3-dev2 on Ubuntu. Let's just
|
|
# assume that we work only with crun, nothing else.
|
|
run_podman info --format '{{ .Host.OCIRuntime.Name }}'
|
|
if [[ "$output" != "crun" ]]; then
|
|
skip "this test only works with crun, not '$output'"
|
|
fi
|
|
|
|
basic_setup
|
|
}
|
|
|
|
function teardown() {
|
|
unset NOTIFY_SOCKET
|
|
|
|
_stop_socat
|
|
|
|
basic_teardown
|
|
}
|
|
|
|
###############################################################################
|
|
# BEGIN helpers
|
|
|
|
# Run socat process on a socket, logging to well-known path. Each received
|
|
# packet is logged with a newline appended, for ease of parsing the log file.
|
|
function _start_socat() {
|
|
_SOCAT_LOG="$PODMAN_TMPDIR/socat.log"
|
|
|
|
rm -f $_SOCAT_LOG
|
|
# Execute in subshell so we can close fd3 (which BATS uses).
|
|
# This is a superstitious ritual to try to avoid leaving processes behind,
|
|
# and thus prevent CI hangs.
|
|
(exec socat unix-recvfrom:"$NOTIFY_SOCKET",fork \
|
|
system:"(cat;echo) >> $_SOCAT_LOG" 3>&-) &
|
|
_SOCAT_PID=$!
|
|
}
|
|
|
|
# Stop the socat background process and clean up logs
|
|
function _stop_socat() {
|
|
if [[ -n "$_SOCAT_PID" ]]; then
|
|
# Kill all child processes, then the process itself.
|
|
# This is a superstitious incantation to avoid leaving processes behind.
|
|
# The '|| true' is because only f35 leaves behind socat processes;
|
|
# f33 (and perhaps others?) behave nicely. ARGH!
|
|
pkill -P $_SOCAT_PID || true
|
|
kill $_SOCAT_PID
|
|
fi
|
|
_SOCAT_PID=
|
|
|
|
if [[ -n "$_SOCAT_LOG" ]]; then
|
|
rm -f $_SOCAT_LOG
|
|
fi
|
|
_SOCAT_LOG=
|
|
}
|
|
|
|
# Check that MAINPID=xxxxx points to a running conmon process
|
|
function _assert_mainpid_is_conmon() {
|
|
local mainpid=$(expr "$1" : "MAINPID=\([0-9]\+\)")
|
|
test -n "$mainpid" || die "Could not parse '$1' as 'MAINPID=nnnn'"
|
|
|
|
test -d /proc/$mainpid || die "sdnotify MAINPID=$mainpid - but /proc/$mainpid does not exist"
|
|
|
|
# e.g. /proc/12345/exe -> /usr/bin/conmon
|
|
local mainpid_bin=$(readlink /proc/$mainpid/exe)
|
|
is "$mainpid_bin" ".*/conmon" "sdnotify MAINPID=$mainpid is conmon process"
|
|
}
|
|
|
|
# END helpers
|
|
###############################################################################
|
|
# BEGIN tests themselves
|
|
|
|
@test "sdnotify : ignore" {
|
|
export NOTIFY_SOCKET=$PODMAN_TMPDIR/ignore.sock
|
|
_start_socat
|
|
|
|
run_podman 1 run --rm --sdnotify=ignore $IMAGE printenv NOTIFY_SOCKET
|
|
is "$output" "" "\$NOTIFY_SOCKET in container"
|
|
|
|
is "$(< $_SOCAT_LOG)" "" "nothing received on socket"
|
|
_stop_socat
|
|
}
|
|
|
|
@test "sdnotify : conmon" {
|
|
export NOTIFY_SOCKET=$PODMAN_TMPDIR/conmon.sock
|
|
_start_socat
|
|
|
|
run_podman run -d --name sdnotify_conmon_c \
|
|
--sdnotify=conmon \
|
|
$IMAGE \
|
|
sh -c 'printenv NOTIFY_SOCKET;echo READY;while ! test -f /stop;do sleep 0.1;done'
|
|
cid="$output"
|
|
wait_for_ready $cid
|
|
|
|
run_podman logs sdnotify_conmon_c
|
|
is "$output" "READY" "\$NOTIFY_SOCKET in container"
|
|
|
|
# The 'echo's help us debug failed runs
|
|
run cat $_SOCAT_LOG
|
|
echo "socat log:"
|
|
echo "$output"
|
|
|
|
# ARGH! 'READY=1' should always be the last output line. But sometimes,
|
|
# for reasons unknown, we get an extra MAINPID=xxx after READY=1 (#8718).
|
|
# Who knows if this is a systemd bug, or conmon, or what. I don't
|
|
# even know where to begin asking. So, to eliminate the test flakes,
|
|
# we look for READY=1 _anywhere_ in the output, not just the last line.
|
|
is "$output" ".*READY=1.*" "sdnotify sent READY=1"
|
|
|
|
_assert_mainpid_is_conmon "${lines[0]}"
|
|
|
|
# Done. Stop container, clean up.
|
|
run_podman exec $cid touch /stop
|
|
run_podman wait $cid
|
|
run_podman rm $cid
|
|
_stop_socat
|
|
}
|
|
|
|
@test "sdnotify : container" {
|
|
# Sigh... we need to pull a humongous image because it has systemd-notify.
|
|
# (IMPORTANT: fedora:32 and above silently removed systemd-notify; this
|
|
# caused CI to hang. That's why we explicitly require fedora:31)
|
|
# FIXME: is there a smaller image we could use?
|
|
local _FEDORA="$PODMAN_TEST_IMAGE_REGISTRY/$PODMAN_TEST_IMAGE_USER/fedora:31"
|
|
# Pull that image. Retry in case of flakes.
|
|
run_podman pull $_FEDORA || \
|
|
run_podman pull $_FEDORA || \
|
|
run_podman pull $_FEDORA
|
|
|
|
export NOTIFY_SOCKET=$PODMAN_TMPDIR/container.sock
|
|
_start_socat
|
|
|
|
run_podman run -d --sdnotify=container $_FEDORA \
|
|
sh -c 'printenv NOTIFY_SOCKET;echo READY;systemd-notify --ready;while ! test -f /stop;do sleep 0.1;done'
|
|
cid="$output"
|
|
wait_for_ready $cid
|
|
|
|
run_podman logs $cid
|
|
is "${lines[0]}" "/.*/container\.sock/notify" "NOTIFY_SOCKET is passed to container"
|
|
|
|
# With container, READY=1 isn't necessarily the last message received;
|
|
# just look for it anywhere in received messages
|
|
run cat $_SOCAT_LOG
|
|
is "$output" ".*READY=1" "received READY=1 through notify socket"
|
|
|
|
_assert_mainpid_is_conmon "${lines[0]}"
|
|
|
|
# Done. Stop container, clean up.
|
|
run_podman exec $cid touch /stop
|
|
run_podman wait $cid
|
|
run_podman rm $cid
|
|
run_podman rmi $_FEDORA
|
|
_stop_socat
|
|
}
|
|
|
|
# vim: filetype=sh
|