GHA: Auto. re-run failed cirrus-cron builds once

With a seemingly ever growing list of cirrus-cron jobs running on
release branches, there are bound to be some hiccups.  Sometimes a lot
of them.  Normally any failures require a human to eyeball the logs
and/or manually re-run the job to see if it was simply a flake.  This
doesn't take long, but can be distracting and compounds over time.

Attempt to alleviate some maintainer burden by using a new GitHub Actions
workflow to perform **one** automatic re-run of any failed builds.  This
task is scheduled an hour before a second failure check, which generates
the notification e-mail for review.

Note: If there are no failures, whether thanks to the automatic re-run or
to luck, no e-mail is generated. If this proves useful in this repo, I
intend to re-use this workflow for other repos' cirrus-cron jobs.

Signed-off-by: Chris Evich <cevich@redhat.com>
This commit is contained in:
Chris Evich 2022-10-20 13:21:04 -04:00
parent 3a85d537b6
commit 35523d560a
No known key found for this signature in database
GPG key ID: 03EDC70FD578067F
5 changed files with 261 additions and 48 deletions

View file

@ -8,31 +8,25 @@ set -eo pipefail
source $(dirname "${BASH_SOURCE[0]}")/lib.sh
_errfmt="Expecting %s value to not be empty"
if [[ -z "$GITHUB_REPOSITORY" ]]; then
if [[ -z "$GITHUB_REPOSITORY" ]]; then # <owner>/<repo>
err $(printf "$_errfmt" "\$GITHUB_REPOSITORY")
elif [[ -z "$NAME_ID_FILEPATH" ]]; then
elif [[ -z "$NAME_ID_FILEPATH" ]]; then # output filepath
err $(printf "$_errfmt" "\$NAME_ID_FILEPATH")
fi
mkdir -p artifacts
cat > ./artifacts/query_raw.json << "EOF"
{"query":"
query CronNameStatus($owner: String!, $repo: String!) {
ownerRepository(platform: \"LINUX\", owner: $owner, name: $repo) {
cronSettings {
name
lastInvocationBuild {
id
status
}
query {
ownerRepository(platform: "LINUX", owner: "@@OWNER@@", name: "@@REPO@@") {
cronSettings {
name
lastInvocationBuild {
id
status
}
}
}
",
"variables":"{
\"owner\": \"@@OWNER@@\",
\"repo\": \"@@REPO@@\"
}"}
}
EOF
# Makes for easier copy/pasting query to/from
# https://cirrus-ci.com/explorer
@ -40,7 +34,6 @@ owner=$(cut -d '/' -f 1 <<<"$GITHUB_REPOSITORY")
repo=$(cut -d '/' -f 2 <<<"$GITHUB_REPOSITORY")
sed -i -r -e "s/@@OWNER@@/$owner/g" -e "s/@@REPO@@/$repo/g" ./artifacts/query_raw.json
echo "::group::Posting GraphQL Query"
# Easier to debug in error-reply when query is compacted
tr -d '\n' < ./artifacts/query_raw.json | tr -s ' ' | tee ./artifacts/query.json | \
jq --indent 4 --color-output .
@ -48,21 +41,13 @@ tr -d '\n' < ./artifacts/query_raw.json | tr -s ' ' | tee ./artifacts/query.json
if grep -q '@@' ./artifacts/query.json; then
err "Found unreplaced substitution token in raw query JSON"
fi
curl \
--request POST \
--silent \
--location \
--header 'content-type: application/json' \
--url 'https://api.cirrus-ci.com/graphql' \
--data @./artifacts/query.json \
--output ./artifacts/reply.json
echo "::endgroup::"
echo "::group::Received GraphQL Reply"
jq --indent 4 --color-output . <./artifacts/reply.json || \
cat ./artifacts/reply.json
echo "::endgroup::"
# The query should never ever return an empty-list, unless there are no cirrus-cron
# jobs defined for the repository.  In that case, this monitoring script shouldn't
# be running anyway.
filt_head='.data.ownerRepository.cronSettings'
# Hand gql the *contents* of the compacted query file as its first argument;
# the file must be read, not executed.  gql returns non-zero when the reply
# is empty, null, unparsable or does not match the filter.
gql "$(<./artifacts/query.json)" "$filt_head" > ./artifacts/reply.json
# e.g. reply.json
# {
# "data": {
@ -87,22 +72,8 @@ echo "::endgroup::"
# "lastInvocationBuild": {
# "id": "5003065549914112",
# "status": "FAILED"
# }
# }
# ]
# }
# }
# }
# This should never ever return an empty-list, unless there are no cirrus-cron
# jobs defined for the repository. In that case, this monitoring script shouldn't
# be running anyway.
filt_head='.data.ownerRepository.cronSettings'
if ! jq -e "$filt_head" ./artifacts/reply.json &> /dev/null
then
# Actual colorized JSON reply was printed above
err "Null/empty result filtering reply with '$filt_head'"
fi
# ...
filt="$filt_head | map(select(.lastInvocationBuild.status==\"FAILED\") | { name:.name, id:.lastInvocationBuild.id} | join(\" \")) | join(\"\n\")"
jq --raw-output "$filt" ./artifacts/reply.json > "$NAME_ID_FILEPATH"
@ -114,5 +85,7 @@ cat "$NAME_ID_FILEPATH"
# Count the whitespace-separated words in the name/ID table
records=$(wc --words "$NAME_ID_FILEPATH" | cut -d ' ' -f 1)
# Always two words per record
failures=$((records/2))
# Set the output of this step.
# Ref: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-an-output-parameter
# N/B: Entries appended to the $GITHUB_OUTPUT file must be in "name=value" form.
echo "failures=$failures" >> "$GITHUB_OUTPUT"
echo "Total failed Cirrus-CI cron builds: $failures"

View file

@ -1,7 +1,75 @@
# Send text to stderr
# Arguments: any number of words; they are joined with single spaces and
#            written as one line.
msg() {
    local IFS=' '
    # Write through the already-open descriptor.  Re-opening /dev/stderr
    # truncates the target whenever stderr is redirected to a regular file.
    # printf (unlike echo) also prints option-like words such as '-n' verbatim.
    printf '%s\n' "$*" >&2
}
# Emit a GitHub Actions error annotation on stderr, then terminate.
# Must be called from top-level of script, not another function.
# Arguments: the error message (all arguments are joined with spaces);
#            a placeholder message is used when none is given.
# Exits:     always, with status 1.
err() {
    # Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-commands-for-github-actions
    # BASH_SOURCE[1] is the calling script and BASH_LINENO[0] is the line in
    # that script where err was invoked, so file and line refer to the same
    # location.  The annotation is written exactly once, and only to stderr,
    # so it can never end up inside a caller's captured stdout.
    msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::${*:-No error message given}"
    exit 1
}
# Using python3 here is a compromise for readability and
# properly handling quote, control and unicode character encoding.
# Arguments: $1 - raw GraphQL query/mutation text
# Outputs:   the query as a single double-quoted JSON string on stdout
#            (no trailing newline)
escape_query() {
    local json_string
    # Assume it's okay to squash repeated whitespaces inside the query.
    # Line breaks are turned into spaces (rather than removed) so tokens on
    # adjacent lines can never fuse together.  N/B: The tr set must be
    # '[:space:]'; an extra pair of brackets adds a literal '[' and ']' to
    # the set, which would also squeeze nested list brackets ('[[' -> '[').
    json_string=$(printf '%s' "$1" | \
                  tr '\r\n' '  ' | \
                  tr --squeeze-repeats '[:space:]' | \
                  python3 -c 'import sys,json; print(json.dumps(sys.stdin.read()))')
    # The $json_string in message is already quoted
    printf '%s' "$json_string"
}
# Given a GraphQL query/mutation, fire it at the API.
# and return the output on stdout.  The optional
# second parameter may contain a jq filter-string.
# When provided, if the GQL result is empty, null,
# fails to parse, or does not match the filter-string,
# non-zero will be returned.
# Arguments: $1 - GraphQL query/mutation text (not yet JSON-escaped)
#            $2 - optional jq filter; defaults to '.' (the whole reply)
# Globals:   SECRET_CIRRUS_API_KEY (read) - sent as the bearer token
# Outputs:   formatted JSON reply on stdout; progress and errors on stderr
# Returns:   0 on success
#            1 the assembled request body is not valid JSON
#            2 the reply did not pass the filter
#            3 the request failed or the reply was empty
gql() {
    local e_query query
    e_query=$(escape_query "$1")
    query="{\"query\": $e_query}"
    local filter
    filter="${2:-.}"
    local output
    local filtered
    local formatted

    msg "::group::Posting GraphQL Query and checking result"
    msg "query: "
    # Display via the already-open stderr; re-opening /dev/stderr would
    # truncate the target when stderr is redirected to a regular file.
    if ! jq -e . <<<"$query" >&2; then
        msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Invalid query JSON: $query"
        # Every exit path must close the log group opened above
        msg "::endgroup::"
        return 1
    fi

    if output=$(curl \
                --request POST \
                --silent \
                --show-error \
                --location \
                --header 'content-type: application/json' \
                --header "Authorization: Bearer $SECRET_CIRRUS_API_KEY" \
                --url 'https://api.cirrus-ci.com/graphql' \
                --data "$query") && [[ -n "$output" ]]; then

        if filtered=$(jq -e "$filter" <<<"$output") && [[ -n "$filtered" ]]; then
            msg "result:"
            # Make debugging easier w/ formatted output
            # to stderr for display, stdout for consumption by caller
            formatted=$(jq --indent 2 . <<<"$output")
            msg "$formatted"
            printf '%s\n' "$formatted"
            msg "::endgroup::"
            return 0
        fi

        msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Query result did not pass filter '$filter': '$output'"
        msg "::endgroup::"
        return 2
    fi

    msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Query failed or result empty: '$output'"
    msg "::endgroup::"
    return 3
}

View file

@ -14,8 +14,7 @@ if [[ -z "$GITHUB_REPOSITORY" ]]; then
elif [[ -z "$GITHUB_WORKFLOW" ]]; then
err $(printf "$_errfmt" "\$GITHUB_WORKFLOW")
elif [[ ! -r "$NAME_ID_FILEPATH" ]]; then
_errfmt="Expecting %s value to be a readable file"
err $(printf "$_errfmt" "\$NAME_ID_FILEPATH")
err "Expecting \$NAME_ID_FILEPATH value ($NAME_ID_FILEPATH) to be a readable file"
fi
mkdir -p artifacts

View file

@ -0,0 +1,112 @@
#!/bin/bash

set -eo pipefail

# Intended to be executed from a github action workflow step.
# Input: File listing space separated failed cron build names and IDs,
#        one "<name> <build id>" pair per line (path in $NAME_ID_FILEPATH).
# Output: Every failed or aborted task of each listed build is re-run
#         through the Cirrus-CI API.  The IDs of all tasks selected for
#         re-run are logged in ./artifacts/rerun_tids.txt
# Required env. vars: SECRET_CIRRUS_API_KEY, NAME_ID_FILEPATH
#
# HOW TO TEST: This script may be manually tested assuming you have
# access to the github containers-org. Cirrus API key.  With that in-hand,
# this script may be manually run by:
# 1. export SECRET_CIRRUS_API_KEY=<value>
# 2. Find an old podman build that failed on `main` or another **branch**.
#    For example, from https://cirrus-ci.com/github/containers/podman/main
#    (pick an old one from the bottom, since re-running it won't affect anybody)
# 3. Create a temp. file, like /tmp/fail with a single line, of the form:
#    <branch> <cirrus build id number>
# 4. export NAME_ID_FILEPATH=/tmp/fail
# 5. execute this script, and refresh the build in the WebUI, all unsuccessful
#    tasks should change status to running or scheduled.  Note: some later
#    tasks may remain red as they wait for dependencies to run and pass.
# 6. After each run, cleanup with 'rm -rf ./artifacts'
#    (unless you want to examine them)

source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"

_errfmt="Expecting %s value to not be empty"
# N/B: The printf output is quoted so err receives the message as one argument
if [[ -z "$SECRET_CIRRUS_API_KEY" ]]; then
    err "$(printf "$_errfmt" "\$SECRET_CIRRUS_API_KEY")"
elif [[ ! -r "$NAME_ID_FILEPATH" ]]; then  # output from cron_failures.sh
    err "$(printf "Expecting %s value to be a readable file" "\$NAME_ID_FILEPATH")"
fi

mkdir -p artifacts
# Start with an empty log, so the file exists even when nothing is re-run
truncate -s 0 ./artifacts/rerun_tids.txt

declare -a rerun_tasks

# The table is read on a dedicated descriptor so nothing inside the loop can
# consume it through stdin, and so the loop body runs in this shell instead
# of a pipeline child.  The '|| [[ -n ... ]]' part keeps a final record that
# lacks a trailing newline from being dropped.
while read -r NAME BID <&3 || [[ -n "$NAME" ]]; do
    if [[ -z "$NAME" ]]; then
        err "$(printf "$_errfmt" "\$NAME")"
    elif [[ -z "$BID" ]]; then
        err "$(printf "$_errfmt" "\$BID")"
    fi

    id_status_q="
        query {
          build(id: \"$BID\") {
            tasks {
              id,
              status
            }
          }
        }
    "
    task_id_status=$(gql "$id_status_q" '.data.build.tasks[0]')
    # Expected query result like:
    # {
    #   "data": {
    #     "build": {
    #       "tasks": [
    #         {
    #           "id": "6321184690667520",
    #           "status": "COMPLETED"
    #         },
    # ...

    # One "<task id> <status>" line per task.  With 'jq -e' and 'set -e',
    # a reply that cannot be processed stops the script here.
    task_lines=$(jq -r -e '.data.build.tasks[] | join(" ")' <<<"$task_id_status")

    # Only tasks belonging to *this* build may be re-run; carrying IDs over
    # from a previously processed build would re-run its tasks a second time.
    rerun_tasks=()

    msg "::group::Selecting failed/aborted tasks to re-run"
    while read -r TID STATUS; do
        if [[ -z "$TID" ]] || [[ -z "$STATUS" ]]; then
            # assume empty line and/or end of file
            msg "Skipping TID '$TID' with status '$STATUS'"
            continue
        # Failed task dependencies will have 'aborted' status
        elif [[ "$STATUS" == "FAILED" ]] || [[ "$STATUS" == "ABORTED" ]]; then
            msg "Rerunning build $BID task $TID"
            rerun_tasks+=("$TID")
            # Keep a record of every selected task among the artifacts
            echo "$TID" >> ./artifacts/rerun_tids.txt
        fi
    done <<<"$task_lines"
    msg "::endgroup::"

    if [[ "${#rerun_tasks[@]}" -eq 0 ]]; then
        msg "No tasks to re-run for build $BID"
        continue
    fi

    msg "::warning::Rerunning ${#rerun_tasks[@]} tasks for build $BID"
    # Check-value returned if the gql call was successful
    canary=$(uuidgen)
    # Quote every ID and strip the final ',' (would be invalid JSON)
    printf -v quoted_ids '"%s",' "${rerun_tasks[@]}"
    task_ids="[${quoted_ids%,}]"
    rerun_m="
        mutation {
          batchReRun(input: {
            clientMutationId: \"$canary\",
            taskIds: $task_ids
            }
          ) {
            clientMutationId
          }
        }
    "
    filter='.data.batchReRun.clientMutationId'
    result=$(gql "$rerun_m" "$filter")
    if [[ $(jq -r -e "$filter" <<<"$result") != "$canary" ]]; then
        err "Attempt to re-run tasks for build $BID failed: ${rerun_tasks[*]}"
    fi
done 3< "$NAME_ID_FILEPATH"

61
.github/workflows/rerun_cirrus_cron.yml vendored Normal file
View file

@ -0,0 +1,61 @@
---

# Format Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-syntax-for-github-actions

# Required to un-FUBAR default ${{github.workflow}} value
name: rerun_cirrus_cron

on:
    # Note: This only applies to the main branch.
    schedule:
        # N/B: This should fire about an hour prior to check_cirrus_cron
        # so the re-runs have a chance to complete.
        - cron: '59 22 * * 1-5'
    # Debug: Allow triggering job manually in github-actions WebUI
    workflow_dispatch: {}

env:
    # Debug-mode can reveal secrets, only enable by a secret value.
    # Ref: https://help.github.com/en/actions/configuring-and-managing-workflows/managing-a-workflow-run#enabling-step-debug-logging
    ACTIONS_STEP_DEBUG: '${{ secrets.ACTIONS_STEP_DEBUG }}'
    # CSV listing of e-mail addresses for delivery failure or error notices
    RCPTCSV: rh.container.bot@gmail.com,podman-monitor@lists.podman.io
    # Filename for table of cron-name to build-id data
    # (must be in $GITHUB_WORKSPACE/artifacts/)
    NAME_ID_FILEPATH: './artifacts/name_id.txt'

permissions:
    contents: read

jobs:
    cron_failures:
        runs-on: ubuntu-latest
        steps:
            - uses: actions/checkout@629c2de402a417ea7690ca6ce3f33229e27606a5 # v2
              with:
                  persist-credentials: false

            - name: Get failed cron names and Build IDs
              id: cron
              run: './.github/actions/check_cirrus_cron/cron_failures.sh'

            # Only bother the Cirrus-CI API when there is something to re-run
            - if: steps.cron.outputs.failures > 0
              name: Re-run failed and aborted tasks
              shell: bash
              env:
                  # rerun_failed_tasks.sh refuses to start without this value.
                  # NOTE(review): assumes the repository secret carries the
                  # same name as the variable - confirm in the repo. settings.
                  SECRET_CIRRUS_API_KEY: ${{ secrets.SECRET_CIRRUS_API_KEY }}
              run: './.github/actions/check_cirrus_cron/rerun_failed_tasks.sh'

            # N/B: A step may specify either `run` or `uses`, never both,
            # so the artifact upload must be its own step.
            - uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2 # v2
              with:
                  name: ${{ github.job }}_artifacts
                  path: artifacts/*

            - if: failure()
              name: Send error notification e-mail
              uses: dawidd6/action-send-mail@a80d851dc950256421f1d1d735a2dc1ef314ac8f # v2.2.2
              with:
                  server_address: ${{secrets.ACTION_MAIL_SERVER}}
                  server_port: 465
                  username: ${{secrets.ACTION_MAIL_USERNAME}}
                  password: ${{secrets.ACTION_MAIL_PASSWORD}}
                  subject: Github workflow error on ${{github.repository}}
                  to: ${{env.RCPTCSV}}
                  from: ${{secrets.ACTION_MAIL_SENDER}}
                  body: "Job failed: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}"