mirror of
https://github.com/containers/podman
synced 2024-10-19 08:44:11 +00:00
GHA: Auto. re-run failed cirrus-cron builds once
With a seemingly ever-growing list of cirrus-cron jobs running on release branches, there are bound to be some hiccups. Sometimes a lot of them. Normally, any failure requires a human to eyeball the logs and/or manually re-run the job to see if it was simply a flake. This doesn't take long, but it can be distracting and compounds over time. Attempt to alleviate some maintainer burden by using a new GitHub Actions workflow to perform **one** automatic re-run of any failed builds. This task is scheduled an hour prior to a second failure check and the generation of a notification e-mail for review. Note: If there are no failures, due to the automatic re-run or luck, no e-mail is generated. If this proves useful in this repo, I intend to re-use this workflow for other repos' cirrus-cron jobs. Signed-off-by: Chris Evich <cevich@redhat.com>
This commit is contained in:
parent
3a85d537b6
commit
35523d560a
|
@ -8,31 +8,25 @@ set -eo pipefail
|
|||
source $(dirname "${BASH_SOURCE[0]}")/lib.sh
|
||||
|
||||
_errfmt="Expecting %s value to not be empty"
|
||||
if [[ -z "$GITHUB_REPOSITORY" ]]; then
|
||||
if [[ -z "$GITHUB_REPOSITORY" ]]; then # <owner>/<repo>
|
||||
err $(printf "$_errfmt" "\$GITHUB_REPOSITORY")
|
||||
elif [[ -z "$NAME_ID_FILEPATH" ]]; then
|
||||
elif [[ -z "$NAME_ID_FILEPATH" ]]; then # output filepath
|
||||
err $(printf "$_errfmt" "\$NAME_ID_FILEPATH")
|
||||
fi
|
||||
|
||||
mkdir -p artifacts
|
||||
cat > ./artifacts/query_raw.json << "EOF"
|
||||
{"query":"
|
||||
query CronNameStatus($owner: String!, $repo: String!) {
|
||||
ownerRepository(platform: \"LINUX\", owner: $owner, name: $repo) {
|
||||
cronSettings {
|
||||
name
|
||||
lastInvocationBuild {
|
||||
id
|
||||
status
|
||||
}
|
||||
query {
|
||||
ownerRepository(platform: "LINUX", owner: "@@OWNER@@", name: "@@REPO@@") {
|
||||
cronSettings {
|
||||
name
|
||||
lastInvocationBuild {
|
||||
id
|
||||
status
|
||||
}
|
||||
}
|
||||
}
|
||||
",
|
||||
"variables":"{
|
||||
\"owner\": \"@@OWNER@@\",
|
||||
\"repo\": \"@@REPO@@\"
|
||||
}"}
|
||||
}
|
||||
EOF
|
||||
# Makes for easier copy/pasting query to/from
|
||||
# https://cirrus-ci.com/explorer
|
||||
|
@ -40,7 +34,6 @@ owner=$(cut -d '/' -f 1 <<<"$GITHUB_REPOSITORY")
|
|||
repo=$(cut -d '/' -f 2 <<<"$GITHUB_REPOSITORY")
|
||||
sed -i -r -e "s/@@OWNER@@/$owner/g" -e "s/@@REPO@@/$repo/g" ./artifacts/query_raw.json
|
||||
|
||||
echo "::group::Posting GraphQL Query"
|
||||
# Easier to debug in error-reply when query is compacted.
# N/B: gql() wraps the query text into a JSON document itself, so this file
# is expected to hold a raw GraphQL query (TODO confirm against the heredoc
# above).  Raw GraphQL must not be piped through jq: the parse failure would
# abort the whole script under 'set -eo pipefail'.
tr -d '\n' < ./artifacts/query_raw.json | tr -s ' ' > ./artifacts/query.json
|
||||
|
@ -48,21 +41,13 @@ tr -d '\n' < ./artifacts/query_raw.json | tr -s ' ' | tee ./artifacts/query.json
|
|||
if grep -q '@@' ./artifacts/query.json; then
|
||||
err "Found unreplaced substitution token in raw query JSON"
|
||||
fi
|
||||
curl \
|
||||
--request POST \
|
||||
--silent \
|
||||
--location \
|
||||
--header 'content-type: application/json' \
|
||||
--url 'https://api.cirrus-ci.com/graphql' \
|
||||
--data @./artifacts/query.json \
|
||||
--output ./artifacts/reply.json
|
||||
echo "::endgroup::"
|
||||
|
||||
echo "::group::Received GraphQL Reply"
|
||||
jq --indent 4 --color-output . <./artifacts/reply.json || \
|
||||
cat ./artifacts/reply.json
|
||||
echo "::endgroup::"
|
||||
# The query should never ever return an empty-list, unless there are no cirrus-cron
|
||||
# jobs defined for the repository. In that case, this monitoring script shouldn't
|
||||
# be running anyway.
|
||||
filt_head='.data.ownerRepository.cronSettings'
|
||||
|
||||
# Hand the query *text* to gql(): "$(<file)" expands to the file's contents,
# whereas "$(file)" would attempt to execute the file as a command.
gql "$(<./artifacts/query.json)" "$filt_head" > ./artifacts/reply.json
|
||||
# e.x. reply.json
|
||||
# {
|
||||
# "data": {
|
||||
|
@ -87,22 +72,8 @@ echo "::endgroup::"
|
|||
# "lastInvocationBuild": {
|
||||
# "id": "5003065549914112",
|
||||
# "status": "FAILED"
|
||||
# }
|
||||
# }
|
||||
# ]
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
|
||||
# This should never ever return an empty-list, unless there are no cirrus-cron
|
||||
# jobs defined for the repository. In that case, this monitoring script shouldn't
|
||||
# be running anyway.
|
||||
filt_head='.data.ownerRepository.cronSettings'
|
||||
if ! jq -e "$filt_head" ./artifacts/reply.json &> /dev/null
|
||||
then
|
||||
# Actual colorized JSON reply was printed above
|
||||
err "Null/empty result filtering reply with '$filt_head'"
|
||||
fi
|
||||
# ...
|
||||
|
||||
filt="$filt_head | map(select(.lastInvocationBuild.status==\"FAILED\") | { name:.name, id:.lastInvocationBuild.id} | join(\" \")) | join(\"\n\")"
|
||||
jq --raw-output "$filt" ./artifacts/reply.json > "$NAME_ID_FILEPATH"
|
||||
|
@ -114,5 +85,7 @@ cat "$NAME_ID_FILEPATH"
|
|||
records=$(wc --words "$NAME_ID_FILEPATH" | cut -d ' ' -f 1)
|
||||
# Always two words per record
|
||||
failures=$((records/2))
|
||||
# Set the output of this step.
|
||||
# Ref: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-an-output-parameter
|
||||
# Records appended to the $GITHUB_OUTPUT file must use the 'name=value' form.
echo "failures=$failures" >> "$GITHUB_OUTPUT"
|
||||
echo "Total failed Cirrus-CI cron builds: $failures"
|
||||
|
|
70
.github/actions/check_cirrus_cron/lib.sh
vendored
70
.github/actions/check_cirrus_cron/lib.sh
vendored
|
@ -1,7 +1,75 @@
|
|||
|
||||
|
||||
# Send text to stderr
# Arguments: any number of words, printed space-separated on one line.
msg() {
    # Duplicate the already-open stderr descriptor instead of re-opening the
    # /dev/stderr path: re-opening truncates the target when stderr has been
    # redirected to a regular file, and can fail outright when the path is
    # not re-openable by the current user.
    echo "$@" >&2
}
|
||||
|
||||
# Emit a github-actions error annotation on stderr, then exit non-zero.
# Must be called from top-level of script, not another function.
# Arguments: the error message; multiple words/arguments are re-joined.
err() {
    # Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-commands-for-github-actions
    # Callers commonly pass an unquoted $(printf ...), so the message can
    # arrive split across several arguments; "$*" puts it back together.
    # BASH_SOURCE[1] / BASH_LINENO[0] are the file and line of the caller.
    # The annotation is emitted exactly once, via msg (stderr), so it never
    # pollutes stdout that a caller may be capturing.
    msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::${*:-No error message given}"
    exit 1
}
|
||||
|
||||
# Using python3 here is a compromise for readability and
# properly handling quote, control and unicode character encoding.
# Arguments: $1 - raw GraphQL query/mutation text.
# Output: the query encoded as one JSON string literal (including the
# surrounding double-quotes) on stdout, without a trailing newline.
escape_query() {
    local json_string
    # Assume it's okay to squash repeated whitespaces inside the query.
    # Line-breaks are turned into spaces (rather than deleted) so tokens on
    # adjacent lines can never be fused together.  N/B: The squeeze set is
    # '[:space:]' - wrapping it in a second pair of brackets would add the
    # literal '[' and ']' characters to the set and mangle nested lists.
    json_string=$(printf '%s' "$1" | \
                  tr '\r\n' '  ' | \
                  tr --squeeze-repeats '[:space:]' | \
                  python3 -c 'import sys,json; print(json.dumps(sys.stdin.read()))')
    # The $json_string in message is already quoted
    echo -n "$json_string"
}
|
||||
|
||||
# Post a GraphQL query/mutation to the Cirrus-CI API and print the
# (re-indented) JSON reply on stdout.  Progress and diagnostics go to stderr.
# Arguments:
#   $1 - raw GraphQL query or mutation text
#   $2 - (optional) jq filter-string the reply must satisfy
# Returns:
#   0 - reply received and it passed the filter
#   1 - the wrapped query is not valid JSON
#   2 - reply is null/false, unparsable, or did not pass the filter
#   3 - the request itself failed, or the reply was empty
gql() {
    local escaped payload jq_filter reply matched
    local -a curl_args

    escaped=$(escape_query "$1")
    payload="{\"query\": $escaped}"
    jq_filter="$2"

    msg "::group::Posting GraphQL Query and checking result"
    msg "query: "
    # Doubles as a sanity-check and a pretty-printer for the log
    if ! jq -e . <<<"$payload" > /dev/stderr; then
        msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Invalid query JSON: $payload"
        return 1
    fi

    curl_args=(
        --request POST
        --silent
        --show-error
        --location
        --header 'content-type: application/json'
        --header "Authorization: Bearer $SECRET_CIRRUS_API_KEY"
        --url 'https://api.cirrus-ci.com/graphql'
        --data "$payload"
    )

    # Bail out early when the transfer failed or nothing came back
    if ! reply=$(curl "${curl_args[@]}") || [[ -z "$reply" ]]; then
        msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Query failed or result empty: '$reply'"
        msg "::endgroup::"
        return 3
    fi

    # jq -e exits non-zero when the filter yields null/false or fails
    if ! matched=$(jq -e "$jq_filter" <<<"$reply") || [[ -z "$matched" ]]; then
        msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Query result did not pass filter '$2': '$reply'"
        msg "::endgroup::"
        return 2
    fi

    msg "result:"
    # Make debugging easier w/ formatted output
    # to stderr for display, stdout for consumption by caller
    jq --indent 2 . <<<"$reply" | tee /dev/stderr
    msg "::endgroup::"
    return 0
}
|
||||
|
|
|
@ -14,8 +14,7 @@ if [[ -z "$GITHUB_REPOSITORY" ]]; then
|
|||
elif [[ -z "$GITHUB_WORKFLOW" ]]; then
|
||||
err $(printf "$_errfmt" "\$GITHUB_WORKFLOW")
|
||||
elif [[ ! -r "$NAME_ID_FILEPATH" ]]; then
|
||||
_errfmt="Expecting %s value to be a readable file"
|
||||
err $(printf "$_errfmt" "\$NAME_ID_FILEPATH")
|
||||
err "Expecting \$NAME_ID_FILEPATH value ($NAME_ID_FILEPATH) to be a readable file"
|
||||
fi
|
||||
|
||||
mkdir -p artifacts
|
||||
|
|
112
.github/actions/check_cirrus_cron/rerun_failed_tasks.sh
vendored
Executable file
112
.github/actions/check_cirrus_cron/rerun_failed_tasks.sh
vendored
Executable file
|
@ -0,0 +1,112 @@
|
|||
#!/bin/bash

set -eo pipefail

# Intended to be executed from a github action workflow step.
# Input: File listing space separated failed cron build names and IDs
# Output: None on stdout.  ./artifacts/rerun_tids.txt is left holding the
#         task IDs selected for re-run from the last build processed.
#
# HOW TO TEST: This script may be manually tested assuming you have
# access to the github containers-org. Cirrus API key.  With that in-hand,
# this script may be manually run by:
# 1. export SECRET_CIRRUS_API_KEY=<value>
# 2. Find an old podman build that failed on `main` or another **branch**.
#    For example, from https://cirrus-ci.com/github/containers/podman/main
#    (pick an old one from the bottom, since re-running it won't affect anybody)
# 3. Create a temp. file, like /tmp/fail with a single line, of the form:
#    <branch> <cirrus build id number>
# 4. export NAME_ID_FILEPATH=/tmp/fail
# 5. execute this script, and refresh the build in the WebUI, all unsuccessful
#    tasks should change status to running or scheduled.  Note: some later
#    tasks may remain red as they wait for dependencies to run and pass.
# 6. After each run, cleanup with 'rm -rf ./artifacts'
#    (unless you want to examine them)

source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"

_errfmt="Expecting %s value to not be empty"
if [[ -z "$SECRET_CIRRUS_API_KEY" ]]; then
    err "$(printf "$_errfmt" "\$SECRET_CIRRUS_API_KEY")"
elif [[ ! -r "$NAME_ID_FILEPATH" ]]; then  # output from cron_failures.sh
    err "$(printf "Expecting %s value to be a readable file" "\$NAME_ID_FILEPATH")"
fi

mkdir -p artifacts

while read -r NAME BID; do
    # Tolerate blank lines (e.g. a file holding only a newline)
    if [[ -z "$NAME" ]] && [[ -z "$BID" ]]; then
        continue
    elif [[ -z "$NAME" ]]; then
        err "$(printf "$_errfmt" "\$NAME")"
    elif [[ -z "$BID" ]]; then
        err "$(printf "$_errfmt" "\$BID")"
    fi

    # Start every build with an empty list.  Otherwise task IDs selected for
    # a previous build would be submitted again with this build's mutation.
    # This also guarantees the file exists when there are no tasks to re-run.
    truncate -s 0 ./artifacts/rerun_tids.txt

    id_status_q="
        query {
          build(id: \"$BID\") {
            tasks {
              id,
              status
            }
          }
        }
    "
    task_id_status=$(gql "$id_status_q" '.data.build.tasks[0]')
    # Expected query result like:
    # {
    #   "data": {
    #     "build": {
    #       "tasks": [
    #         {
    #           "id": "6321184690667520",
    #           "status": "COMPLETED"
    #         },
    #         ...
    msg "::group::Selecting failed/aborted tasks to re-run"
    jq -r -e '.data.build.tasks[] | join(" ")' <<<"$task_id_status" | \
        while read -r TID STATUS; do
            if [[ -z "$TID" ]] || [[ -z "$STATUS" ]]; then
                # assume empty line and/or end of file
                msg "Skipping TID '$TID' with status '$STATUS'"
                continue
            # Failed task dependencies will have 'aborted' status
            elif [[ "$STATUS" == "FAILED" ]] || [[ "$STATUS" == "ABORTED" ]]; then
                msg "Rerunning build $BID task $TID"
                # Must send result through a file into rerun_tasks array
                # because this section is executing in a child-shell
                echo "$TID" >> ./artifacts/rerun_tids.txt
            fi
        done
    declare -a rerun_tasks
    # -t: don't retain each line's trailing newline in the array elements
    mapfile -t rerun_tasks <./artifacts/rerun_tids.txt
    msg "::endgroup::"

    if [[ "${#rerun_tasks[@]}" -eq 0 ]]; then
        msg "No tasks to re-run for build $BID"
        continue
    fi

    msg "::warning::Rerunning ${#rerun_tasks[@]} tasks for build $BID"
    # Check-value returned if the gql call was successful
    canary=$(uuidgen)
    # Build a JSON list of quoted IDs, stripping the trailing ','
    # (it would be invalid JSON)
    task_ids=$(printf '"%s",' "${rerun_tasks[@]}")
    task_ids="[${task_ids%,}]"
    rerun_m="
        mutation {
          batchReRun(input: {
            clientMutationId: \"$canary\",
            taskIds: $task_ids
            }
          ) {
            clientMutationId
          }
        }
    "
    filter='.data.batchReRun.clientMutationId'
    result=$(gql "$rerun_m" "$filter")
    if [[ $(jq -r -e "$filter" <<<"$result") != "$canary" ]]; then
        err "Attempt to re-run tasks for build $BID failed: ${rerun_tasks[*]}"
    fi
done < "$NAME_ID_FILEPATH"
|
61
.github/workflows/rerun_cirrus_cron.yml
vendored
Normal file
61
.github/workflows/rerun_cirrus_cron.yml
vendored
Normal file
|
@ -0,0 +1,61 @@
|
|||
---

# Format Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-syntax-for-github-actions

# Required to un-FUBAR default ${{github.workflow}} value
name: rerun_cirrus_cron

on:
  # Note: This only applies to the main branch.
  schedule:
    # N/B: This should fire about an hour prior to check_cirrus_cron
    # so the re-runs have a chance to complete.
    - cron: '59 22 * * 1-5'
  # Debug: Allow triggering job manually in github-actions WebUI
  workflow_dispatch: {}

env:
  # Debug-mode can reveal secrets, only enable by a secret value.
  # Ref: https://help.github.com/en/actions/configuring-and-managing-workflows/managing-a-workflow-run#enabling-step-debug-logging
  ACTIONS_STEP_DEBUG: '${{ secrets.ACTIONS_STEP_DEBUG }}'
  # CSV listing of e-mail addresses for delivery failure or error notices
  RCPTCSV: rh.container.bot@gmail.com,podman-monitor@lists.podman.io
  # Filename for table of cron-name to build-id data
  # (must be in $GITHUB_WORKSPACE/artifacts/)
  NAME_ID_FILEPATH: './artifacts/name_id.txt'

permissions:
  contents: read

jobs:
  cron_failures:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@629c2de402a417ea7690ca6ce3f33229e27606a5  # v2
        with:
          persist-credentials: false

      - name: Get failed cron names and Build IDs
        id: cron
        run: './.github/actions/check_cirrus_cron/cron_failures.sh'

      # A step may carry either `run` or `uses`, never both; the re-run
      # script and the artifact upload are therefore separate steps.
      - name: Re-run failed/aborted tasks of failed cron builds
        if: steps.cron.outputs.failures > 0
        shell: bash
        env:
          # rerun_failed_tasks.sh refuses to start without this value.
          # Assumes a repository secret of the same name - TODO confirm.
          SECRET_CIRRUS_API_KEY: '${{ secrets.SECRET_CIRRUS_API_KEY }}'
        run: './.github/actions/check_cirrus_cron/rerun_failed_tasks.sh'

      - uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2  # v2
        with:
          name: '${{ github.job }}_artifacts'
          path: artifacts/*

      - if: failure()
        name: Send error notification e-mail
        uses: dawidd6/action-send-mail@a80d851dc950256421f1d1d735a2dc1ef314ac8f  # v2.2.2
        with:
          server_address: '${{secrets.ACTION_MAIL_SERVER}}'
          server_port: 465
          username: '${{secrets.ACTION_MAIL_USERNAME}}'
          password: '${{secrets.ACTION_MAIL_PASSWORD}}'
          subject: Github workflow error on ${{github.repository}}
          to: '${{env.RCPTCSV}}'
          from: '${{secrets.ACTION_MAIL_SENDER}}'
          body: "Job failed: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}"
|
Loading…
Reference in a new issue