Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,9 +105,17 @@ You also need the agent to have access to the following defined environment vari
* `BUILDKITE_PLUGIN_S3_CACHE_PREFIX`: optional prefix to use for the cache within the bucket
* `BUILDKITE_PLUGIN_S3_CACHE_ENDPOINT`: optional S3 custom endpoint to use
* `BUILDKITE_PLUGIN_S3_CACHE_PROFILE`: optional profile (that [must exist in the agent's config](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-role.html)) to use for CLI calls
* `BUILDKITE_PLUGIN_S3_CACHE_DOWNLOAD_RETRIES`: optional maximum number of download attempts when restoring an object (default `3`). Any failed download is retried after a one-second pause until the attempts are exhausted; this covers the case where a concurrent save overwrites the object mid-transfer (see [Concurrent saves](#concurrent-saves)).

Setting the `BUILDKITE_PLUGIN_S3_CACHE_ONLY_SHOW_ERRORS` environment variable will reduce logging of file operations towards S3.

##### Concurrent saves

Cache keys are content-addressed, so multiple agents running the same step in parallel compute the **same** S3 key. To keep these races safe:

* **Saving** a single object uses a conditional create (`aws s3api put-object --if-none-match '*'`). The first writer wins and later writers get a `PreconditionFailed`, which is treated as success — the object is never overwritten, so its `ETag` stays stable. If the endpoint or CLI does not support conditional writes (or the object exceeds the single-`PUT` limit), the plugin falls back to a normal copy.
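* **Restoring** retries a failed download until `BUILDKITE_PLUGIN_S3_CACHE_DOWNLOAD_RETRIES` attempts have been made (any failure triggers a retry; the error text is not inspected). The failure this is aimed at is the AWS CLI's `did not match expected ETag` error, which happens when an object is overwritten while a multipart download is in flight; because the key is content-addressed, the retried download returns identical contents.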


#### Example

Expand Down
66 changes: 60 additions & 6 deletions backends/cache_s3
Original file line number Diff line number Diff line change
Expand Up @@ -72,19 +72,73 @@ restore_cache() {
sync=true
fi

# can not use sync as it may be a single file
s3_copy "s3://${BUILDKITE_PLUGIN_S3_CACHE_BUCKET}/${key}" "${to}" "${sync}"
# A concurrent save can overwrite the object while it is being downloaded. The
# AWS CLI guards multipart downloads with an `If-Match` on the ETag recorded by
# its initial HEAD, so an overwrite mid-download fails with "did not match
# expected ETag". Because cache keys are content-addressed, the rewritten
# object has identical contents, so retrying resolves the transient failure.
local attempts="${BUILDKITE_PLUGIN_S3_CACHE_DOWNLOAD_RETRIES:-3}"
local attempt=1
while true; do
# can not use sync as it may be a single file
if s3_copy "s3://${BUILDKITE_PLUGIN_S3_CACHE_BUCKET}/${key}" "${to}" "${sync}"; then
return 0
fi

if [ "${attempt}" -ge "${attempts}" ]; then
return 1
fi

echo "~~~ :warning: Cache download failed (attempt ${attempt}/${attempts}), retrying (object may have been rewritten by a concurrent save)" >&2
attempt=$((attempt + 1))
sleep 1
done
}

#######################################
# Upload a cache entry to the S3 bucket.
# Globals:
#   BUILDKITE_PLUGIN_S3_CACHE_BUCKET (read) - destination bucket
#   BUILDKITE_PLUGIN_CACHE_FORCE     (read) - any value other than 'false'
#                                             requests an overwrite
# Arguments:
#   $1 - cache identifier, turned into the object key by build_key
#   $2 - local path to upload (single file, or a folder)
# Outputs:
#   A notice on stdout when a concurrent job already stored the object;
#   the failed put-object output and a warning on stderr before falling back.
# Returns:
#   0 when the object was stored, or was already stored by another job;
#   otherwise the status of the s3_copy that performed the upload.
#######################################
save_cache() {
  local to="$1"
  local from="$2"
  local key
  key="$(build_key "${to}")"
  local dest="s3://${BUILDKITE_PLUGIN_S3_CACHE_BUCKET}/${key}"

  # Folders must be uploaded with `sync`; conditional writes only apply to a
  # single object, so keep the plain copy behaviour for anything that is not a
  # regular file.
  if [ ! -f "${from}" ]; then
    s3_copy "${from}" "${dest}" 'true'
    return $?
  fi

  # When `force` is requested the caller explicitly wants to replace the
  # object, so overwrite it instead of using a conditional create (which would
  # refuse).
  if [ "${BUILDKITE_PLUGIN_CACHE_FORCE:-false}" != 'false' ]; then
    s3_copy "${from}" "${dest}" 'false'
    return $?
  fi

  # Single file: store it with a conditional create (`--if-none-match '*'`) so
  # two jobs racing to save the same content-addressed key cannot overwrite
  # each other (first write wins). Without this, a later overwrite changes the
  # ETag and breaks any download already in flight.
  # stdout and stderr are both captured so a successful save prints nothing.
  local output
  if output="$(aws_cmd s3api put-object \
    --bucket "${BUILDKITE_PLUGIN_S3_CACHE_BUCKET}" \
    --key "${key}" \
    --if-none-match '*' \
    --body "${from}" 2>&1)"; then
    return 0
  fi

  # A `PreconditionFailed` response means another job already stored the
  # object, which is success for us.
  case "${output}" in
    *PreconditionFailed* | *pre-conditions*)
      echo "Cache already saved by a concurrent job, skipping"
      return 0
      ;;
  esac

  # The conditional create is unsupported (older CLI / S3-compatible endpoint)
  # or the object is too large for a single PUT: fall back to a normal copy.
  printf '%s\n' "${output}" >&2
  echo "~~~ :warning: Conditional cache upload unavailable, falling back to copy" >&2
  s3_copy "${from}" "${dest}" 'false'
}

exists_cache() {
Expand Down
99 changes: 98 additions & 1 deletion tests/cache_s3.bats
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ setup() {
mkdir "${BATS_TEST_TMPDIR}/s3-cache"
stub aws \
"echo null" \
"s3 cp \* \* : ln -s \$3 $BATS_TEST_TMPDIR/s3-cache/\$(echo \$4 | md5sum | cut -c-32)" \
"s3api put-object --bucket \* --key \* --if-none-match \* --body \* : cp \${10} $BATS_TEST_TMPDIR/s3-cache/\$(echo s3://\$4/\$6 | md5sum | cut -c-32)" \
"echo 'exists'" \
's3api head-object --bucket \* --key \* : true ' \
"s3 cp \* \* : cp -r $BATS_TEST_TMPDIR/s3-cache/\$(echo \$3 | md5sum | cut -c-32) \$4"
Expand Down Expand Up @@ -214,3 +214,100 @@ setup() {
rm -rf "${BATS_TEST_TMPDIR}/new-folder"
rm -rf "${BATS_TEST_TMPDIR}/other-folder"
}

@test 'Single file save uses a conditional put-object' {
# Happy path: saving a regular file must go through
# `s3api put-object --if-none-match` rather than `s3 cp`.
touch "${BATS_TEST_TMPDIR}/file-to-save"

# Stub plan: a single conditional put-object call that prints "stored".
# The bucket name `my-bucket` is presumably exported by setup() (not visible here).
stub aws \
's3api put-object --bucket my-bucket --key \* --if-none-match \* --body \* : echo stored'

run "${PWD}/backends/cache_s3" save my-key "${BATS_TEST_TMPDIR}/file-to-save"

assert_success
# The stub's "stored" text must not surface: a successful save prints nothing.
assert_output ''

unstub aws
rm -f "${BATS_TEST_TMPDIR}/file-to-save"
}

@test 'Single file save with force overwrites instead of conditional create' {
# With force enabled the backend must overwrite via `s3 cp` and never attempt
# the conditional put-object.
export BUILDKITE_PLUGIN_CACHE_FORCE=true
touch "${BATS_TEST_TMPDIR}/file-to-save"

# Stub plan contains only an `s3 cp` call; no put-object entry is planned.
stub aws \
's3 cp \* \* : echo copied'

run "${PWD}/backends/cache_s3" save my-key "${BATS_TEST_TMPDIR}/file-to-save"

assert_success

unstub aws
rm -f "${BATS_TEST_TMPDIR}/file-to-save"
unset BUILDKITE_PLUGIN_CACHE_FORCE
}

@test 'Single file save treats PreconditionFailed as success (concurrent writer won)' {
# Simulates losing the race: the conditional put fails because the object
# already exists, which the backend must report as a successful save.
touch "${BATS_TEST_TMPDIR}/file-to-save"

# The stub writes the PreconditionFailed error to stderr and exits non-zero.
# No `s3 cp` call is planned, so no fallback copy is expected.
stub aws \
's3api put-object --bucket my-bucket --key \* --if-none-match \* --body \* : echo "An error occurred (PreconditionFailed) when calling the PutObject operation" >&2; exit 1'

run "${PWD}/backends/cache_s3" save my-key "${BATS_TEST_TMPDIR}/file-to-save"

assert_success
assert_output --partial 'Cache already saved by a concurrent job'

unstub aws
rm -f "${BATS_TEST_TMPDIR}/file-to-save"
}

@test 'Single file save falls back to copy when conditional put is unsupported' {
# Simulates a CLI that rejects `--if-none-match`: the backend must warn and
# then upload with a plain `s3 cp`.
touch "${BATS_TEST_TMPDIR}/file-to-save"

# Stub plan, in call order: the failing conditional put (error on stderr,
# exit 255), followed by the fallback copy.
stub aws \
's3api put-object --bucket my-bucket --key \* --if-none-match \* --body \* : echo "Unknown options: --if-none-match" >&2; exit 255' \
's3 cp \* \* : echo copied'

run "${PWD}/backends/cache_s3" save my-key "${BATS_TEST_TMPDIR}/file-to-save"

assert_success
assert_output --partial 'falling back to copy'

unstub aws
rm -f "${BATS_TEST_TMPDIR}/file-to-save"
}

@test 'Restore retries when a download fails then succeeds' {
# A first download failure followed by a success must yield an overall
# success, with the retry warning present in the output.
export BUILDKITE_PLUGIN_S3_CACHE_DOWNLOAD_RETRIES=3

# Replace `sleep` so the one-second pause between attempts does not slow the
# test; the plan expects a single `sleep 1` (one retry).
stub sleep '1 : true'
# Stub plan, in call order: head-object, a failing `s3 cp` that reports the
# ETag mismatch, then a successful `s3 cp`.
stub aws \
's3api head-object --bucket \* --key \* : true ' \
's3 cp \* \* : echo "did not match expected ETag" >&2; exit 1' \
's3 cp \* \* : echo restored'

run "${PWD}/backends/cache_s3" get my-key "${BATS_TEST_TMPDIR}/dest"

assert_success
assert_output --partial 'retrying'

unstub aws
unstub sleep
}

@test 'Restore fails after exhausting retries' {
# With the attempt limit set to 2, two consecutive download failures must make
# the restore fail.
export BUILDKITE_PLUGIN_S3_CACHE_DOWNLOAD_RETRIES=2

# Only one pause is expected: it happens between attempt 1 and attempt 2, and
# no sleep follows the final failed attempt.
stub sleep '1 : true'
# Stub plan, in call order: head-object, then two failing `s3 cp` calls.
stub aws \
's3api head-object --bucket \* --key \* : true ' \
's3 cp \* \* : echo "did not match expected ETag" >&2; exit 1' \
's3 cp \* \* : echo "did not match expected ETag" >&2; exit 1'

run "${PWD}/backends/cache_s3" get my-key "${BATS_TEST_TMPDIR}/dest"

assert_failure

unstub aws
unstub sleep
}