Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .buildkite/configs/cleanup.aws.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ accounts:
- name: "${ACCOUNT_PROJECT}"
driver: "aws"
options:
key: '${ACCOUNT_KEY}'
secret: '${ACCOUNT_SECRET}'
key: '${AWS_ACCESS_KEY_ID}'
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To help with using the existing env variables created by https://github.com/elastic/oblt-aws-auth-buildkite-plugin

secret: '${AWS_SECRET_ACCESS_KEY}'

scanners:
- account_name: "${ACCOUNT_PROJECT}"
Expand Down
4 changes: 3 additions & 1 deletion .buildkite/hooks/pre-command
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ export SERVERLESS=${SERVERLESS:-"false"}
WORKSPACE=$(pwd)
export WORKSPACE

AWS_SERVICE_ACCOUNT_SECRET_PATH=kv/ci-shared/platform-ingest/aws_ingest_ci
PRIVATE_CI_GCS_CREDENTIALS_PATH=kv/ci-shared/platform-ingest/gcp-platform-ingest-ci-service-account

EC_TOKEN_PATH=kv/ci-shared/platform-ingest/platform-ingest-ec-qa
Expand Down Expand Up @@ -91,7 +90,10 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-package-test-with-integrations" &&
export GITHUB_TOKEN=$VAULT_GITHUB_TOKEN
fi

# NOTE: this approach is deprecated and will be removed in the near future.
# see https://github.com/elastic/observability-robots/issues/2771 (only accessible by Elastic employees)
if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-package-cloud-cleanup" && "$BUILDKITE_STEP_KEY" == "cloud-cleanup" ]]; then
AWS_SERVICE_ACCOUNT_SECRET_PATH=kv/ci-shared/platform-ingest/aws_ingest_ci
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's define the variable where it's used

ELASTIC_PACKAGE_AWS_SECRET_KEY=$(retry 5 vault kv get -field secret_key ${AWS_SERVICE_ACCOUNT_SECRET_PATH})
export ELASTIC_PACKAGE_AWS_SECRET_KEY
ELASTIC_PACKAGE_AWS_ACCESS_KEY=$(retry 5 vault kv get -field access_key ${AWS_SERVICE_ACCOUNT_SECRET_PATH})
Expand Down
14 changes: 13 additions & 1 deletion .buildkite/pipeline.cloud-cleanup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,19 @@ steps:
if: "build.source == 'ui'"
allow_dependency_failure: false

- label: "Cloud Cleanup"
- label: "Cloud Cleanup OIDC"
key: "cloud-cleanup-oidc"
command: ".buildkite/scripts/cloud-cleanup-oidc.sh"
env:
RESOURCE_RETENTION_PERIOD: "24 hours"
DRY_RUN: "${DRY_RUN:-true}"
agents:
provider: "gcp" # this step requires docker
plugins:
- elastic/oblt-aws-auth#v0.1.0:
duration: 3600 # seconds

- label: "Cloud Cleanup (deprecated)"
key: "cloud-cleanup"
command: ".buildkite/scripts/cloud-cleanup.sh"
env:
Expand Down
229 changes: 229 additions & 0 deletions .buildkite/scripts/cloud-cleanup-oidc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
#!/usr/bin/env bash
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I copied it from .buildkite/scripts/cloud-cleanup.sh and added the support for OIDC.


source .buildkite/scripts/install_deps.sh

# EXIT handler: removes the temporary report files from the workspace and
# finishes with the status the script was already exiting with.
# Globals: WORKSPACE, AWS_RESOURCES_FILE, AWS_REDSHIFT_RESOURCES_FILE (read)
cleanup_cloud_stale() {
  # Has to be the very first statement: $? still holds the status of the
  # command that ran right before this handler was invoked.
  local original_status=$?
  local stale_file

  # The report file names are relative, so resolve them from the workspace.
  cd "$WORKSPACE"
  for stale_file in "${AWS_RESOURCES_FILE}" "${AWS_REDSHIFT_RESOURCES_FILE}"; do
    rm -f "${stale_file}"
  done

  exit "$original_status"
}

# Remove the temporary report files whenever the script exits.
# NOTE: bash keeps a single handler per signal, so any later `trap ... EXIT`
# in this script takes over from the one registered here.
trap cleanup_cloud_stale EXIT

# Strict mode: abort on errors, on unset variables and on failures inside pipelines.
set -euo pipefail

# Report files: the first one receives the cloud-reaper output, the second one
# the JSON returned by `aws redshift describe-clusters`.
AWS_RESOURCES_FILE="aws.resources.txt"
AWS_REDSHIFT_RESOURCES_FILE="redshift_clusters.json"

# Resources older than "now minus the retention period" are candidates for
# deletion; the cut-off is exported as an ISO-8601 timestamp and handed to the
# cloud-reaper container as CREATION_DATE.
RESOURCE_RETENTION_PERIOD="${RESOURCE_RETENTION_PERIOD:-"24 hours"}"
DELETE_RESOURCES_BEFORE_DATE=$(date -Is -d "${RESOURCE_RETENTION_PERIOD} ago")
export DELETE_RESOURCES_BEFORE_DATE

# DOCKER_REGISTRY is not set in this script; it must come from the environment.
CLOUD_REAPER_IMAGE="${DOCKER_REGISTRY}/observability-ci/cloud-reaper:0.3.0"

# Buildkite meta-data wins over the DRY_RUN environment variable; when neither
# is set the script defaults to a dry run ("true").
DRY_RUN="$(buildkite-agent meta-data get DRY_RUN --default "${DRY_RUN:-"true"}")"

# Set to 1 as soon as something is left to delete; checked at the end of the
# script to exit with status 1.
resources_to_delete=0

# cloud-reaper sub-command for the scanning pass. Both branches currently pick
# "plan" (see the TODO), so the initial "validate" value is always overwritten
# and DRY_RUN has no effect on cloud-reaper yet.
COMMAND="validate"
if [[ "${DRY_RUN}" != "true" ]]; then
# TODO: to be changed to "destroy --confirm" once it can be tested
# that filters work as expected
COMMAND="plan"
else
COMMAND="plan"
fi

# Tells whether a captured cloud-reaper report lists anything.
# Arguments: $1 - path to the file holding the cloud-reaper output
# Returns:   0 when there is at least one line after the three-line preamble,
#            1 when there is none
any_resources_to_delete() {
  local report=$1
  local pending
  # The first three lines of the report look like:
  #   ⇒ Loading configuration...
  #   ✓ Succeeded to load configuration
  #   Scanning resources... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
  # so only what follows them is counted.
  pending=$(tail -n +4 "${report}" | wc -l)
  ! [ "${pending}" -eq 0 ]
}

# Workaround needed for as long as cloud reaper does not support OIDC
# authentication: create a throwaway IAM user whose static access keys can be
# handed to the cloud-reaper container.
# Globals written: EPHEMERAL_USER, AWS_ACCESS_KEY_ID_EPHEMERAL and
# AWS_SECRET_ACCESS_KEY_EPHEMERAL (all exported), plus creds_json, which is not
# declared local and is read again by cleanup_ephemeral_user.
# Requires the aws CLI and jq to be available.
create_aws_ephemeral_user() {
# Generate a unique name for the ephemeral IAM user (the suffix is the current
# time in seconds since the epoch).
EPHEMERAL_USER="ephemeral-admin-$(date +%s)"
echo "Creating IAM user: ${EPHEMERAL_USER}"
aws iam create-user --user-name "${EPHEMERAL_USER}" \
--tags Key=ephemeral,Value=true Key=division,Value=engineering Key=org,Value=obs Key=environment,Value=ci Key=repo,Value=elastic-package Key=created_at,Value="$(date -Is)"

# The user is granted the AWS-managed AdministratorAccess policy.
echo "Attaching AdministratorAccess policy to ${EPHEMERAL_USER}..."
aws iam attach-user-policy --user-name "${EPHEMERAL_USER}" --policy-arn arn:aws:iam::aws:policy/AdministratorAccess

# Keep the raw create-access-key response in creds_json and extract both
# halves of the key pair from it.
echo "Creating access keys for ${EPHEMERAL_USER}..."
creds_json=$(aws iam create-access-key --user-name "${EPHEMERAL_USER}")
AWS_ACCESS_KEY_ID_EPHEMERAL=$(echo "$creds_json" | jq -r '.AccessKey.AccessKeyId')
AWS_SECRET_ACCESS_KEY_EPHEMERAL=$(echo "$creds_json" | jq -r '.AccessKey.SecretAccessKey')
export EPHEMERAL_USER AWS_ACCESS_KEY_ID_EPHEMERAL AWS_SECRET_ACCESS_KEY_EPHEMERAL
Comment on lines +68 to +70
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should AWS_ACCESS_KEY_ID_EPHEMERAL and AWS_SECRET_ACCESS_KEY_EPHEMERAL be renamed to follow the patterns so their contents are redacted ?

https://buildkite.com/docs/pipelines/configure/managing-log-output#redacted-environment-variables

Maybe something like AWS_ACCESS_KEY_ID_EPHEMERAL_SECRET and AWS_SECRET_ACCESS_KEY_EPHEMERAL_SECRET ? Or is it not needed in this scenario ?

Copy link
Member Author

@v1v v1v May 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does not matter at all, those env variables are not masked, unless they are set in the pre-command.

When creating env variable on the fly, there is no way to redact values

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, got it!

}

# Deletes the ephemeral IAM user created by create_aws_ephemeral_user.
# Every step is best effort: a failure is recorded but the remaining steps
# still run, so a failed detach cannot leave the user and its access key behind.
# Globals: EPHEMERAL_USER, creds_json (read)
# Returns: 0 when there was nothing to clean up or every step succeeded,
#          1 when at least one step failed
cleanup_ephemeral_user() {
  # The EXIT handler can fire before the user was ever created (for instance
  # when installing the aws cli fails); nothing has to be deleted in that case
  # and the variables below would be unbound under `set -u`.
  if [[ -z "${EPHEMERAL_USER:-}" ]]; then
    return 0
  fi

  local status=0
  local key_id=""

  echo "Cleaning up ephemeral IAM user: ${EPHEMERAL_USER}"
  aws iam detach-user-policy --user-name "${EPHEMERAL_USER}" \
    --policy-arn arn:aws:iam::aws:policy/AdministratorAccess || status=1

  # creds_json only exists once create-access-key succeeded; `// empty` keeps
  # jq from printing the literal string "null" when the field is missing.
  if [[ -n "${creds_json:-}" ]]; then
    key_id=$(echo "${creds_json}" | jq -r '.AccessKey.AccessKeyId // empty') || status=1
  fi
  if [[ -n "${key_id}" ]]; then
    aws iam delete-access-key --user-name "${EPHEMERAL_USER}" \
      --access-key-id "${key_id}" || status=1
  fi

  if aws iam delete-user --user-name "${EPHEMERAL_USER}"; then
    echo "Ephemeral IAM user ${EPHEMERAL_USER} deleted."
  else
    status=1
  fi

  if [[ "${status}" -ne 0 ]]; then
    echo "WARNING: ephemeral IAM user ${EPHEMERAL_USER} could not be fully removed" >&2
  fi
  return "${status}"
}

# Single EXIT handler for the whole script. bash keeps only one handler per
# signal, so registering cleanup_ephemeral_user on its own would silently drop
# the cleanup_cloud_stale handler installed earlier; this wrapper runs both and
# keeps the original exit status.
cleanup_on_exit() {
  # Must be the first statement so $? is still the script's exit status.
  local exit_code=$?

  if ! cleanup_ephemeral_user; then
    # A leaked AdministratorAccess user must not go unnoticed on a green build.
    if [[ "${exit_code}" -eq 0 ]]; then
      exit_code=1
    fi
  fi

  # cleanup_cloud_stale ends with `exit`, so run it in a subshell to keep
  # control here and exit with the status captured above.
  if declare -F cleanup_cloud_stale > /dev/null; then
    ( cleanup_cloud_stale ) || true
  fi

  exit "${exit_code}"
}
trap cleanup_on_exit EXIT

# Runs cloud-reaper from ${CLOUD_REAPER_IMAGE} against AWS in two passes:
# first "validate" (with --debug) on the mounted configuration, then
# ${COMMAND}, whose output is also written to ${AWS_RESOURCES_FILE}.
# Both passes authenticate with the keys of the ephemeral IAM user that is
# created at the start of the function.
cloud_reaper_aws() {
echo "--- Configuring ephemeral user"
create_aws_ephemeral_user
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is what I added to help with the OIDC when running cloud-reaper


echo "Validating configuration"
# The repository's cleanup.aws.yml is mounted as the container's config file;
# the -e values are presumably what its ${...} placeholders resolve to — TODO confirm.
docker run --rm -v "$(pwd)/.buildkite/configs/cleanup.aws.yml":/etc/cloud-reaper/config.yml \
-e AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID_EPHEMERAL" \
-e AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY_EPHEMERAL" \
-e ACCOUNT_PROJECT="observability-ci" \
-e CREATION_DATE="${DELETE_RESOURCES_BEFORE_DATE}" \
"${CLOUD_REAPER_IMAGE}" \
cloud-reaper \
--debug \
--config /etc/cloud-reaper/config.yml \
validate

echo "Scanning resources"
# tee keeps the output in the build log and in the report file that
# any_resources_to_delete inspects afterwards.
# NOTE(review): ${COMMAND} is unquoted, presumably so that a multi-word value
# such as the planned "destroy --confirm" splits into separate arguments — confirm.
docker run --rm -v "$(pwd)/.buildkite/configs/cleanup.aws.yml":/etc/cloud-reaper/config.yml \
-e AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID_EPHEMERAL" \
-e AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY_EPHEMERAL" \
-e ACCOUNT_PROJECT="observability-ci" \
-e CREATION_DATE="${DELETE_RESOURCES_BEFORE_DATE}" \
"${CLOUD_REAPER_IMAGE}" \
cloud-reaper \
--config /etc/cloud-reaper/config.yml \
${COMMAND} | tee "${AWS_RESOURCES_FILE}"
}

echo "--- Installing awscli"
with_aws_cli

Comment on lines +113 to +115
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Run earlier, to help with using the aws cli to create the ephemeral service account

echo "--- Cleaning up AWS resources older than ${DELETE_RESOURCES_BEFORE_DATE}..."
cloud_reaper_aws

if any_resources_to_delete "${AWS_RESOURCES_FILE}" ; then
echo "Pending AWS resources"
resources_to_delete=1
fi

if [ "${resources_to_delete}" -eq 1 ]; then
message="There are resources to be deleted"
echo "${message}"
if running_on_buildkite ; then
buildkite-agent annotate \
"${message}" \
--context "ctx-cloud-reaper-error" \
--style "error"
fi
fi

echo "--- Cleaning up other AWS resources older than ${DELETE_RESOURCES_BEFORE_DATE}"

export AWS_DEFAULT_REGION=us-east-1
# Prevent the AWS CLI from sending its output to a pager
export AWS_PAGER=""

echo "--- Checking if any Redshift cluster still created"
aws redshift describe-clusters \
--tag-keys "environment" \
--tag-values "ci" > "${AWS_REDSHIFT_RESOURCES_FILE}"

clusters_num=$(jq -rc '.Clusters | length' "${AWS_REDSHIFT_RESOURCES_FILE}")

echo "Number of clusters found: ${clusters_num}"

# Walk the Redshift clusters stored in ${AWS_REDSHIFT_RESOURCES_FILE} and
# delete the ones created by the CI of the expected repositories that are
# older than the retention period. In dry-run mode (DRY_RUN != "false")
# nothing is deleted; stale clusters are only flagged.
# redshift_clusters_to_delete ends up as 1 when a stale cluster was left
# behind, either because of the dry run or because its deletion failed.
redshift_clusters_to_delete=0

# epoch in milliseconds minus retention period; it does not depend on the
# cluster, so it is computed once for the whole loop.
thresholdEpoch=$(date -d "${RESOURCE_RETENTION_PERIOD} ago" +"%s%3N")

# Process substitution (instead of a here-string) so that an empty cluster
# list yields zero iterations rather than one iteration over an empty line.
while IFS= read -r i ; do
  identifier=$(echo "$i" | jq -rc ".ClusterIdentifier")
  # tags
  repo=$(echo "$i" | jq -rc '.Tags[] | select(.Key == "repo").Value')
  environment=$(echo "$i" | jq -rc '.Tags[] | select(.Key == "environment").Value')
  # creation time tag in milliseconds
  createdAt=$(echo "$i" | jq -rc '.Tags[] | select(.Key == "created_date").Value')

  if [[ ! "${identifier}" =~ ^elastic-package-test- ]]; then
    echo "Skip cluster ${identifier}, do not match required identifiers."
    continue
  fi

  if [[ "${repo}" != "integrations" && "${repo}" != "elastic-package" ]]; then
    echo "Skip cluster ${identifier}, not from the expected repo: ${repo}."
    continue
  fi

  if [[ "${environment}" != "ci" ]]; then
    echo "Skip cluster ${identifier}, not from the expected environment: ${environment}."
    continue
  fi

  # NOTE(review): when the created_date tag is missing or not a number this
  # test errors out and evaluates as false, so the cluster is treated as
  # stale — confirm that this is the intended behaviour.
  if [ "${createdAt}" -gt "${thresholdEpoch}" ]; then
    echo "Skip cluster $identifier. It was created < ${RESOURCE_RETENTION_PERIOD} ago"
    continue
  fi

  echo "To be deleted cluster: $identifier. It was created > ${RESOURCE_RETENTION_PERIOD} ago"
  if [ "${DRY_RUN}" != "false" ]; then
    redshift_clusters_to_delete=1
    continue
  fi

  echo "Deleting: $identifier. It was created > ${RESOURCE_RETENTION_PERIOD} ago"
  if ! aws redshift delete-cluster \
    --cluster-identifier "${identifier}" \
    --skip-final-cluster-snapshot \
    --output json \
    --query "Cluster.{ClusterStatus:ClusterStatus,ClusterIdentifier:ClusterIdentifier}" ; then

    echo "Failed delete-cluster"
    # The error annotation must say that the deletion failed, not that the
    # cluster was deleted.
    buildkite-agent annotate \
      "Failed to delete redshift cluster: ${identifier}" \
      --context "ctx-aws-readshift-deleted-error-${identifier}" \
      --style "error"

    redshift_clusters_to_delete=1
  else
    echo "Done."
    # if deletion works, no need to mark this one as to be deleted
    buildkite-agent annotate \
      "Deleted redshift cluster: ${identifier}" \
      --context "ctx-aws-readshift-deleted-${identifier}" \
      --style "success"
  fi
done < <(jq -c '.Clusters[]' "${AWS_REDSHIFT_RESOURCES_FILE}")

if [ "${redshift_clusters_to_delete}" -eq 1 ]; then
resources_to_delete=1
message="There are redshift resources to be deleted"
echo "${message}"
if running_on_buildkite ; then
buildkite-agent annotate \
"${message}" \
--context "ctx-aws-readshift-error" \
--style "error"
fi
fi

# TODO: List and delete the required resources using aws cli or using cloud-reaper tool
echo "--- TODO: Cleaning up IAM roles"
echo "--- TODO: Cleaning up IAM policies"
echo "--- TODO: Cleaning up Schedulers"

if [ "${resources_to_delete}" -eq 1 ]; then
exit 1
fi
8 changes: 4 additions & 4 deletions .buildkite/scripts/cloud-cleanup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ any_resources_to_delete() {
cloud_reaper_aws() {
echo "Validating configuration"
docker run --rm -v "$(pwd)/.buildkite/configs/cleanup.aws.yml":/etc/cloud-reaper/config.yml \
-e ACCOUNT_SECRET="${ELASTIC_PACKAGE_AWS_SECRET_KEY}" \
-e ACCOUNT_KEY="${ELASTIC_PACKAGE_AWS_ACCESS_KEY}" \
-e AWS_SECRET_ACCESS_KEY="${ELASTIC_PACKAGE_AWS_SECRET_KEY}" \
-e AWS_ACCESS_KEY_ID="${ELASTIC_PACKAGE_AWS_ACCESS_KEY}" \
-e ACCOUNT_PROJECT="${ELASTIC_PACKAGE_AWS_USER_SECRET}" \
-e CREATION_DATE="${DELETE_RESOURCES_BEFORE_DATE}" \
"${CLOUD_REAPER_IMAGE}" \
Expand All @@ -66,8 +66,8 @@ cloud_reaper_aws() {

echo "Scanning resources"
docker run --rm -v "$(pwd)/.buildkite/configs/cleanup.aws.yml":/etc/cloud-reaper/config.yml \
-e ACCOUNT_SECRET="${ELASTIC_PACKAGE_AWS_SECRET_KEY}" \
-e ACCOUNT_KEY="${ELASTIC_PACKAGE_AWS_ACCESS_KEY}" \
-e AWS_SECRET_ACCESS_KEY="${ELASTIC_PACKAGE_AWS_SECRET_KEY}" \
-e AWS_ACCESS_KEY_ID="${ELASTIC_PACKAGE_AWS_ACCESS_KEY}" \
-e ACCOUNT_PROJECT="${ELASTIC_PACKAGE_AWS_USER_SECRET}" \
-e CREATION_DATE="${DELETE_RESOURCES_BEFORE_DATE}" \
"${CLOUD_REAPER_IMAGE}" \
Expand Down