-
Notifications
You must be signed in to change notification settings - Fork 129
bk: use OIDC to tear-down the cloud resources #2567
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
9c0faa9
e65d705
51771ba
8042e94
0fe3b95
b8999af
aaad895
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,7 +10,6 @@ export SERVERLESS=${SERVERLESS:-"false"} | |
| WORKSPACE=$(pwd) | ||
| export WORKSPACE | ||
|
|
||
| AWS_SERVICE_ACCOUNT_SECRET_PATH=kv/ci-shared/platform-ingest/aws_ingest_ci | ||
| PRIVATE_CI_GCS_CREDENTIALS_PATH=kv/ci-shared/platform-ingest/gcp-platform-ingest-ci-service-account | ||
|
|
||
| EC_TOKEN_PATH=kv/ci-shared/platform-ingest/platform-ingest-ec-qa | ||
|
|
@@ -91,7 +90,10 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-package-test-with-integrations" && | |
| export GITHUB_TOKEN=$VAULT_GITHUB_TOKEN | ||
| fi | ||
|
|
||
| # NOTE: this approach is deprecated and will be removed in the near future. | ||
| # see https://github.com/elastic/observability-robots/issues/2771 (only accessible by Elastic employees) | ||
| if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-package-cloud-cleanup" && "$BUILDKITE_STEP_KEY" == "cloud-cleanup" ]]; then | ||
| AWS_SERVICE_ACCOUNT_SECRET_PATH=kv/ci-shared/platform-ingest/aws_ingest_ci | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's define the variable where it's used. |
||
| ELASTIC_PACKAGE_AWS_SECRET_KEY=$(retry 5 vault kv get -field secret_key ${AWS_SERVICE_ACCOUNT_SECRET_PATH}) | ||
| export ELASTIC_PACKAGE_AWS_SECRET_KEY | ||
| ELASTIC_PACKAGE_AWS_ACCESS_KEY=$(retry 5 vault kv get -field access_key ${AWS_SERVICE_ACCOUNT_SECRET_PATH}) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,229 @@ | ||
| #!/usr/bin/env bash | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I copied it from . |
||
|
|
||
| source .buildkite/scripts/install_deps.sh | ||
|
|
||
| cleanup_cloud_stale() { | ||
| local exit_code=$? | ||
|
|
||
| cd "$WORKSPACE" | ||
| rm -f "${AWS_RESOURCES_FILE}" | ||
| rm -f "${AWS_REDSHIFT_RESOURCES_FILE}" | ||
|
|
||
| exit "$exit_code" | ||
| } | ||
|
|
||
| trap cleanup_cloud_stale EXIT | ||
|
|
||
| set -euo pipefail | ||
|
|
||
| AWS_RESOURCES_FILE="aws.resources.txt" | ||
| AWS_REDSHIFT_RESOURCES_FILE="redshift_clusters.json" | ||
|
|
||
| RESOURCE_RETENTION_PERIOD="${RESOURCE_RETENTION_PERIOD:-"24 hours"}" | ||
| DELETE_RESOURCES_BEFORE_DATE=$(date -Is -d "${RESOURCE_RETENTION_PERIOD} ago") | ||
| export DELETE_RESOURCES_BEFORE_DATE | ||
|
|
||
| CLOUD_REAPER_IMAGE="${DOCKER_REGISTRY}/observability-ci/cloud-reaper:0.3.0" | ||
|
|
||
| DRY_RUN="$(buildkite-agent meta-data get DRY_RUN --default "${DRY_RUN:-"true"}")" | ||
|
|
||
| resources_to_delete=0 | ||
|
|
||
| COMMAND="validate" | ||
| if [[ "${DRY_RUN}" != "true" ]]; then | ||
| # TODO: to be changed to "destroy --confirm" once it can be tested | ||
| # that filters work as expected | ||
| COMMAND="plan" | ||
| else | ||
| COMMAND="plan" | ||
| fi | ||
|
|
||
| any_resources_to_delete() { | ||
| local file=$1 | ||
| local number=0 | ||
| # First three lines are like: | ||
| # ⇒ Loading configuration... | ||
| # ✓ Succeeded to load configuration | ||
| # Scanning resources... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00 | ||
| number=$(tail -n +4 "${file}" | wc -l) | ||
| if [ "${number}" -eq 0 ]; then | ||
| return 1 | ||
| fi | ||
| return 0 | ||
| } | ||
|
|
||
| # As long as cloud reaper does not support OIDC authentication. | ||
| create_aws_ephemeral_user() { | ||
| # Generate a unique name for the ephemeral IAM user. | ||
| EPHEMERAL_USER="ephemeral-admin-$(date +%s)" | ||
| echo "Creating IAM user: ${EPHEMERAL_USER}" | ||
| aws iam create-user --user-name "${EPHEMERAL_USER}" \ | ||
| --tags Key=ephemeral,Value=true Key=division,Value=engineering Key=org,Value=obs Key=environment,Value=ci Key=repo,Value=elastic-package Key=created_at,Value="$(date -Is)" | ||
|
|
||
| echo "Attaching AdministratorAccess policy to ${EPHEMERAL_USER}..." | ||
| aws iam attach-user-policy --user-name "${EPHEMERAL_USER}" --policy-arn arn:aws:iam::aws:policy/AdministratorAccess | ||
|
|
||
| echo "Creating access keys for ${EPHEMERAL_USER}..." | ||
| creds_json=$(aws iam create-access-key --user-name "${EPHEMERAL_USER}") | ||
| AWS_ACCESS_KEY_ID_EPHEMERAL=$(echo "$creds_json" | jq -r '.AccessKey.AccessKeyId') | ||
| AWS_SECRET_ACCESS_KEY_EPHEMERAL=$(echo "$creds_json" | jq -r '.AccessKey.SecretAccessKey') | ||
| export EPHEMERAL_USER AWS_ACCESS_KEY_ID_EPHEMERAL AWS_SECRET_ACCESS_KEY_EPHEMERAL | ||
|
Comment on lines
+68
to
+70
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should https://buildkite.com/docs/pipelines/configure/managing-log-output#redacted-environment-variables Maybe something like
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It does not matter at all, those env variables are not masked, unless they are set in the When creating env variable on the fly, there is no way to redact values
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, got it! |
||
| } | ||
|
|
||
| # Delete the ephemeral IAM user created by create_aws_ephemeral_user, regardless of script outcome. | ||
| # Globals: EPHEMERAL_USER and creds_json (read; either may be unset when the script stops | ||
| # before or while the user is being created). | ||
| # Returns: the exit status the script was terminating with, so that the handler chained | ||
| # after this one (cleanup_cloud_stale) still finds it in $?. | ||
| cleanup_ephemeral_user() { | ||
| local exit_code=$? | ||
| local key_id="" | ||
| # Cleanup is best effort: a failing step must not prevent the following ones. | ||
| set +e | ||
| # Nothing to remove if the script stopped before the user was created. | ||
| if [[ -n "${EPHEMERAL_USER:-}" ]]; then | ||
| echo "Cleaning up ephemeral IAM user: ${EPHEMERAL_USER}" | ||
| aws iam detach-user-policy --user-name "${EPHEMERAL_USER}" --policy-arn arn:aws:iam::aws:policy/AdministratorAccess | ||
| # "// empty" yields an empty string instead of "null" when no access key was created. | ||
| key_id=$(echo "${creds_json:-}" | jq -r '.AccessKey.AccessKeyId // empty') | ||
| if [[ -n "${key_id}" ]]; then | ||
| aws iam delete-access-key --user-name "${EPHEMERAL_USER}" --access-key-id "${key_id}" | ||
| fi | ||
| if aws iam delete-user --user-name "${EPHEMERAL_USER}"; then | ||
| echo "Ephemeral IAM user ${EPHEMERAL_USER} deleted." | ||
| else | ||
| echo "Failed to delete ephemeral IAM user ${EPHEMERAL_USER}" >&2 | ||
| fi | ||
| fi | ||
| return "${exit_code}" | ||
v1v marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
| # A second "trap ... EXIT" replaces the first one, so chain both handlers here: | ||
| # cleanup_cloud_stale removes the temporary files and exits with the preserved status. | ||
| trap 'cleanup_ephemeral_user; cleanup_cloud_stale' EXIT | ||
|
|
||
| cloud_reaper_aws() { | ||
| echo "--- Configuring ephemeral user" | ||
| create_aws_ephemeral_user | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is what I added to help with the OIDC when running |
||
|
|
||
| echo "Validating configuration" | ||
| docker run --rm -v "$(pwd)/.buildkite/configs/cleanup.aws.yml":/etc/cloud-reaper/config.yml \ | ||
| -e AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID_EPHEMERAL" \ | ||
| -e AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY_EPHEMERAL" \ | ||
| -e ACCOUNT_PROJECT="observability-ci" \ | ||
| -e CREATION_DATE="${DELETE_RESOURCES_BEFORE_DATE}" \ | ||
| "${CLOUD_REAPER_IMAGE}" \ | ||
| cloud-reaper \ | ||
| --debug \ | ||
| --config /etc/cloud-reaper/config.yml \ | ||
| validate | ||
|
|
||
| echo "Scanning resources" | ||
| docker run --rm -v "$(pwd)/.buildkite/configs/cleanup.aws.yml":/etc/cloud-reaper/config.yml \ | ||
| -e AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID_EPHEMERAL" \ | ||
| -e AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY_EPHEMERAL" \ | ||
| -e ACCOUNT_PROJECT="observability-ci" \ | ||
| -e CREATION_DATE="${DELETE_RESOURCES_BEFORE_DATE}" \ | ||
| "${CLOUD_REAPER_IMAGE}" \ | ||
| cloud-reaper \ | ||
| --config /etc/cloud-reaper/config.yml \ | ||
| ${COMMAND} | tee "${AWS_RESOURCES_FILE}" | ||
| } | ||
|
|
||
| echo "--- Installing awscli" | ||
| with_aws_cli | ||
|
|
||
|
Comment on lines
+113
to
+115
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Run earlier, so that the AWS CLI is available to create the ephemeral service account. |
||
| echo "--- Cleaning up AWS resources older than ${DELETE_RESOURCES_BEFORE_DATE}..." | ||
| cloud_reaper_aws | ||
|
|
||
| if any_resources_to_delete "${AWS_RESOURCES_FILE}" ; then | ||
| echo "Pending AWS resources" | ||
| resources_to_delete=1 | ||
| fi | ||
|
|
||
| if [ "${resources_to_delete}" -eq 1 ]; then | ||
| message="There are resources to be deleted" | ||
| echo "${message}" | ||
| if running_on_buildkite ; then | ||
| buildkite-agent annotate \ | ||
| "${message}" \ | ||
| --context "ctx-cloud-reaper-error" \ | ||
| --style "error" | ||
| fi | ||
| fi | ||
|
|
||
| echo "--- Cleaning up other AWS resources older than ${DELETE_RESOURCES_BEFORE_DATE}" | ||
|
|
||
| export AWS_DEFAULT_REGION=us-east-1 | ||
| # Avoid sending the output of the CLI to a pager | ||
| export AWS_PAGER="" | ||
|
|
||
| echo "--- Checking if any Redshift cluster still created" | ||
| aws redshift describe-clusters \ | ||
| --tag-keys "environment" \ | ||
| --tag-values "ci" > "${AWS_REDSHIFT_RESOURCES_FILE}" | ||
|
|
||
| clusters_num=$(jq -rc '.Clusters | length' "${AWS_REDSHIFT_RESOURCES_FILE}") | ||
|
|
||
| echo "Number of clusters found: ${clusters_num}" | ||
|
|
||
| redshift_clusters_to_delete=0 | ||
| # Epoch in milliseconds minus retention period. It does not depend on the cluster, | ||
| # so it is computed once before the loop. | ||
| thresholdEpoch=$(date -d "${RESOURCE_RETENTION_PERIOD} ago" +"%s%3N") | ||
|
|
||
| # Go through every cluster listed in ${AWS_REDSHIFT_RESOURCES_FILE} (one compact JSON object | ||
| # per line). Only clusters named elastic-package-test-*, tagged with the expected repo and | ||
| # environment, and older than the retention period are candidates. They are deleted when | ||
| # DRY_RUN is "false"; otherwise (or if the deletion fails) redshift_clusters_to_delete is set to 1. | ||
| while read -r i ; do | ||
| # The here-string yields one empty line when no cluster was found. | ||
| if [[ -z "${i}" ]]; then | ||
| continue | ||
| fi | ||
|
|
||
| identifier=$(echo "$i" | jq -rc ".ClusterIdentifier") | ||
| # tags | ||
| repo=$(echo "$i" | jq -rc '.Tags[] | select(.Key == "repo").Value') | ||
| environment=$(echo "$i" | jq -rc '.Tags[] | select(.Key == "environment").Value') | ||
| # creation time tag in milliseconds | ||
| createdAt=$(echo "$i" | jq -rc '.Tags[] | select(.Key == "created_date").Value') | ||
|
|
||
| if [[ ! "${identifier}" =~ ^elastic-package-test- ]]; then | ||
| echo "Skip cluster ${identifier}, do not match required identifiers." | ||
| continue | ||
| fi | ||
|
|
||
| if [[ "${repo}" != "integrations" && "${repo}" != "elastic-package" ]]; then | ||
| echo "Skip cluster ${identifier}, not from the expected repo: ${repo}." | ||
| continue | ||
| fi | ||
|
|
||
| if [[ "${environment}" != "ci" ]]; then | ||
| echo "Skip cluster ${identifier}, not from the expected environment: ${environment}." | ||
| continue | ||
| fi | ||
|
|
||
| # Without a numeric creation time the age cannot be established: the numeric test below | ||
| # would error out and the cluster would be treated as old enough to delete. | ||
| if [[ ! "${createdAt}" =~ ^[0-9]+$ ]]; then | ||
| echo "Skip cluster ${identifier}, missing or invalid created_date tag: ${createdAt}." | ||
| continue | ||
| fi | ||
|
|
||
| if [ "${createdAt}" -gt "${thresholdEpoch}" ]; then | ||
| echo "Skip cluster $identifier. It was created < ${RESOURCE_RETENTION_PERIOD} ago" | ||
| continue | ||
| fi | ||
|
|
||
| echo "To be deleted cluster: $identifier. It was created > ${RESOURCE_RETENTION_PERIOD} ago" | ||
| if [ "${DRY_RUN}" != "false" ]; then | ||
| redshift_clusters_to_delete=1 | ||
| continue | ||
| fi | ||
|
|
||
| echo "Deleting: $identifier. It was created > ${RESOURCE_RETENTION_PERIOD} ago" | ||
| if ! aws redshift delete-cluster \ | ||
| --cluster-identifier "${identifier}" \ | ||
| --skip-final-cluster-snapshot \ | ||
| --output json \ | ||
| --query "Cluster.{ClusterStatus:ClusterStatus,ClusterIdentifier:ClusterIdentifier}" ; then | ||
|
|
||
| echo "Failed delete-cluster" | ||
| if running_on_buildkite ; then | ||
| buildkite-agent annotate \ | ||
| "Failed to delete redshift cluster: ${identifier}" \ | ||
| --context "ctx-aws-readshift-deleted-error-${identifier}" \ | ||
| --style "error" | ||
| fi | ||
|
|
||
| # the cluster is still there, report it as pending | ||
| redshift_clusters_to_delete=1 | ||
| else | ||
| echo "Done." | ||
| # if deletion works, no need to mark this one as to be deleted | ||
| if running_on_buildkite ; then | ||
| buildkite-agent annotate \ | ||
| "Deleted redshift cluster: ${identifier}" \ | ||
| --context "ctx-aws-readshift-deleted-${identifier}" \ | ||
| --style "success" | ||
| fi | ||
| fi | ||
| done <<< "$(jq -c '.Clusters[]' "${AWS_REDSHIFT_RESOURCES_FILE}")" | ||
|
|
||
| if [ "${redshift_clusters_to_delete}" -eq 1 ]; then | ||
| resources_to_delete=1 | ||
| message="There are redshift resources to be deleted" | ||
| echo "${message}" | ||
| if running_on_buildkite ; then | ||
| buildkite-agent annotate \ | ||
| "${message}" \ | ||
| --context "ctx-aws-readshift-error" \ | ||
| --style "error" | ||
| fi | ||
| fi | ||
|
|
||
| # TODO: List and delete the required resources using aws cli or using cloud-reaper tool | ||
| echo "--- TODO: Cleaning up IAM roles" | ||
| echo "--- TODO: Cleaning up IAM policies" | ||
| echo "--- TODO: Cleaning up Schedulers" | ||
|
|
||
| if [ "${resources_to_delete}" -eq 1 ]; then | ||
| exit 1 | ||
| fi | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To help with using the existing env variables created by https://github.com/elastic/oblt-aws-auth-buildkite-plugin