From d04fad6c5d8d575521d25ad6b4ece9892053359d Mon Sep 17 00:00:00 2001 From: Aidan Hilt <11202897+AidanHilt@users.noreply.github.com> Date: Thu, 11 Jul 2024 12:01:12 -0400 Subject: [PATCH 1/8] Raising workflow provisioner limits (#2590) --- kube/services/argo-events/workflows/configmap.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kube/services/argo-events/workflows/configmap.yaml b/kube/services/argo-events/workflows/configmap.yaml index c754c3694..4ebb90f19 100644 --- a/kube/services/argo-events/workflows/configmap.yaml +++ b/kube/services/argo-events/workflows/configmap.yaml @@ -84,7 +84,7 @@ data: purpose: workflow limits: resources: - cpu: 2000 + cpu: 4000 providerRef: name: workflow-WORKFLOW_NAME # Kill nodes after 30 days to ensure they stay up to date From 350c83d28cea676baf428219aee1439bdb1116f1 Mon Sep 17 00:00:00 2001 From: Aidan Hilt <11202897+AidanHilt@users.noreply.github.com> Date: Mon, 15 Jul 2024 11:00:26 -0400 Subject: [PATCH 2/8] Chore/fixing argo workflow age monitor (#2591) * Updating the argo workflow monitor to only alert on workflows that have a started time * Fixing syntax * Quotations * I'm confused * Fixing some issues * Using the alarm webhook, instead of the regular one --- .../argo-pod-pending.yaml | 2 +- .../argo-monitors/argo-node-age.yaml | 4 +-- .../node-monitors/node-not-ready.yaml | 2 +- .../argo-workflow-age.yaml | 34 +++++++++++-------- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/kube/services/argo-pod-pending-monitor/argo-pod-pending.yaml b/kube/services/argo-pod-pending-monitor/argo-pod-pending.yaml index 9486d06c2..d3d75a84e 100644 --- a/kube/services/argo-pod-pending-monitor/argo-pod-pending.yaml +++ b/kube/services/argo-pod-pending-monitor/argo-pod-pending.yaml @@ -25,7 +25,7 @@ spec: valueFrom: configMapKeyRef: name: global - key: slack_webhook + key: slack_alarm_webhook command: ["/bin/bash"] args: diff --git a/kube/services/node-monitors/argo-monitors/argo-node-age.yaml 
b/kube/services/node-monitors/argo-monitors/argo-node-age.yaml index 890495ee0..b389c072c 100644 --- a/kube/services/node-monitors/argo-monitors/argo-node-age.yaml +++ b/kube/services/node-monitors/argo-monitors/argo-node-age.yaml @@ -27,7 +27,7 @@ spec: valueFrom: configMapKeyRef: name: global - key: slack_webhook + key: slack_alarm_webhook command: ["/bin/bash"] args: @@ -55,4 +55,4 @@ spec: curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"WARNING: Node \`${NODE_NAME}\` is older than 3 hours!\"}" $SLACK_WEBHOOK_URL fi done - restartPolicy: OnFailure \ No newline at end of file + restartPolicy: OnFailure diff --git a/kube/services/node-monitors/node-not-ready.yaml b/kube/services/node-monitors/node-not-ready.yaml index 500832fc3..15ed616e6 100644 --- a/kube/services/node-monitors/node-not-ready.yaml +++ b/kube/services/node-monitors/node-not-ready.yaml @@ -21,7 +21,7 @@ spec: valueFrom: configMapKeyRef: name: global - key: slack_webhook + key: slack_alarm_webhook - name: ENVIRONMENT valueFrom: configMapKeyRef: diff --git a/kube/services/workflow-age-monitor/argo-workflow-age.yaml b/kube/services/workflow-age-monitor/argo-workflow-age.yaml index 0d0c29115..52910ad4a 100644 --- a/kube/services/workflow-age-monitor/argo-workflow-age.yaml +++ b/kube/services/workflow-age-monitor/argo-workflow-age.yaml @@ -24,7 +24,7 @@ spec: valueFrom: configMapKeyRef: name: global - key: slack_webhook + key: slack_alarm_webhook command: ["/bin/bash"] args: @@ -32,24 +32,30 @@ spec: - | #!/bin/bash # Get all workflows with specific label and check their age - kubectl get workflows --all-namespaces -o json | jq -c '.items[] | {name: .metadata.name, creationTimestamp: .metadata.creationTimestamp}' | while read workflow_info; do + kubectl get workflows --all-namespaces -o json | jq -c '.items[] | {name: .metadata.name, startedTimestamp: .status.startedAt}' | while read workflow_info; do WORKFLOW_NAME=$(echo $workflow_info | jq -r '.name') - CREATION_TIMESTAMP=$(echo 
$workflow_info | jq -r '.creationTimestamp') + STARTED_TIMESTAMP=$(echo $workflow_info | jq -r '.startedTimestamp') - # Convert creation timestamp to Unix Epoch time - CREATION_EPOCH=$(date -d "$CREATION_TIMESTAMP" +%s) + echo "Checking workflow $WORKFLOW_NAME" + echo "$STARTED_TIMESTAMP" - # Get current Unix Epoch time - CURRENT_EPOCH=$(date +%s) + if [ "$STARTED_TIMESTAMP" != "null" ]; then + echo "Workflow $WORKFLOW_NAME started at $STARTED_TIMESTAMP" + # Convert creation timestamp to Unix Epoch time + CREATION_EPOCH=$(date -d "$STARTED_TIMESTAMP" +%s) - # Calculate workflow age in seconds - WORKFLOW_AGE=$(($CURRENT_EPOCH - $CREATION_EPOCH)) + # Get current Unix Epoch time + CURRENT_EPOCH=$(date +%s) - # Check if workflow age is greater than threshold - if [ "$WORKFLOW_AGE" -gt "$THRESHOLD_TIME" ]; then - echo "Workflow $WORKFLOW_NAME has been running for over $THRESHOLD_TIME seconds, sending an alert" - # Send alert to Slack - curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"WARNING: Workflow \`${WORKFLOW_NAME}\` has been running longer than $THRESHOLD_TIME seconds\"}" $SLACK_WEBHOOK_URL + # Calculate workflow age in seconds + WORKFLOW_AGE=$(($CURRENT_EPOCH - $CREATION_EPOCH)) + + # Check if workflow age is greater than threshold + if [ "$WORKFLOW_AGE" -gt "$THRESHOLD_TIME" ]; then + echo "Workflow $WORKFLOW_NAME has been running for over $THRESHOLD_TIME seconds, sending an alert" + # Send alert to Slack + curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"WARNING: Workflow \`${WORKFLOW_NAME}\` has been running longer than $THRESHOLD_TIME seconds\"}" $SLACK_WEBHOOK_URL + fi fi done restartPolicy: OnFailure From 2f7e8aab2b7d92ffea0f8d82b4be10959cf15ae1 Mon Sep 17 00:00:00 2001 From: Pauline Ribeyre <4224001+paulineribeyre@users.noreply.github.com> Date: Tue, 16 Jul 2024 11:33:23 -0500 Subject: [PATCH 3/8] Add hatchery access (#2592) --- files/scripts/ecr-access-job.md | 2 +- gen3/bin/kube-setup-hatchery.sh | 22 
+++++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/files/scripts/ecr-access-job.md b/files/scripts/ecr-access-job.md index 9659b186b..5f8dff767 100644 --- a/files/scripts/ecr-access-job.md +++ b/files/scripts/ecr-access-job.md @@ -59,7 +59,7 @@ Trust policy (allows Acct2): } ``` -- Policy in the account (Acct2) that contains the DynamoDB table (created automatically by `kube-setup-ecr-access-job.sh`): +- Policy in the account (Acct2) that contains the DynamoDB table (created automatically by `kube-setup-ecr-access-cronjob.sh`): ``` { "Version": "2012-10-17", diff --git a/gen3/bin/kube-setup-hatchery.sh b/gen3/bin/kube-setup-hatchery.sh index dadbbd930..97365677d 100644 --- a/gen3/bin/kube-setup-hatchery.sh +++ b/gen3/bin/kube-setup-hatchery.sh @@ -175,6 +175,8 @@ $assumeImageBuilderRolePolicyBlock "Action": [ "batch:DescribeComputeEnvironments", "batch:CreateComputeEnvironment", + "batch:UpdateComputeEnvironment", + "batch:ListJobs", "batch:CreateJobQueue", "batch:TagResource", "iam:ListPolicies", @@ -197,10 +199,28 @@ $assumeImageBuilderRolePolicyBlock "iam:CreateInstanceProfile", "iam:AddRoleToInstanceProfile", "iam:PassRole", - "s3:CreateBucket" + "kms:CreateKey", + "kms:CreateAlias", + "kms:DescribeKey", + "kms:TagResource", + "s3:CreateBucket", + "s3:PutEncryptionConfiguration", + "s3:PutBucketPolicy", + "s3:PutLifecycleConfiguration" ], "Resource": "*" }, + { + "Sid": "CreateSlrForNextflowBatchWorkspaces", + "Effect": "Allow", + "Action": "iam:CreateServiceLinkedRole", + "Resource": "arn:aws:iam::*:role/aws-service-role/batch.amazonaws.com/*", + "Condition": { + "StringLike": { + "iam:AWSServiceName": "batch.amazonaws.com" + } + } + }, { "Sid": "PassRoleForNextflowBatchWorkspaces", "Effect": "Allow", From e7fb972628f3b3a977ec6b3576743a6de4fe8976 Mon Sep 17 00:00:00 2001 From: Ajo Augustine Date: Wed, 17 Jul 2024 08:54:52 -0500 Subject: [PATCH 4/8] copy gen3 dataabses in Aurora (#2356) * copy gen3 dataabses in Aurora * Update 
psql-db-copy-aurora-job.yaml * Update psql-db-copy-aurora-job.yaml * add service account * Add documentation for dbbackup.sh * Add psql-db-aurora-migration-job, updating dbbackup.sh * Add psql-db-aurora-migration-job, updating dbbackup.sh * Update dbbackup.sh * Update psql-db-copy-aurora-job.yaml * Update dbbackup.md --- doc/dbbackup.md | 52 +++++ gen3/bin/dbbackup.sh | 102 ++++++-- .../jobs/psql-db-aurora-migration-job.yaml | 219 ++++++++++++++++++ .../jobs/psql-db-copy-aurora-job.yaml | 193 +++++++++++++++ .../services/jobs/psql-db-copy-aurora-sa.yaml | 30 +++ 5 files changed, 575 insertions(+), 21 deletions(-) create mode 100644 doc/dbbackup.md create mode 100644 kube/services/jobs/psql-db-aurora-migration-job.yaml create mode 100644 kube/services/jobs/psql-db-copy-aurora-job.yaml create mode 100644 kube/services/jobs/psql-db-copy-aurora-sa.yaml diff --git a/doc/dbbackup.md b/doc/dbbackup.md new file mode 100644 index 000000000..9e21f2bde --- /dev/null +++ b/doc/dbbackup.md @@ -0,0 +1,52 @@ +# TL;DR + +This script facilitates the management of database backup and restore within the Gen3 environment. It can establish policies, service accounts, roles, and S3 buckets. Depending on the command provided, it can initiate a database dump, perform a restore, migrate databases to a new RDS instance on Aurora, or clone databases to an RDS Aurora instance. + +## Usage + +```sh +gen3 dbbackup [dump|restore|va-dump|create-sa|migrate-to-aurora|copy-to-aurora] +``` + +### Commands + +#### dump + +Initiates a database dump and pushes it to an S3 bucket, creating the essential AWS resources if they are absent. The dump operation is intended to be executed from the namespace/commons that requires the backup. + +```sh +gen3 dbbackup dump +``` + +#### restore + +Initiates a database restore from an S3 bucket, creating the essential AWS resources if they are absent. The restore operation is meant to be executed in the target namespace where the backup needs to be restored. 
+ +```sh +gen3 dbbackup restore +``` + +#### create-sa + +Creates the necessary service account and roles for DB copy. + +```sh +gen3 dbbackup create-sa +``` + +#### migrate-to-aurora + +Triggers a service account creation and a job to migrate a Gen3 commons to an AWS RDS Aurora instance. + +```sh +gen3 dbbackup migrate-to-aurora +``` + +#### copy-to-aurora + +Triggers a service account creation and a job to copy the databases Indexd, Sheepdog & Metadata to new databases within an RDS Aurora cluster from another namespace in same RDS cluster. + +```sh +gen3 dbbackup copy-to-aurora +``` + diff --git a/gen3/bin/dbbackup.sh b/gen3/bin/dbbackup.sh index eb9611a90..eeb569519 100644 --- a/gen3/bin/dbbackup.sh +++ b/gen3/bin/dbbackup.sh @@ -1,26 +1,28 @@ #!/bin/bash #################################################################################################### -# Script: dbdump.sh +# Script: dbbackup.sh # # Description: # This script facilitates the management of database backups within the gen3 environment. It is -# equipped to establish policies, service accounts, roles, and S3 buckets. Depending on the -# command provided, it will either initiate a database dump or perform a restore. +# equipped to establish policies, service accounts, roles, and S3 buckets. Depending on the +# command provided, it will either initiate a database dump, perform a restore, migrate to Aurora, +# or copy to Aurora. # # Usage: -# gen3 dbbackup [dump|restore] +# gen3 dbbackup [dump|restore|va-dump|create-sa|migrate-to-aurora|copy-to-aurora ] # -# dump - Initiates a database dump, creating the essential AWS resources if they are absent. -# The dump operation is intended to be executed from the namespace/commons that requires -# the backup. -# restore - Initiates a database restore, creating the essential AWS resources if they are absent. -# The restore operation is meant to be executed in the target namespace, where the backup -# needs to be restored. 
+# dump - Initiates a database dump, creating the essential AWS resources if they are absent. +# The dump operation is intended to be executed from the namespace/commons that requires +# the backup. +# restore - Initiates a database restore, creating the essential AWS resources if they are absent. +# The restore operation is meant to be executed in the target namespace, where the backup +# needs to be restored. +# va-dump - Runs a va-testing DB dump. +# create-sa - Creates the necessary service account and roles for DB copy. +# migrate-to-aurora - Triggers a service account creation and a job to migrate a Gen3 commons to an AWS RDS Aurora instance. +# copy-to-aurora - Triggers a service account creation and a job to copy the databases Indexd, Sheepdog & Metadata to new databases within an RDS Aurora cluster. # -# Notes: -# This script extensively utilizes the AWS CLI and the gen3 CLI. Proper functioning demands a -# configured gen3 environment and the availability of the necessary CLI tools. # #################################################################################################### @@ -49,7 +51,6 @@ gen3_log_info "namespace: $namespace" gen3_log_info "sa_name: $sa_name" gen3_log_info "bucket_name: $bucket_name" - # Create an S3 access policy if it doesn't exist create_policy() { # Check if policy exists @@ -87,7 +88,6 @@ EOM fi } - # Create or update the Service Account and its corresponding IAM Role create_service_account_and_role() { cluster_arn=$(kubectl config current-context) @@ -101,7 +101,6 @@ create_service_account_and_role() { gen3_log_info "oidc_url: $oidc_url" gen3_log_info "role_name: $role_name" - cat > ${trust_policy} <" + exit 1 + fi + gen3_log_info "Copying databases within Aurora..." + copy_to_aurora "$2" + ;; *) - echo "Invalid command. Usage: gen3 dbbackup [dump|restore|va-dump]" + echo "Invalid command. 
Usage: gen3 dbbackup [dump|restore|va-dump|create-sa|migrate-to-aurora|copy-to-aurora ]" return 1 ;; esac } -main "$1" +main "$@" diff --git a/kube/services/jobs/psql-db-aurora-migration-job.yaml b/kube/services/jobs/psql-db-aurora-migration-job.yaml new file mode 100644 index 000000000..dc6f40c11 --- /dev/null +++ b/kube/services/jobs/psql-db-aurora-migration-job.yaml @@ -0,0 +1,219 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: psql-db-aurora-migration +spec: + template: + metadata: + labels: + app: gen3job + spec: + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: karpenter.sh/capacity-type + operator: In + values: + - on-demand + - weight: 99 + preference: + matchExpressions: + - key: eks.amazonaws.com/capacityType + operator: In + values: + - ONDEMAND + serviceAccountName: psql-db-copy-sa + containers: + - name: pgdump + image: quay.io/cdis/awshelper:master + imagePullPolicy: Always + env: + - name: gen3Env + valueFrom: + configMapKeyRef: + name: global + key: environment + - name: JENKINS_HOME + value: "devterm" + - name: GEN3_HOME + value: /home/ubuntu/cloud-automation + command: [ "/bin/bash" ] + args: + - "-c" + - | + # This job migrates (takes backup and restores) the databases in a Gen3 instance to an Aurora RDS cluster. + # Requirements: + # 1. Aurora server credentials should be present in the Gen3Secrets/creds.json with name 'aurora'. + # 2. Ensure that `gen3 psql aurora` and `gen3 secrets decode aurora-creds` work as expected. + # 3. The job needs the "psql-db-copy-sa" service account with the necessary permissions to read secrets from all relevant namespaces. 
+ + source "${GEN3_HOME}/gen3/lib/utils.sh" + gen3_load "gen3/gen3setup" + namespace=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace) + default_databases=($(echo -e "$(gen3 db services)" | sort -r)) + date_str=$(date -u +%y%m%d_%H%M%S) + databases=("${default_databases[@]}") + gen3_log_info "databases: ${databases[@]}" + + # Initialize sheepdog_db_name and failed_migrations variables + sheepdog_db_name="" + failed_migrations="" + + # find Aurora Server credentials + aurora_host_name=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_host') + aurora_master_username=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_username') + aurora_master_password=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_password') + aurora_master_database=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_database') + + gen3_log_info "Aurora Creds: \n aurora_host_name: $aurora_host_name \n aurora_master_username: $aurora_master_username \n aurora_master_database: $aurora_master_database" + + # Verify important variables are present + if [ -z "$aurora_host_name" ] || [ -z "$aurora_master_username" ] || [ -z "$aurora_master_password" ] || [ -z "$aurora_master_database" ]; then + gen3_log_err "Aurora credentials are missing. Exiting." + exit 1 + fi + + new_resources="" + + # Function to truncate to 63 characters + function truncate_identifier() { + local identifier=$1 + if [ ${#identifier} -gt 63 ]; then + echo "${identifier:0:63}" + else + echo "$identifier" + fi + } + + # Function to create a database with retry logic + function create_database_with_retry() { + local db_name=$1 + local retries=5 + local wait_time=10 + for i in $(seq 1 $retries); do + PGPASSWORD=${db_password} psql -h $aurora_host_name -U "$db_user" -d postgres -c "CREATE DATABASE $db_name" + if [ $? -eq 0 ]; then + return 0 + fi + gen3_log_err "Failed to create database $db_name. Retrying in $wait_time seconds..." 
+ sleep $wait_time + done + return 1 + } + + # Looping through each service to: + # - Extract the database credentials. + # - Check if the user already exists, if not, create the user. + # - Grant required privileges. + # - Create the database (except for peregrine). + # - Backup and restore the database on the Aurora Cluster. + for database in "${databases[@]}"; do + for secret_name in "${database}-creds creds.json" "$database-g3auto dbcreds.json"; do + creds=$(gen3 secrets decode $secret_name 2>/dev/null) + if [ $? -eq 0 ] && [ ! -z "$creds" ]; then + db_hostname=$(echo $creds | jq -r .db_host) + db_username=$(echo $creds | jq -r .db_username) + db_password=$(echo $creds | jq -r .db_password) + db_database=$(echo $creds | jq -r .db_database) + gen3_log_info "Extracting service credentials for $database from $secret_name: \n db_hostname: $db_hostname \n db_username: $db_username \n db_database: $db_database \n" + break + fi + done + + if [ -z "$db_hostname" ] || [ -z "$db_username" ] || [ -z "$db_password" ] || [ -z "$db_database" ]; then + gen3_log_err "Failed to extract database credentials for $database" + failed_migrations="${failed_migrations}\nDatabase: $database, Error: Failed to extract credentials" + continue + fi + + # Check source database accessibility + PGPASSWORD=${db_password} pg_isready -h $db_hostname -U "$db_username" -d "$db_database" + if [ $? -ne 0 ]; then + gen3_log_err "Cannot connect to source database $db_database at $db_hostname. Skipping database $database." 
+ failed_migrations="${failed_migrations}\nDatabase: $database, Error: Cannot connect to source database at $db_hostname" + continue + fi + + # Define db_user and db_name variables with replaced hyphens + db_user="$(echo $database | tr '-' '_')_user_$(echo $namespace | tr '-' '_')" + db_name="$(echo $database | tr '-' '_')_$(echo $namespace | tr '-' '_')_${date_str}" + + # Truncate identifiers if necessary + db_user=$(truncate_identifier $db_user) + db_name=$(truncate_identifier $db_name) + + # Try to connect to the Aurora database with the extracted credentials. + # If the connection is successful, it means the user already exists. + # If not, create the user. + + PGPASSWORD=${db_password} psql -h $aurora_host_name -U "$db_user" -d postgres -c "\q" + if [ $? -eq 0 ]; then + gen3_log_info "User $db_user, password already exists" + else + gen3 psql aurora -c "CREATE USER \"$db_user\" WITH PASSWORD '$db_password' CREATEDB" + if [ $? -ne 0 ]; then + gen3_log_err "Failed to create user for $database" + failed_migrations="${failed_migrations}\nDatabase: $database, Error: Failed to create user" + continue + else + gen3_log_info "Database user $db_user created successfully" + fi + fi + + if [ "$database" != "peregrine" ]; then + # Create the database with a unique name by appending namespace and date. + create_database_with_retry $db_name + if [ $? -ne 0 ]; then + gen3_log_err "Failed to create database for $database" + failed_migrations="${failed_migrations}\nDatabase: $database, Error: Failed to create database" + continue + else + gen3_log_info "Database $db_name created successfully" + if [ "$database" == "sheepdog" ]; then + sheepdog_db_name=$db_name + fi + fi + + # Backup the current database and restore it to the newly created database. 
+ if gen3 db backup $database | PGPASSWORD=${db_password} psql -h $aurora_host_name -U "$db_user" -d "$db_name"; then + gen3_log_info "Database $database restored successfully to $db_name" + new_resources="${new_resources}\nSource_Database: $db_database Source_Host: $db_hostname Source_User: $db_username Restored_Database: $db_name User: $db_user" + else + gen3_log_err "Failed to backup and restore database for $database" + failed_migrations="${failed_migrations}\nDatabase: $database, Error: Failed to backup and restore database" + fi + fi + + if [ "$database" == "peregrine" ]; then + if [ -n "$sheepdog_db_name" ]; then + gen3 psql aurora -d "$sheepdog_db_name" -c "GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO \"$db_user\"" + if [ $? -ne 0 ]; then + gen3_log_err "Failed to grant access to sheepdog tables for peregrine user" + failed_migrations="${failed_migrations}\nDatabase: $database, Error: Failed to grant access to sheepdog tables for peregrine user" + continue + else + gen3_log_info "Access to sheepdog tables granted successfully for peregrine user" + new_resources="${new_resources}\nUser: $db_user with access to sheepdog database $sheepdog_db_name" + fi + else + gen3_log_err "Sheepdog database not found for granting permissions to peregrine user" + failed_migrations="${failed_migrations}\nDatabase: $database, Error: Sheepdog database not found for granting permissions" + fi + fi + done + + # Logging the newly created resources + gen3_log_info "New resources created on $aurora_host_name\n$new_resources" + + # Logging the failed migrations + if [ -n "$failed_migrations" ]; then + gen3_log_info "Failed migrations:\n$failed_migrations" + fi + + # Sleep for 600 seconds to allow the user to check the logs + sleep 600 + restartPolicy: Never diff --git a/kube/services/jobs/psql-db-copy-aurora-job.yaml b/kube/services/jobs/psql-db-copy-aurora-job.yaml new file mode 100644 index 000000000..8fd6e899a --- /dev/null +++ 
b/kube/services/jobs/psql-db-copy-aurora-job.yaml @@ -0,0 +1,193 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: psql-db-copy-aurora +spec: + template: + metadata: + labels: + app: gen3job + spec: + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: karpenter.sh/capacity-type + operator: In + values: + - on-demand + - weight: 99 + preference: + matchExpressions: + - key: eks.amazonaws.com/capacityType + operator: In + values: + - ONDEMAND + serviceAccountName: psql-db-copy-sa + containers: + - name: pgdump + image: quay.io/cdis/awshelper:master + imagePullPolicy: Always + env: + - name: gen3Env + valueFrom: + configMapKeyRef: + name: global + key: environment + - name: JENKINS_HOME + value: "devterm" + - name: GEN3_HOME + value: /home/ubuntu/cloud-automation + - name: SOURCE_NAMESPACE + GEN3_SOURCE_NAMESPACE|-value: "staging"-| # Default value, should be overwritten by the environment variable + command: [ "/bin/bash" ] + args: + - "-c" + - | + # This script copies specified databases from a source namespace to the current namespace on the same Aurora RDS instance. + # + # This script requires the following to work properly: + # + # 1. Aurora server credentials must be present in the Gen3Secrets/creds.json file. + # These credentials should be present as a Kubernetes secret named "aurora-creds". + # This secret should contain the keys: db_host, db_username, db_password, and db_database. + # + # 2. The "gen3 psql aurora" command should be available to connect to the Aurora server. + # + # 3. The "gen3 secrets decode aurora-creds creds.json" command should work, allowing the script to decode the necessary secrets. + # + # 4. The source and the destination databases should be on the same Aurora instance. + # + # 5. The ServiceAccount, roles, and role binding must be set up using the script psql-db-copy-aurora-sa.yaml. 
+ # The psql-db-copy-aurora-sa.yaml script is configured for the default namespace. + # Modify the namespace as needed before applying it where the script will run. + # These can be created by executing the command: + # kubectl apply -f ${GEN3_HOME}/kube/services/jobs/psql-db-copy-aurora-sa.yaml + # + # How to run the script: + # gen3 job run psql-db-copy-aurora -v SOURCE_NAMESPACE + # + + source "${GEN3_HOME}/gen3/lib/utils.sh" + gen3_load "gen3/gen3setup" + namespace=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace) + date_str=$(date -u +%y%m%d_%H%M%S) + # Define the default databases to be copied + databases=( "indexd" "sheepdog" "metadata") + gen3_log_info "databases to be processed: ${databases[@]}" + source_namespace=$SOURCE_NAMESPACE + gen3_log_info "Source Namespace: $source_namespace" + + # find Aurora Server credentials + aurora_host_name=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_host') + aurora_master_username=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_username') + aurora_master_password=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_password') + aurora_database=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_database') + + # Verify important variables are present + if [ -z "$aurora_host_name" ] || [ -z "$aurora_master_username" ] || [ -z "$aurora_master_password" ] || [ -z "$aurora_database" ]; then + gen3_log_err "Aurora credentials are missing. Exiting." 
+ exit 1 + fi + + # Function to truncate to 63 characters + function truncate_identifier() { + local identifier=$1 + if [ ${#identifier} -gt 63 ]; then + echo "${identifier:0:63}" + else + echo "$identifier" + fi + } + + # Function to decode Kubernetes secrets + function secrets_decode() { + local namespace=$1 + local secret=$2 + local key=$3 + local secrets_value + + secrets_value=$(kubectl get secret -n $namespace $secret -o json 2>/dev/null | jq -r --arg key "$key" '.data[$key]' | base64 --decode --ignore-garbage 2>/dev/null) + if [ $? -ne 0 ] || [ -z "$secrets_value" ]; then + echo "Secret $secret in namespace $namespace not found or failed to decode" >&2 + return 1 + else + echo "$secrets_value" + fi + } + + # Array to hold the names of newly created databases + new_databases=() + + # Looping through each database + for database in "${databases[@]}"; do + source_creds="" + creds="" + + # Try to get the source and destination credentials with the "-g3auto" suffix and key "dbcreds.json" + source_creds=$(secrets_decode $source_namespace ${database}-g3auto dbcreds.json) + if [ $? -ne 0 ]; then + source_creds="" + fi + creds=$(secrets_decode $namespace ${database}-g3auto dbcreds.json) + if [ $? -ne 0 ]; then + creds="" + fi + + # If the "-g3auto" suffix didn't work for both source_creds and creds, try with the suffix "creds" and key "creds.json" + if [ -z "$source_creds" ] && [ -z "$creds" ]; then + source_creds=$(secrets_decode $source_namespace ${database}-creds creds.json) + if [ $? -ne 0 ]; then + source_creds="" + fi + creds=$(secrets_decode $namespace ${database}-creds creds.json) + if [ $? 
-ne 0 ]; then + creds="" + fi + fi + + # If we still couldn't get the credentials, log an error and continue to the next database + if [ -z "$source_creds" ] || [ -z "$creds" ]; then + gen3_log_err "Failed to extract database credentials for $database" + continue + fi + + source_db_database=$(echo $source_creds | jq -r .db_database) + db_username=$(echo $creds | jq -r .db_username) + db_database=$(echo $creds | jq -r .db_database) + + if [ -z "$source_db_database" ] || [ -z "$db_username" ] || [ -z "$db_database" ]; then + gen3_log_err "One or more required credentials are missing for $database. Skipping." + continue + fi + target_db=$(truncate_identifier $(echo "${database}_${namespace}_${date_str}" | tr '-' '_')) + gen3_log_info "Processing database: $database" + gen3_log_info "Source DB: $source_db_database, Username: $db_username, Current DB: $db_database, Target DB: $target_db" + + # DB commands + gen3 psql aurora -c "GRANT $db_username TO $aurora_master_username" + gen3 psql aurora -c "SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE pg_stat_activity.datname = '$source_db_database' AND pid <> pg_backend_pid()" + gen3 psql aurora -c "CREATE DATABASE $target_db WITH TEMPLATE $source_db_database OWNER $db_username" + pg_command="DO \$\$ DECLARE tbl record; BEGIN FOR tbl IN (SELECT table_schema || '.' || table_name AS full_table_name FROM information_schema.tables WHERE table_schema = 'public') LOOP EXECUTE 'ALTER TABLE ' || tbl.full_table_name || ' OWNER TO $db_username;'; END LOOP; END \$\$;" + PGPASSWORD=${aurora_master_password} psql -h $aurora_host_name -U $aurora_master_username -d "$target_db" -c "$pg_command" + if [ $? 
-eq 0 ]; then + gen3_log_info "Successfully processed $database" + new_databases+=("$target_db") + else + gen3_log_err "Failed to process $database" + fi + done + + gen3_log_info "Job Completed" + + # Print the list of newly created databases + gen3_log_info "Newly created Database Names::" + for new_db in "${new_databases[@]}"; do + gen3_log_info "$new_db" + done + + sleep 600 + restartPolicy: Never diff --git a/kube/services/jobs/psql-db-copy-aurora-sa.yaml b/kube/services/jobs/psql-db-copy-aurora-sa.yaml new file mode 100644 index 000000000..e6977a187 --- /dev/null +++ b/kube/services/jobs/psql-db-copy-aurora-sa.yaml @@ -0,0 +1,30 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: psql-db-copy-sa + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: psql-db-copy-role +rules: +- apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "watch", "list"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: psql-db-copy-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: psql-db-copy-role +subjects: +- kind: ServiceAccount + name: psql-db-copy-sa + namespace: default # Ensure this references the correct namespace + From d4e265183fa78277b5c77eb775cc39f300bfd762 Mon Sep 17 00:00:00 2001 From: Aidan Hilt <11202897+AidanHilt@users.noreply.github.com> Date: Wed, 17 Jul 2024 10:54:20 -0400 Subject: [PATCH 5/8] Adding an alert to all workflows if they get timed out (#2593) * Adding an alert to all workflows if they get timed out * Let's add some logic to create the secret * Let's just delete and recreate at all times * Changing image * Removing APK command --- gen3/bin/kube-setup-argo.sh | 12 ++++++++++++ kube/services/argo/values.yaml | 14 ++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/gen3/bin/kube-setup-argo.sh b/gen3/bin/kube-setup-argo.sh index 677f62257..1a25a98c8 100644 --- a/gen3/bin/kube-setup-argo.sh +++ 
b/gen3/bin/kube-setup-argo.sh @@ -204,6 +204,18 @@ EOF aws iam put-role-policy --role-name ${roleName} --policy-name ${internalBucketPolicy} --policy-document file://$internalBucketPolicyFile || true fi + # Create a secret for the slack webhook + alarm_webhook=$(g3kubectl get cm global -o yaml | yq .data.slack_alarm_webhook | tr -d '"') + + if [ -z "$alarm_webhook" ]; then + gen3_log_err "Please set a slack_alarm_webhook in the 'global' configmap. This is needed to alert for failed workflows." + exit 1 + fi + + g3kubectl -n argo delete secret slack-webhook-secret + g3kubectl -n argo create secret generic "slack-webhook-secret" --from-literal=SLACK_WEBHOOK_URL=$alarm_webhook + + ## if new bucket then do the following # Get the aws keys from secret # Create and attach lifecycle policy diff --git a/kube/services/argo/values.yaml b/kube/services/argo/values.yaml index c8178dd2a..eeb2e9e01 100644 --- a/kube/services/argo/values.yaml +++ b/kube/services/argo/values.yaml @@ -61,6 +61,20 @@ controller: workflowDefaults: spec: archiveLogs: true + onExit: alert-on-timeout + templates: + - name: alert-on-timeout + script: + image: quay.io/cdis/amazonlinux-debug:master + command: [sh] + envFrom: + - secretRef: + name: slack-webhook-secret + source: | + failure_reason=$(echo {{workflow.failures}} | jq 'any(.[]; .message == "Step exceeded its deadline")' ) + if [ "$failure_reason" ]; then + curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"ALERT: Workflow {{workflow.name}} has been killed due to timeout\"}" "$SLACK_WEBHOOK_URL" + fi # -- [Node selector] nodeSelector: From 9dff5360fdbef7dbcc01f57ecec4888145c252c6 Mon Sep 17 00:00:00 2001 From: Andrew Prokhorenkov Date: Fri, 19 Jul 2024 03:20:24 -0500 Subject: [PATCH 6/8] fix: bash comparison for true values in argo slack webhook notify (#2596) --- kube/services/argo/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kube/services/argo/values.yaml b/kube/services/argo/values.yaml index 
eeb2e9e01..c1e951773 100644 --- a/kube/services/argo/values.yaml +++ b/kube/services/argo/values.yaml @@ -72,7 +72,7 @@ controller: name: slack-webhook-secret source: | failure_reason=$(echo {{workflow.failures}} | jq 'any(.[]; .message == "Step exceeded its deadline")' ) - if [ "$failure_reason" ]; then + if [ "$failure_reason" = "true" ]; then curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"ALERT: Workflow {{workflow.name}} has been killed due to timeout\"}" "$SLACK_WEBHOOK_URL" fi From 63ecfc9f30466507110c61953c2683387b030b9b Mon Sep 17 00:00:00 2001 From: Hara Prasad Date: Fri, 19 Jul 2024 08:57:09 -0700 Subject: [PATCH 7/8] Update python to 3.9 in jenkins pods (#2597) * Update python to 3.9 in jenkins pods * fix version --- .pre-commit-config.yaml | 4 +- .secrets.baseline | 104 +++++------------- Docker/jenkins/Jenkins-CI-Worker/Dockerfile | 18 +-- .../Jenkins-CI-Worker/install-python3.8.sh | 8 -- .../Jenkins-CI-Worker/install-python3.9.sh | 8 ++ Docker/jenkins/Jenkins/Dockerfile | 18 +-- Docker/jenkins/Jenkins/install-python3.8.sh | 7 -- Docker/jenkins/Jenkins/install-python3.9.sh | 7 ++ Docker/jenkins/Jenkins2/Dockerfile | 18 +-- Docker/jenkins/Jenkins2/install-python3.8.sh | 7 -- Docker/jenkins/Jenkins2/install-python3.9.sh | 7 ++ 11 files changed, 77 insertions(+), 129 deletions(-) delete mode 100755 Docker/jenkins/Jenkins-CI-Worker/install-python3.8.sh create mode 100755 Docker/jenkins/Jenkins-CI-Worker/install-python3.9.sh delete mode 100755 Docker/jenkins/Jenkins/install-python3.8.sh create mode 100755 Docker/jenkins/Jenkins/install-python3.9.sh delete mode 100755 Docker/jenkins/Jenkins2/install-python3.8.sh create mode 100755 Docker/jenkins/Jenkins2/install-python3.9.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 82034495d..c3a384baa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: git@github.com:Yelp/detect-secrets - rev: v1.4.0 + rev: v1.5.0 hooks: - id: 
detect-secrets args: ['--baseline', '.secrets.baseline'] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.5.0 + rev: v4.6.0 hooks: - id: no-commit-to-branch args: [--branch, develop, --branch, master, --pattern, release/.*] diff --git a/.secrets.baseline b/.secrets.baseline index 4a300c57c..0cc95d0da 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -1,5 +1,5 @@ { - "version": "1.4.0", + "version": "1.5.0", "plugins_used": [ { "name": "ArtifactoryDetector" @@ -26,6 +26,9 @@ { "name": "GitHubTokenDetector" }, + { + "name": "GitLabTokenDetector" + }, { "name": "HexHighEntropyString", "limit": 3.0 @@ -36,6 +39,9 @@ { "name": "IbmCosHmacDetector" }, + { + "name": "IPPublicDetector" + }, { "name": "JwtTokenDetector" }, @@ -49,9 +55,15 @@ { "name": "NpmDetector" }, + { + "name": "OpenAIDetector" + }, { "name": "PrivateKeyDetector" }, + { + "name": "PypiTokenDetector" + }, { "name": "SendGridDetector" }, @@ -67,6 +79,9 @@ { "name": "StripeDetector" }, + { + "name": "TelegramBotTokenDetector" + }, { "name": "TwilioKeyDetector" } @@ -75,10 +90,6 @@ { "path": "detect_secrets.filters.allowlist.is_line_allowlisted" }, - { - "path": "detect_secrets.filters.common.is_baseline_file", - "filename": ".secrets.baseline" - }, { "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies", "min_level": 2 @@ -246,6 +257,15 @@ "line_number": 154 } ], + "files/lambda/test-security_alerts.py": [ + { + "type": "AWS Access Key", + "filename": "files/lambda/test-security_alerts.py", + "hashed_secret": "4e041fbfd5dd5918d3d5e968f5f739f815ae92da", + "is_verified": false, + "line_number": 5 + } + ], "files/scripts/psql-fips-fix.sh": [ { "type": "Secret Keyword", @@ -640,78 +660,6 @@ "line_number": 25 } ], - "gen3/test/terraformTest.sh": [ - { - "type": "Secret Keyword", - "filename": "gen3/test/terraformTest.sh", - "hashed_secret": "6b44a330b450ee550c081410c6b705dfeaa105ce", - "is_verified": false, - "line_number": 156 - }, - { - "type": "Secret 
Keyword", - "filename": "gen3/test/terraformTest.sh", - "hashed_secret": "d869db7fe62fb07c25a0403ecaea55031744b5fb", - "is_verified": false, - "line_number": 163 - }, - { - "type": "Base64 High Entropy String", - "filename": "gen3/test/terraformTest.sh", - "hashed_secret": "1cc07dccfdf640eb0e403e490a873a5536759009", - "is_verified": false, - "line_number": 172 - }, - { - "type": "Secret Keyword", - "filename": "gen3/test/terraformTest.sh", - "hashed_secret": "1cc07dccfdf640eb0e403e490a873a5536759009", - "is_verified": false, - "line_number": 172 - }, - { - "type": "Base64 High Entropy String", - "filename": "gen3/test/terraformTest.sh", - "hashed_secret": "185a71a740ef6b9b21c84e6eaa47b89c7de181ef", - "is_verified": false, - "line_number": 175 - }, - { - "type": "Secret Keyword", - "filename": "gen3/test/terraformTest.sh", - "hashed_secret": "185a71a740ef6b9b21c84e6eaa47b89c7de181ef", - "is_verified": false, - "line_number": 175 - }, - { - "type": "Secret Keyword", - "filename": "gen3/test/terraformTest.sh", - "hashed_secret": "212e1d3823c8c9af9e4c0c172164ee292b9a6768", - "is_verified": false, - "line_number": 311 - }, - { - "type": "Secret Keyword", - "filename": "gen3/test/terraformTest.sh", - "hashed_secret": "cb80dbb67a1a5bdf4957eea1473789f1c65357c6", - "is_verified": false, - "line_number": 312 - }, - { - "type": "Secret Keyword", - "filename": "gen3/test/terraformTest.sh", - "hashed_secret": "5f35c25f4bf588b5fad46e249fcd9221f5257ce4", - "is_verified": false, - "line_number": 313 - }, - { - "type": "Secret Keyword", - "filename": "gen3/test/terraformTest.sh", - "hashed_secret": "5308421b43dde5775f1993bd25a8163070d65598", - "is_verified": false, - "line_number": 314 - } - ], "kube/services/access-backend/access-backend-deploy.yaml": [ { "type": "Secret Keyword", @@ -3737,5 +3685,5 @@ } ] }, - "generated_at": "2024-07-05T21:37:59Z" + "generated_at": "2024-07-19T04:34:31Z" } diff --git a/Docker/jenkins/Jenkins-CI-Worker/Dockerfile 
b/Docker/jenkins/Jenkins-CI-Worker/Dockerfile index 6eeb8f4fd..9401e6a4b 100644 --- a/Docker/jenkins/Jenkins-CI-Worker/Dockerfile +++ b/Docker/jenkins/Jenkins-CI-Worker/Dockerfile @@ -83,21 +83,21 @@ RUN curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc| gpg --dearmor apt-get install -y postgresql-client-13 # Copy sh script responsible for installing Python -COPY install-python3.8.sh /root/tmp/install-python3.8.sh +COPY install-python3.9.sh /root/tmp/install-python3.9.sh -# Run the script responsible for installing Python 3.8.0 and link it to /usr/bin/python -RUN chmod +x /root/tmp/install-python3.8.sh; sync && \ - bash /root/tmp/install-python3.8.sh && \ - rm -rf /root/tmp/install-python3.8.sh && \ +# Run the script responsible for installing Python 3.9.19 and link it to /usr/bin/python +RUN chmod +x /root/tmp/install-python3.9.sh; sync && \ + bash /root/tmp/install-python3.9.sh && \ + rm -rf /root/tmp/install-python3.9.sh && \ unlink /usr/bin/python3 && \ - ln -s /usr/local/bin/python3.8 /usr/bin/python3 + ln -s /usr/local/bin/python3.9 /usr/bin/python3 # Fix shebang for lsb_release -RUN sed -i 's/python3/python3.8/' /usr/bin/lsb_release && \ - sed -i 's/python3/python3.8/' /usr/bin/add-apt-repository +RUN sed -i 's/python3/python3.9/' /usr/bin/lsb_release && \ + sed -i 's/python3/python3.9/' /usr/bin/add-apt-repository # install aws cli, poetry, pytest, etc. 
-RUN set -xe && python3.8 -m pip install --upgrade pip setuptools && python3.8 -m pip install awscli --upgrade && python3.8 -m pip install pytest --upgrade && python3.8 -m pip install poetry && python3.8 -m pip install PyYAML --upgrade && python3.8 -m pip install lxml --upgrade && python3.8 -m pip install yq --upgrade && python3.8 -m pip install datadog --upgrade +RUN set -xe && python3.9 -m pip install --upgrade pip setuptools && python3.9 -m pip install awscli --upgrade && python3.9 -m pip install pytest --upgrade && python3.9 -m pip install poetry && python3.9 -m pip install PyYAML --upgrade && python3.9 -m pip install lxml --upgrade && python3.9 -m pip install yq --upgrade && python3.9 -m pip install datadog --upgrade # install terraform RUN curl -o /tmp/terraform.zip https://releases.hashicorp.com/terraform/0.11.15/terraform_0.11.15_linux_amd64.zip \ diff --git a/Docker/jenkins/Jenkins-CI-Worker/install-python3.8.sh b/Docker/jenkins/Jenkins-CI-Worker/install-python3.8.sh deleted file mode 100755 index a01d59420..000000000 --- a/Docker/jenkins/Jenkins-CI-Worker/install-python3.8.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -wget https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tar.xz -tar xf Python-3.8.0.tar.xz -rm Python-3.8.0.tar.xz -cd Python-3.8.0 -./configure -make -make altinstall diff --git a/Docker/jenkins/Jenkins-CI-Worker/install-python3.9.sh b/Docker/jenkins/Jenkins-CI-Worker/install-python3.9.sh new file mode 100755 index 000000000..88b7596ae --- /dev/null +++ b/Docker/jenkins/Jenkins-CI-Worker/install-python3.9.sh @@ -0,0 +1,8 @@ +#!/bin/bash +wget https://www.python.org/ftp/python/3.9.19/Python-3.9.19.tar.xz +tar xf Python-3.9.19.tar.xz +rm Python-3.9.19.tar.xz +cd Python-3.9.19 +./configure +make +make altinstall diff --git a/Docker/jenkins/Jenkins/Dockerfile b/Docker/jenkins/Jenkins/Dockerfile index 535fdebc1..49c0f82b5 100644 --- a/Docker/jenkins/Jenkins/Dockerfile +++ b/Docker/jenkins/Jenkins/Dockerfile @@ -68,21 +68,21 @@ RUN 
DISTRO="$(lsb_release -c -s)" \ && rm -rf /var/lib/apt/lists/* # Copy sh script responsible for installing Python -COPY install-python3.8.sh /root/tmp/install-python3.8.sh +COPY install-python3.9.sh /root/tmp/install-python3.9.sh -# Run the script responsible for installing Python 3.8.0 and link it to /usr/bin/python -RUN chmod +x /root/tmp/install-python3.8.sh; sync && \ - ./root/tmp/install-python3.8.sh && \ - rm -rf /root/tmp/install-python3.8.sh && \ +# Run the script responsible for installing Python 3.9.19 and link it to /usr/bin/python +RUN chmod +x /root/tmp/install-python3.9.sh; sync && \ + ./root/tmp/install-python3.9.sh && \ + rm -rf /root/tmp/install-python3.9.sh && \ unlink /usr/bin/python3 && \ - ln -s /Python-3.8.0/python /usr/bin/python3 + ln -s /Python-3.9.19/python /usr/bin/python3 # Fix shebang for lsb_release -RUN sed -i 's/python3/python3.8/' /usr/bin/lsb_release && \ - sed -i 's/python3/python3.8/' /usr/bin/add-apt-repository +RUN sed -i 's/python3/python3.9/' /usr/bin/lsb_release && \ + sed -i 's/python3/python3.9/' /usr/bin/add-apt-repository # install aws cli, poetry, pytest, etc. 
-RUN set -xe && python3 -m pip install --upgrade pip && python3 -m pip install awscli --upgrade && python3 -m pip install pytest --upgrade && python3 -m pip install poetry && python3 -m pip install PyYAML --upgrade && python3 -m pip install lxml --upgrade && python3 -m pip install yq --upgrade +RUN set -xe && python3.9 -m pip install --upgrade pip && python3.9 -m pip install awscli --upgrade && python3.9 -m pip install pytest --upgrade && python3.9 -m pip install poetry && python3.9 -m pip install PyYAML --upgrade && python3.9 -m pip install lxml --upgrade && python3.9 -m pip install yq --upgrade # install chrome (supports headless mode) RUN set -xe \ diff --git a/Docker/jenkins/Jenkins/install-python3.8.sh b/Docker/jenkins/Jenkins/install-python3.8.sh deleted file mode 100755 index df21c66e5..000000000 --- a/Docker/jenkins/Jenkins/install-python3.8.sh +++ /dev/null @@ -1,7 +0,0 @@ -wget https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tar.xz -tar xf Python-3.8.0.tar.xz -rm Python-3.8.0.tar.xz -cd Python-3.8.0 -./configure -make -make altinstall diff --git a/Docker/jenkins/Jenkins/install-python3.9.sh b/Docker/jenkins/Jenkins/install-python3.9.sh new file mode 100755 index 000000000..83d7f17cd --- /dev/null +++ b/Docker/jenkins/Jenkins/install-python3.9.sh @@ -0,0 +1,7 @@ +wget https://www.python.org/ftp/python/3.9.19/Python-3.9.19.tar.xz +tar xf Python-3.9.19.tar.xz +rm Python-3.9.19.tar.xz +cd Python-3.9.19 +./configure +make +make altinstall diff --git a/Docker/jenkins/Jenkins2/Dockerfile b/Docker/jenkins/Jenkins2/Dockerfile index cd470268b..9e585ca0e 100644 --- a/Docker/jenkins/Jenkins2/Dockerfile +++ b/Docker/jenkins/Jenkins2/Dockerfile @@ -69,21 +69,21 @@ RUN DISTRO="$(lsb_release -c -s)" \ && rm -rf /var/lib/apt/lists/* # Copy sh script responsible for installing Python -COPY install-python3.8.sh /root/tmp/install-python3.8.sh +COPY install-python3.9.sh /root/tmp/install-python3.9.sh -# Run the script responsible for installing Python 3.8.0 and link it 
to /usr/bin/python -RUN chmod +x /root/tmp/install-python3.8.sh; sync && \ - ./root/tmp/install-python3.8.sh && \ - rm -rf /root/tmp/install-python3.8.sh && \ +# Run the script responsible for installing Python 3.9.19 and link it to /usr/bin/python +RUN chmod +x /root/tmp/install-python3.9.sh; sync && \ + ./root/tmp/install-python3.9.sh && \ + rm -rf /root/tmp/install-python3.9.sh && \ unlink /usr/bin/python3 && \ - ln -s /Python-3.8.0/python /usr/bin/python3 + ln -s /Python-3.9.19/python /usr/bin/python3 # Fix shebang for lsb_release -RUN sed -i 's/python3/python3.5/' /usr/bin/lsb_release && \ - sed -i 's/python3/python3.5/' /usr/bin/add-apt-repository +RUN sed -i 's/python3/python3.9/' /usr/bin/lsb_release && \ + sed -i 's/python3/python3.9/' /usr/bin/add-apt-repository # install aws cli, poetry, pytest, etc. -RUN set -xe && python3 -m pip install --upgrade pip && python3 -m pip install awscli --upgrade && python3 -m pip install pytest --upgrade && python3 -m pip install poetry && python3 -m pip install PyYAML --upgrade && python3 -m pip install lxml --upgrade && python3 -m pip install yq --upgrade +RUN set -xe && python3.9 -m pip install --upgrade pip && python3.9 -m pip install awscli --upgrade && python3.9 -m pip install pytest --upgrade && python3.9 -m pip install poetry && python3.9 -m pip install PyYAML --upgrade && python3.9 -m pip install lxml --upgrade && python3.9 -m pip install yq --upgrade # install chrome (supports headless mode) RUN set -xe \ diff --git a/Docker/jenkins/Jenkins2/install-python3.8.sh b/Docker/jenkins/Jenkins2/install-python3.8.sh deleted file mode 100755 index df21c66e5..000000000 --- a/Docker/jenkins/Jenkins2/install-python3.8.sh +++ /dev/null @@ -1,7 +0,0 @@ -wget https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tar.xz -tar xf Python-3.8.0.tar.xz -rm Python-3.8.0.tar.xz -cd Python-3.8.0 -./configure -make -make altinstall diff --git a/Docker/jenkins/Jenkins2/install-python3.9.sh b/Docker/jenkins/Jenkins2/install-python3.9.sh 
new file mode 100755 index 000000000..83d7f17cd --- /dev/null +++ b/Docker/jenkins/Jenkins2/install-python3.9.sh @@ -0,0 +1,7 @@ +wget https://www.python.org/ftp/python/3.9.19/Python-3.9.19.tar.xz +tar xf Python-3.9.19.tar.xz +rm Python-3.9.19.tar.xz +cd Python-3.9.19 +./configure +make +make altinstall From 522b2bb1f5ede3b47ec67765e6490234d75d5ee5 Mon Sep 17 00:00:00 2001 From: EliseCastle23 <109446148+EliseCastle23@users.noreply.github.com> Date: Fri, 19 Jul 2024 14:09:45 -0600 Subject: [PATCH 8/8] adding a backoff limit and job history limit to monitor cronjobs (#2598) --- kube/services/node-monitors/argo-monitors/argo-node-age.yaml | 3 +++ kube/services/node-monitors/fenceshib-jenkins-test.yaml | 3 +++ kube/services/node-monitors/node-not-ready.yaml | 3 +++ 3 files changed, 9 insertions(+) diff --git a/kube/services/node-monitors/argo-monitors/argo-node-age.yaml b/kube/services/node-monitors/argo-monitors/argo-node-age.yaml index b389c072c..7a60a32ce 100644 --- a/kube/services/node-monitors/argo-monitors/argo-node-age.yaml +++ b/kube/services/node-monitors/argo-monitors/argo-node-age.yaml @@ -5,8 +5,11 @@ metadata: namespace: default spec: schedule: "*/5 * * * *" + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 jobTemplate: spec: + backoffLimit: 4 template: metadata: labels: diff --git a/kube/services/node-monitors/fenceshib-jenkins-test.yaml b/kube/services/node-monitors/fenceshib-jenkins-test.yaml index e9e27af98..deaf26b3e 100644 --- a/kube/services/node-monitors/fenceshib-jenkins-test.yaml +++ b/kube/services/node-monitors/fenceshib-jenkins-test.yaml @@ -5,8 +5,11 @@ metadata: namespace: default spec: schedule: "0 */4 * * *" + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 jobTemplate: spec: + backoffLimit: 4 template: metadata: labels: diff --git a/kube/services/node-monitors/node-not-ready.yaml b/kube/services/node-monitors/node-not-ready.yaml index 15ed616e6..709dfc79e 100644 --- a/kube/services/node-monitors/node-not-ready.yaml +++ 
b/kube/services/node-monitors/node-not-ready.yaml @@ -5,8 +5,11 @@ metadata: namespace: default spec: schedule: "*/30 * * * *" + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 jobTemplate: spec: + backoffLimit: 4 template: metadata: labels: