diff --git a/Capfile b/Capfile new file mode 100644 index 0000000..1f07fb9 --- /dev/null +++ b/Capfile @@ -0,0 +1,15 @@ +# frozen_string_literal: true +# Load DSL and set up stages +require 'capistrano/setup' + +# Include default deployment tasks +require 'capistrano/deploy' +require 'capistrano/scm/git' +install_plugin Capistrano::SCM::Git + +require 'capistrano/maintenance' + +require 'dlss/docker/capistrano' + +# Load custom tasks from `lib/capistrano/tasks` if you have any defined +Dir.glob('lib/capistrano/tasks/*.rake').each { |r| import r } diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..d149072 --- /dev/null +++ b/Gemfile @@ -0,0 +1,8 @@ +# frozen_string_literal: true + +source 'https://rubygems.org' + +gem 'capistrano' +gem 'capistrano-maintenance', '~> 1.2', require: false +gem 'dlss-capistrano-docker', require: false +gem 'rake' diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..030f236 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,63 @@ +GEM + remote: https://rubygems.org/ + specs: + airbrussh (1.5.2) + sshkit (>= 1.6.1, != 1.7.0) + base64 (0.2.0) + bcrypt_pbkdf (1.1.1) + bcrypt_pbkdf (1.1.1-arm64-darwin) + bundler-audit (0.9.1) + bundler (>= 1.2.0, < 3) + thor (~> 1.0) + capistrano (3.19.0) + airbrussh (>= 1.0.0) + i18n + rake (>= 10.0.0) + sshkit (>= 1.9.0) + capistrano-bundle_audit (0.4.0) + bundler-audit (~> 0.5) + capistrano (~> 3.0) + capistrano-bundler (>= 1.4) + capistrano-bundler (2.1.0) + capistrano (~> 3.1) + capistrano-maintenance (1.2.1) + capistrano (>= 3.0) + capistrano-one_time_key (0.2.0) + capistrano (~> 3.0) + capistrano-shared_configs (0.2.2) + concurrent-ruby (1.3.3) + dlss-capistrano-docker (1.1.1) + bcrypt_pbkdf + capistrano (~> 3.0) + capistrano-bundle_audit (>= 0.3.0) + capistrano-one_time_key + capistrano-shared_configs + ed25519 + ed25519 (1.3.0) + i18n (1.14.5) + concurrent-ruby (~> 1.0) + net-scp (4.0.0) + net-ssh (>= 2.6.5, < 8.0.0) + net-sftp (4.0.0) + net-ssh (>= 5.0.0, < 8.0.0) + 
net-ssh (7.2.3) + rake (13.2.1) + sshkit (1.23.0) + base64 + net-scp (>= 1.1.2) + net-sftp (>= 2.1.2) + net-ssh (>= 2.8.0) + thor (1.3.1) + +PLATFORMS + arm64-darwin-23 + ruby + +DEPENDENCIES + capistrano + capistrano-maintenance (~> 1.2) + dlss-capistrano-docker + rake + +BUNDLED WITH + 2.5.14 diff --git a/README.md b/README.md index 143f393..f427ce0 100644 --- a/README.md +++ b/README.md @@ -120,3 +120,13 @@ pytest 1. Run linting: `ruff check` 2. Automatically fix linting: `ruff check --fix` 3. Run formatting: `ruff format` (or `ruff format --check` to identify any unformatted files) + +## Deployment + +Deployment to https://sul-rialto-airflow-dev.stanford.edu/ is handled like other SDR services using Capistrano. You'll need to have Ruby installed and then: + +``` +gem install bundler +bundle install +bundle exec cap dev deploy +``` diff --git a/config/deploy.rb b/config/deploy.rb new file mode 100644 index 0000000..c76c82a --- /dev/null +++ b/config/deploy.rb @@ -0,0 +1,55 @@ +# frozen_string_literal: true + +set :application, 'rialto-airflow' +set :repo_url, 'https://github.com/sul-dlss-labs/rialto-airflow.git' + +# Default branch is :master +ask :branch, `git rev-parse --abbrev-ref HEAD`.chomp + +# Default deploy_to directory is /var/www/my_app_name +set :deploy_to, "/opt/app/rialto/#{fetch(:application)}" + +# Default value for :format is :airbrussh. +# set :format, :airbrussh + +# You can configure the Airbrussh format using :format_options. +# These are the defaults. +# set :format_options, command_output: true, log_file: "log/capistrano.log", color: :auto, truncate: :auto + +# Default value for :log_level is :debug +set :log_level, :info + +# Default value for :pty is false +# set :pty, true + +# Only using capistrano for docker compose based deployment of a python app, so these aren't currently used.
+# set :linked_files, %w[config/honeybadger.yml] +# set :linked_dirs, %w[log config/settings public/system] +# set :dereference_dirs, %w[config/settings] + +# Default value for default_env is {} +# set :default_env, { path: "/opt/ruby/bin:$PATH" } + +# Default value for local_user is ENV['USER'] +# set :local_user, -> { `git config user.name`.chomp } + +# Default value for keep_releases is 5 +# set :keep_releases, 5 + +# Uncomment the following to require manually verifying the host key before first deploy. +# set :ssh_options, verify_host_key: :secure + +# honeybadger_env otherwise defaults to rails_env +set :honeybadger_env, fetch(:stage) + +# Set Rails env to production in all Cap environments +set :rails_env, 'production' + +set :docker_compose_file, 'docker-compose.prod.yaml' +set :docker_compose_migrate_use_hooks, false +set :docker_compose_seed_use_hooks, false +set :docker_compose_rabbitmq_use_hooks, false +set :docker_compose_build_use_hooks, false +set :docker_compose_restart_use_hooks, true +set :docker_compose_copy_assets_use_hooks, false +set :honeybadger_use_hooks, false diff --git a/config/deploy/dev.rb b/config/deploy/dev.rb new file mode 100644 index 0000000..e685778 --- /dev/null +++ b/config/deploy/dev.rb @@ -0,0 +1,7 @@ +# frozen_string_literal: true + +# Roles are passed to docker-compose as profiles. +server 'sul-rialto-airflow-dev.stanford.edu', user: 'rialto', roles: %w[app] + +Capistrano::OneTimeKey.generate_one_time_key! + diff --git a/docker-compose.prod.yaml b/docker-compose.prod.yaml new file mode 100644 index 0000000..928fff4 --- /dev/null +++ b/docker-compose.prod.yaml @@ -0,0 +1,231 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. +# +# WARNING: This configuration is for local development. Do not use it in a production deployment. +# +# This configuration supports basic configuration using environment variables or an .env file +# The following variables are supported: +# +# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. +# Default: apache/airflow:2.9.2 +# AIRFLOW_UID - User ID in Airflow containers +# Default: 50000 +# AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed. +# Default: . +# Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode +# +# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). +# Default: airflow +# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). +# Default: airflow +# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. +# Use this option ONLY for quick checks. Installing requirements at container +# startup is done EVERY TIME the service is started. +# A better way is to build a custom image or extend the official image +# as described in https://airflow.apache.org/docs/docker-stack/build.html. +# Default: '' +# +# Feel free to modify this file to suit your needs. 
+--- +x-airflow-common: + &airflow-common + # In order to add custom dependencies or upgrade provider packages you can use your extended image. + # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml + # and uncomment the "build" line below, Then run `docker-compose build` to build the images. + #image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.9.2} + build: . + environment: + &airflow-common-env + AIRFLOW_UID: 503 + AIRFLOW_GID: 0 + AIRFLOW__CORE__EXECUTOR: CeleryExecutor + AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: "postgresql+psycopg2://${DATABASE_USERNAME}:${DATABASE_PASSWORD}@${DATABASE_HOSTNAME}/rialto-airflow" + AIRFLOW__CELERY__RESULT_BACKEND: "db+postgresql://${DATABASE_USERNAME}:${DATABASE_PASSWORD}@${DATABASE_HOSTNAME}/rialto-airflow" + AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 + AIRFLOW__CORE__FERNET_KEY: '' + AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' + AIRFLOW__CORE__LOAD_EXAMPLES: 'false' + AIRFLOW__CORE__DAGS_FOLDER: '/opt/airflow/rialto_airflow/dags' + AIRFLOW__CORE__PLUGINS_FOLDER: '/opt/airflow/rialto_airflow/plugins' + AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth' + AIRFLOW__SMTP__SMTP_USER: ${AIRFLOW__SMTP__SMTP_USER} + AIRFLOW__SMTP__SMTP_HOST: ${AIRFLOW__SMTP__SMTP_HOST} + AIRFLOW__SMTP__SMTP_PASSWORD: '' + AIRFLOW__SMTP__SMTP_MAIL_FROM: ${AIRFLOW__SMTP__SMTP_MAIL_FROM} + AIRFLOW__WEBSERVER__SECRET_KEY: ${AIRFLOW__WEBSERVER__SECRET_KEY} + # yamllint disable rule:line-length + # Use simple http server on scheduler for health checks + # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server + # yamllint enable rule:line-length + AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' + # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks + # for other purpose (development, test and especially production usage) build/extend Airflow image. 
+ _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} + # The following line can be used to set a custom config file, stored in the local config folder + # If you want to use it, uncomment it and replace airflow.cfg with the name of your config file + # AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg' + AIRFLOW_VAR_DIMENSIONS_API_USER: ${AIRFLOW_VAR_DIMENSIONS_API_USER} + AIRFLOW_VAR_DIMENSIONS_API_PASS: ${AIRFLOW_VAR_DIMENSIONS_API_PASS} + AIRFLOW_VAR_SUL_PUB_HOST: ${AIRFLOW_VAR_SUL_PUB_HOST} + AIRFLOW_VAR_SUL_PUB_KEY: ${AIRFLOW_VAR_SUL_PUB_KEY} + AIRFLOW_VAR_DATA_DIR: /opt/airflow/data + volumes: + - /opt/app/rialto/rialto-airflow/current/rialto_airflow:/opt/airflow/rialto_airflow + - /data:/opt/airflow/data + - /opt/app/rialto/rialto-airflow/shared/logs:/opt/airflow/logs + user: "503:0" + depends_on: + &airflow-common-depends-on + redis: + condition: service_healthy + +services: + redis: + # Redis is limited to 7.2-bookworm due to licencing change + # https://redis.io/blog/redis-adopts-dual-source-available-licensing/ + image: redis:7.2-bookworm + expose: + - 6379 + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 30s + retries: 50 + start_period: 30s + restart: always + + airflow-webserver: + <<: *airflow-common + command: webserver + ports: + - "3000:8080" + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-scheduler: + <<: *airflow-common + command: scheduler + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:8974/health"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-worker: + <<: *airflow-common + command: celery worker +
healthcheck: + # yamllint disable rule:line-length + test: + - "CMD-SHELL" + - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + environment: + <<: *airflow-common-env + # Required to handle warm shutdown of the celery workers properly + # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation + DUMB_INIT_SETSID: "0" + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-triggerer: + <<: *airflow-common + command: triggerer + healthcheck: + test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-init: + <<: *airflow-common + entrypoint: /bin/bash + environment: + <<: *airflow-common-env + _AIRFLOW_DB_MIGRATE: 'true' + _AIRFLOW_WWW_USER_CREATE: 'true' + _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} + _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} + command: + - -c + - exec /entrypoint airflow version + + + airflow-cli: + <<: *airflow-common + profiles: + - debug + environment: + <<: *airflow-common-env + CONNECTION_CHECK_MAX_COUNT: "0" + # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 + command: + - bash + - -c + - airflow + + # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up + # or by explicitly targeted on the command line e.g. docker-compose up flower. 
+ # See: https://docs.docker.com/compose/profiles/ + flower: + <<: *airflow-common + command: celery flower + profiles: + - flower + ports: + - "5555:5555" + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:5555/"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully diff --git a/docker-compose.yaml b/docker-compose.yaml index 8692160..1c237d7 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -87,7 +87,7 @@ x-airflow-common: # they can persist across capistrano deploys? - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs - ${AIRFLOW_PROJ_DIR:-.}/data:/opt/airflow/data - user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-0}" + user: "503:0" depends_on: &airflow-common-depends-on redis: diff --git a/rialto_airflow/dags/harvest.py b/rialto_airflow/dags/harvest.py index 3b2481e..94db713 100644 --- a/rialto_airflow/dags/harvest.py +++ b/rialto_airflow/dags/harvest.py @@ -15,7 +15,7 @@ sul_pub_key = Variable.get("sul_pub_key") # to artificially limit the API activity in development -dev_limit = Variable.get("dev_limit") +dev_limit = Variable.get("dev_limit", default_var=None) if dev_limit is not None: dev_limit = int(dev_limit)