From 59eb7d8530a79efb0b84396256625c087fc9e25e Mon Sep 17 00:00:00 2001 From: iizitounene Date: Thu, 28 Mar 2024 11:40:22 +0100 Subject: [PATCH 01/17] feat(spark-base): Add new spark-base image (java/scala only) without okdp extensions --- spark-base/Dockerfile | 81 ++++++++++++++++++++++ spark-base/entrypoint.sh | 142 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 223 insertions(+) create mode 100644 spark-base/Dockerfile create mode 100644 spark-base/entrypoint.sh diff --git a/spark-base/Dockerfile b/spark-base/Dockerfile new file mode 100644 index 0000000..78326f5 --- /dev/null +++ b/spark-base/Dockerfile @@ -0,0 +1,81 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +ARG JAVA_VERSION=11 +ARG BASE_IMAGE=eclipse-temurin:${JAVA_VERSION}-jre-jammy +FROM $BASE_IMAGE + +ARG spark_uid=185 + +ARG SPARK_VERSION=3.2.1 +ARG HADOOP_VERSION=3.2 +ARG SCALA_VERSION=2.12 +ARG SPARK_DIST_DOWNLOAD_URL=https://archive.apache.org/dist/spark + +ENV SPARK_HOME /opt/spark +ENV SPARK_CONF_DIR ${SPARK_HOME}/conf + +ENV SPARK_VERSION ${SPARK_VERSION} +ENV HADOOP_VERSION ${HADOOP_VERSION} +ENV SCALA_VERSION ${SCALA_VERSION} + +RUN groupadd --system --gid=${spark_uid} spark && \ + useradd --system --uid=${spark_uid} --gid=spark spark + +RUN set -ex; \ + apt-get update; \ + ln -s /lib /lib64; \ + apt install -y --no-install-recommends gnupg2 bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu libnss-wrapper curl; \ + mkdir -p ${SPARK_HOME}; \ + mkdir -p ${SPARK_HOME}/work-dir; \ + chmod g+w ${SPARK_HOME}/work-dir; \ + chown -R spark:spark ${SPARK_HOME}; \ + rm /bin/sh; \ + ln -sv /bin/bash /bin/sh; \ + echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su; \ + chgrp root /etc/passwd && chmod ug+rw /etc/passwd; \ + rm -rf /var/cache/apt/* && rm -rf /var/lib/apt/lists/* + +RUN set -ex;\ + export WORK_DIR="$(mktemp -d)"; \ + DIST=spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}; \ + if [ "${SCALA_VERSION}" = "2.13" ]; then \ + DIST+=-scala${SCALA_VERSION}; \ + fi; \ + curl --retry 3 --retry-all-errors -k ${SPARK_DIST_DOWNLOAD_URL}/${DIST}.tgz -o ${WORK_DIR}/spark.tgz; \ + curl --retry 3 --retry-all-errors -k ${SPARK_DIST_DOWNLOAD_URL}/${DIST}.tgz.asc -o ${WORK_DIR}/spark.tgz.asc; \ + curl --retry 3 --retry-all-errors -k https://downloads.apache.org/spark/KEYS -o ${WORK_DIR}/KEYS; \ + export GNUPGHOME="$(mktemp -d)"; \ + gpg --batch --import ${WORK_DIR}/KEYS; \ + gpg --batch --verify ${WORK_DIR}/spark.tgz.asc ${WORK_DIR}/spark.tgz; \ + tar --strip-components=1 -zxvf ${WORK_DIR}/spark.tgz -C ${SPARK_HOME}/; \ + chown -R spark:spark ${SPARK_HOME}/; \ + mv ${SPARK_HOME}/kubernetes/dockerfiles/spark/decom.sh 
/opt/; \
+ mv ${SPARK_HOME}/kubernetes/tests ${SPARK_HOME}/; \
+ chmod a+x /opt/decom.sh; \
+ gpgconf --kill all; \
+ rm -rf ${GNUPGHOME} ${WORK_DIR}; \
+ rm -fr ${SPARK_HOME}/conf; rm -fr ${SPARK_HOME}/yarn; rm -fr ${SPARK_HOME}/kubernetes
+
+COPY entrypoint.sh /opt/entrypoint.sh
+RUN chmod a+x /opt/entrypoint.sh
+
+WORKDIR ${SPARK_HOME}/work-dir
+
+USER spark
+
+ENTRYPOINT [ "/opt/entrypoint.sh" ]
+
diff --git a/spark-base/entrypoint.sh b/spark-base/entrypoint.sh
new file mode 100644
index 0000000..75611a9
--- /dev/null
+++ b/spark-base/entrypoint.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Prevent any errors from being silently ignored
+set -eo pipefail
+
+attempt_setup_fake_passwd_entry() {
+ # Check whether there is a passwd entry for the container UID
+ local myuid; myuid="$(id -u)"
+ # If there is no passwd entry for the container UID, attempt to fake one
+ # You can also refer to the https://github.com/docker-library/official-images/pull/13089#issuecomment-1534706523
+ # It's to resolve OpenShift random UID case.
+ # See also: https://github.com/docker-library/postgres/pull/448
+ if ! 
getent passwd "$myuid" &> /dev/null; then
+ local wrapper
+ for wrapper in {/usr,}/lib{/*,}/libnss_wrapper.so; do
+ if [ -s "$wrapper" ]; then
+ NSS_WRAPPER_PASSWD="$(mktemp)"
+ NSS_WRAPPER_GROUP="$(mktemp)"
+ export LD_PRELOAD="$wrapper" NSS_WRAPPER_PASSWD NSS_WRAPPER_GROUP
+ local mygid; mygid="$(id -g)"
+ printf 'spark:x:%s:%s:%s:%s:/bin/false\n' "$myuid" "$mygid" "${SPARK_USER_NAME:-anonymous uid}" "$SPARK_HOME" > "$NSS_WRAPPER_PASSWD"
+ printf 'spark:x:%s:\n' "$mygid" > "$NSS_WRAPPER_GROUP"
+ break
+ fi
+ done
+ fi
+}
+
+if [ -z "$JAVA_HOME" ]; then
+ JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}')
+fi
+
+SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*"
+for v in "${!SPARK_JAVA_OPT_@}"; do
+ SPARK_EXECUTOR_JAVA_OPTS+=( "${!v}" )
+done
+
+if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
+ SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
+fi
+
+if ! [ -z "${PYSPARK_PYTHON+x}" ]; then
+ export PYSPARK_PYTHON
+fi
+if ! [ -z "${PYSPARK_DRIVER_PYTHON+x}" ]; then
+ export PYSPARK_DRIVER_PYTHON
+fi
+
+# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor.
+# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s.
+if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
+ export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
+fi
+
+if ! [ -z "${HADOOP_CONF_DIR+x}" ]; then
+ SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
+fi
+
+if ! [ -z "${SPARK_CONF_DIR+x}" ]; then
+ SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH";
+elif ! 
[ -z "${SPARK_HOME+x}" ]; then + SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; +fi + +# SPARK-43540: add current working directory into executor classpath +SPARK_CLASSPATH="$SPARK_CLASSPATH:$PWD" + +# Switch to spark if no USER specified (root by default) otherwise use USER directly +switch_spark_if_root() { + if [ $(id -u) -eq 0 ]; then + echo gosu spark + fi +} + +spark_3_2_support(){ + if ! printf '%s\n%s' "$1" "$2" | sort -C -V + then + # + 3.3.0 + echo "org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend --podName $SPARK_EXECUTOR_POD_NAME" + else + # -3.0.0 + echo "org.apache.spark.executor.CoarseGrainedExecutorBackend" + fi +} + +KUBERNETES_EXECUTOR_BACKEND="$(spark_3_2_support $SPARK_VERSION '3.2.4')" + +case "$1" in + driver) + shift 1 + CMD=( + "$SPARK_HOME/bin/spark-submit" + --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" + --conf "spark.executorEnv.SPARK_DRIVER_POD_IP=$SPARK_DRIVER_BIND_ADDRESS" + --deploy-mode client + "$@" + ) + attempt_setup_fake_passwd_entry + # Execute the container CMD under tini for better hygiene + exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" + ;; + executor) + shift 1 + CMD=( + ${JAVA_HOME}/bin/java + "${SPARK_EXECUTOR_JAVA_OPTS[@]}" + -Xms"$SPARK_EXECUTOR_MEMORY" + -Xmx"$SPARK_EXECUTOR_MEMORY" + -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" + $KUBERNETES_EXECUTOR_BACKEND + --driver-url "$SPARK_DRIVER_URL" + --executor-id "$SPARK_EXECUTOR_ID" + --cores "$SPARK_EXECUTOR_CORES" + --app-id "$SPARK_APPLICATION_ID" + --hostname "$SPARK_EXECUTOR_POD_IP" + --resourceProfileId "$SPARK_RESOURCE_PROFILE_ID" + ) + attempt_setup_fake_passwd_entry + # Execute the container CMD under tini for better hygiene + exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" + ;; + + *) + # Non-spark-on-k8s command provided, proceeding in pass-through mode... 
+ exec "$@" + ;; +esac From 1d935f744ede3160eba6fb4d05dd8b71e9bed991 Mon Sep 17 00:00:00 2001 From: iizitounene Date: Thu, 28 Mar 2024 11:44:43 +0100 Subject: [PATCH 02/17] feat(spark): Add new spark image (java/scala only) with okdp extensions (aws/minio, prometheus java agent, okdp-spark-auth-filter) --- spark/Dockerfile | 64 ++++++++++++++++++++ spark/metrics.properties | 19 ++++++ spark/okdp-addons.pom | 111 +++++++++++++++++++++++++++++++++++ spark/prometheus.yaml | 123 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 317 insertions(+) create mode 100644 spark/Dockerfile create mode 100644 spark/metrics.properties create mode 100644 spark/okdp-addons.pom create mode 100644 spark/prometheus.yaml diff --git a/spark/Dockerfile b/spark/Dockerfile new file mode 100644 index 0000000..dc9a504 --- /dev/null +++ b/spark/Dockerfile @@ -0,0 +1,64 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+ARG SPARK_VERSION=3.2.1
+ARG HADOOP_VERSION=3.2
+ARG SCALA_VERSION=2.12
+ARG JAVA_VERSION=11
+
+ARG REGISTRY=quay.io
+ARG REPO=okdp
+ARG BASE_IMAGE=${REGISTRY}/${REPO}/spark:base-spark-${SPARK_VERSION}-scala-${SCALA_VERSION}-java-${JAVA_VERSION}
+
+FROM eclipse-temurin:${JAVA_VERSION}-jre-jammy AS okdp_addons
+ARG SPARK_VERSION=3.2.1
+ARG SCALA_VERSION=2.12
+
+RUN set -ex; \
+ apt-get update; \
+ apt install -y --no-install-recommends maven
+
+WORKDIR /workspace
+
+COPY okdp-addons.pom deps/pom.xml
+
+# The setup consumes less space compare to inheriting from the parent pom
+# Handles the transitive dependencies versions through the pom
+# Manage Java AWS SDK v1 (hadoop <3.4)/V2 (hadoop >=3.4)
+RUN mvn dependency:get -DgroupId=org.apache.spark -DartifactId=spark-parent_${SCALA_VERSION} -Dversion=${SPARK_VERSION} -Dpackaging=pom; \
+ mvn dependency:copy -Dartifact=org.apache.spark:spark-parent_${SCALA_VERSION}:${SPARK_VERSION}:pom -Dproject.basedir=./ -DoutputDirectory=./; \
+ HADOOP_VERSION=$(grep "<hadoop.version>" spark-parent_${SCALA_VERSION}-${SPARK_VERSION}.pom | sed -e 's/^ *<hadoop.version>\(.*\)<\/hadoop.version> *$/\1/'|head -1); \
+ mv ./deps/pom.xml .; \
+ mvn clean dependency:copy-dependencies \
+ -Dspark.version=${SPARK_VERSION} \
+ -Dscala.version=${SCALA_VERSION} \
+ -Dhadoop.version=${HADOOP_VERSION} \
+ -Paws
+
+FROM $BASE_IMAGE
+
+ENV JMX_CONF_DIR /etc/metrics/conf/
+
+# OKDP addons
+COPY --from=okdp_addons --chown=spark:spark /workspace/target/dependency/* $SPARK_HOME/jars
+RUN chown -R spark:spark ${SPARK_HOME}/jars/
+
+# Jmx prometheus metrics
+COPY --chown=spark:spark metrics.properties ${JMX_CONF_DIR}/metrics.properties
+COPY --chown=spark:spark prometheus.yaml ${JMX_CONF_DIR}/prometheus.yaml
+
+USER spark
+
diff --git a/spark/metrics.properties b/spark/metrics.properties
new file mode 100644
index 0000000..9640deb
--- /dev/null
+++ b/spark/metrics.properties
@@ -0,0 +1,19 @@
+#
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink +driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource +executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource \ No newline at end of file diff --git a/spark/okdp-addons.pom b/spark/okdp-addons.pom new file mode 100644 index 0000000..eb94065 --- /dev/null +++ b/spark/okdp-addons.pom @@ -0,0 +1,111 @@ + + + + + 4.0.0 + OKDP Addons + io.okdp + okdp-spark-docker-addons + ${spark.version} + pom + + OKDP extensions for spark docker images + + + UTF-8 + + + + + + io.okdp + okdp-spark-auth-filter + 1.1.0 + + + * + * + + + + + + io.prometheus.jmx + jmx_prometheus_javaagent + 0.20.0 + + + * + * + + + + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + + + * + * + + + + + + + + aws + + + org.apache.hadoop + hadoop-aws + ${hadoop.version} + + + + org.apache.spark + spark-hadoop-cloud_${scala.version} + ${spark.version} + + + * + * + + + + + + + \ No newline at end of file diff --git a/spark/prometheus.yaml b/spark/prometheus.yaml new file mode 100644 index 0000000..f054e28 --- /dev/null +++ b/spark/prometheus.yaml @@ -0,0 +1,123 @@ +# +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +--- +lowercaseOutputName: true +attrNameSnakeCase: true +rules: + # These come from the application driver if it's a streaming application + # Example: default/streaming.driver.com.example.ClassName.StreamingMetrics.streaming.lastCompletedBatch_schedulingDelay + - pattern: metrics<>Value + name: spark_streaming_driver_$4 + labels: + app_namespace: "$1" + app_id: "$2" + # These come from the application driver if it's a structured streaming application + # Example: default/streaming.driver.spark.streaming.QueryName.inputRate-total + - pattern: metrics<>Value + name: spark_structured_streaming_driver_$4 + labels: + app_namespace: "$1" + app_id: "$2" + query_name: "$3" + # These come from the application executors + # Example: default/spark-pi.0.executor.threadpool.activeTasks + - pattern: metrics<>Value + name: spark_executor_$4 + type: GAUGE + labels: + app_namespace: "$1" + app_id: "$2" + executor_id: "$3" + # These come from the application driver + # Example: default/spark-pi.driver.DAGScheduler.stage.failedStages + - pattern: metrics<>Value + name: spark_driver_$3_$4 + type: GAUGE + labels: + app_namespace: "$1" + app_id: "$2" + # These come from the application driver + # Emulate timers for DAGScheduler like messagePRocessingTime + - pattern: metrics<>Count + name: spark_driver_DAGScheduler_$3_count + type: COUNTER + labels: + app_namespace: "$1" + app_id: "$2" + # HiveExternalCatalog is of type counter + - pattern: metrics<>Count + name: spark_driver_HiveExternalCatalog_$3_count + type: COUNTER + labels: + app_namespace: "$1" + app_id: "$2" + # 
These come from the application driver + # Emulate histograms for CodeGenerator + - pattern: metrics<>Count + name: spark_driver_CodeGenerator_$3_count + type: COUNTER + labels: + app_namespace: "$1" + app_id: "$2" + # These come from the application driver + # Emulate timer (keep only count attribute) plus counters for LiveListenerBus + - pattern: metrics<>Count + name: spark_driver_LiveListenerBus_$3_count + type: COUNTER + labels: + app_namespace: "$1" + app_id: "$2" + # Get Gauge type metrics for LiveListenerBus + - pattern: metrics<>Value + name: spark_driver_LiveListenerBus_$3 + type: GAUGE + labels: + app_namespace: "$1" + app_id: "$2" + # Executors counters + - pattern: metrics<>Count + name: spark_executor_$4_count + type: COUNTER + labels: + app_namespace: "$1" + app_id: "$2" + executor_id: "$3" + # These come from the application executors + # Example: app-20160809000059-0000.0.jvm.threadpool.activeTasks + - pattern: metrics<>Value + name: spark_executor_$4_$5 + type: GAUGE + labels: + app_namespace: "$1" + app_id: "$2" + executor_id: "$3" + - pattern: metrics<>Count + name: spark_executor_HiveExternalCatalog_$4_count + type: COUNTER + labels: + app_namespace: "$1" + app_id: "$2" + executor_id: "$3" + # These come from the application driver + # Emulate histograms for CodeGenerator + - pattern: metrics<>Count + name: spark_executor_CodeGenerator_$4_count + type: COUNTER + labels: + app_namespace: "$1" + app_id: "$2" + executor_id: "$3" \ No newline at end of file From ad94e07da550eb00e2228b539d166450ef2e1a4d Mon Sep 17 00:00:00 2001 From: iizitounene Date: Thu, 28 Mar 2024 11:47:45 +0100 Subject: [PATCH 03/17] feat(spark-py): Add new spark-py image with python basic requirements --- spark-py/Dockerfile | 40 +++++++++++++++++++++++++++++++++++++++ spark-py/requirements.txt | 0 2 files changed, 40 insertions(+) create mode 100644 spark-py/Dockerfile create mode 100644 spark-py/requirements.txt diff --git a/spark-py/Dockerfile b/spark-py/Dockerfile new file 
mode 100644
index 0000000..ae9239f
--- /dev/null
+++ b/spark-py/Dockerfile
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+ARG SPARK_VERSION=3.2.1
+ARG HADOOP_VERSION=3.2
+ARG SCALA_VERSION=2.12
+ARG JAVA_VERSION=11
+
+ARG REGISTRY=quay.io
+ARG REPO=okdp
+ARG BASE_IMAGE=${REGISTRY}/${REPO}/spark:spark-${SPARK_VERSION}-scala-${SCALA_VERSION}-java-${JAVA_VERSION}
+
+FROM $BASE_IMAGE
+
+ARG PYTHON_VERSION=3.11
+
+USER root
+
+COPY requirements.txt . 
+ +RUN set -ex; \ + apt-get update; \ + apt-get install -y --no-install-recommends python${PYTHON_VERSION} python3-pip; \ + pip install -r requirements.txt; \ + rm -rf /var/lib/apt/lists/* requirements.txt + +USER spark diff --git a/spark-py/requirements.txt b/spark-py/requirements.txt new file mode 100644 index 0000000..e69de29 From a9c0880ebdd7c67c09e4150740bfb0be57a9d9f3 Mon Sep 17 00:00:00 2001 From: iizitounene Date: Thu, 28 Mar 2024 11:48:43 +0100 Subject: [PATCH 04/17] feat(spark-r): Add new spark-r image with R basic requirements --- spark-r/Dockerfile | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 spark-r/Dockerfile diff --git a/spark-r/Dockerfile b/spark-r/Dockerfile new file mode 100644 index 0000000..a124df7 --- /dev/null +++ b/spark-r/Dockerfile @@ -0,0 +1,37 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +ARG SPARK_VERSION=3.2.1 +ARG HADOOP_VERSION=3.2 +ARG SCALA_VERSION=2.12 +ARG JAVA_VERSION=11 + +ARG REGISTRY=quay.io +ARG REPO=okdp +ARG BASE_IMAGE=${REGISTRY}/${REPO}/spark:spark-${SPARK_VERSION}-scala-${SCALA_VERSION}-java-${JAVA_VERSION} + +FROM $BASE_IMAGE + +USER root + +RUN set -ex; \ + apt update; \ + apt-get install -y --no-install-recommends r-base r-base-dev; \ + rm -rf /var/lib/apt/lists/* + +ENV R_HOME /usr/lib/R + +USER spark From 2201f4561d4478b3ff5ec7812e1d4bcb75f1884c Mon Sep 17 00:00:00 2001 From: iizitounene Date: Thu, 28 Mar 2024 12:21:22 +0100 Subject: [PATCH 05/17] Build pipeline - Add basic pipeline to build and push into CI registry --- .build/images.yml | 45 +++++++ .github/actions/free-disk-space/action.yml | 40 +++++++ .github/actions/setup-buildx/action.yaml | 29 +++++ .github/actions/spark-image-tag/action.yaml | 99 ++++++++++++++++ .github/workflows/build-image-template.yml | 124 ++++++++++++++++++++ .github/workflows/build-images-template.yml | 99 ++++++++++++++++ .github/workflows/ci.yml | 70 +++++++++++ .gitignore | 32 +++++ 8 files changed, 538 insertions(+) create mode 100644 .build/images.yml create mode 100644 .github/actions/free-disk-space/action.yml create mode 100644 .github/actions/setup-buildx/action.yaml create mode 100644 .github/actions/spark-image-tag/action.yaml create mode 100644 .github/workflows/build-image-template.yml create mode 100644 .github/workflows/build-images-template.yml create mode 100644 .github/workflows/ci.yml create mode 100644 .gitignore diff --git a/.build/images.yml b/.build/images.yml new file mode 100644 index 0000000..4124759 --- /dev/null +++ b/.build/images.yml @@ -0,0 +1,45 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +images: + - name: docker.io/eclipse-temurin + tags: + - ${java_version}-jre-jammy + - name: spark-base + dependsOn: docker.io/eclipse-temurin + tags: + - spark-${spark_version}-scala-${scala_version}-java-${java_version} + - spark-${spark_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') + - name: spark + dependsOn: spark-base + tags: + - spark-${spark_version}-scala-${scala_version}-java-${java_version} + - spark-${spark_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') + - name: spark-py + dependsOn: spark + tags: + - spark-${spark_version}-python-${python_version}-scala-${scala_version}-java-${java_version} + - spark-${spark_version}-python-${python_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') + - name: spark-r + dependsOn: spark + tags: + - spark-${spark_version}-r-${r_version}-scala-${scala_version}-java-${java_version} + - spark-${spark_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') + - name: spark-py-r + dependsOn: spark-py + tags: + - spark-${spark_version}-python-${python_version}-r-${r_version}-scala-${scala_version}-java-${java_version} + - spark-${spark_version}-python-${python_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') diff --git a/.github/actions/free-disk-space/action.yml b/.github/actions/free-disk-space/action.yml new file mode 100644 index 0000000..4f661da --- /dev/null +++ b/.github/actions/free-disk-space/action.yml @@ -0,0 +1,40 @@ +# +# Copyright 2024 tosit.io +# +# 
Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: Free disk space +description: Free Github runnner disk space + +runs: + using: composite + steps: + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + # this might remove tools that are actually needed, + # if set to "true" but frees about 6 GB + tool-cache: false + + # all of these default to true, but feel free to set to + # "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true + + + diff --git a/.github/actions/setup-buildx/action.yaml b/.github/actions/setup-buildx/action.yaml new file mode 100644 index 0000000..6de6665 --- /dev/null +++ b/.github/actions/setup-buildx/action.yaml @@ -0,0 +1,29 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: Set up QEMU and Docker Buildx +description: Set up Docker Buildx + +runs: + using: composite + steps: + - name: Set up QEMU 📦 + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx 📦 + uses: docker/setup-buildx-action@v3 + with: + driver-opts: network=host \ No newline at end of file diff --git a/.github/actions/spark-image-tag/action.yaml b/.github/actions/spark-image-tag/action.yaml new file mode 100644 index 0000000..eaf56a9 --- /dev/null +++ b/.github/actions/spark-image-tag/action.yaml @@ -0,0 +1,99 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: Generate spark image tags +description: Generate spark image tags + +inputs: + image: + description: Image name + required: true + spark_version: + description: Spark version + required: true + scala_version: + description: Scala version + required: true + java_version: + description: Java version + required: true + python_version: + description: Python version + required: true + ci_repo: + description: The CI registry repo + required: false + +outputs: + parent_image: + description: "Image tags" + value: ${{ steps.tags.outputs.parent_image }} + latest_tag: + description: "CI image tags (ex.: spark-3.3.4....)" + value: ${{ steps.tags.outputs.latest_tag }} + +runs: + using: composite + steps: + - name: Install yq + run: | + sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/v4.42.1/yq_linux_amd64 + sudo chmod a+x /usr/local/bin/yq + shell: bash + + - name: Expose git commit sha as env variable + uses: rlespinasse/git-commit-data-action@v1.5.0 + + - name: Generate spark image tags 📦 + id: tags + run: | + ### Inputs + ### Variables substitution used in '.build/images.yml' file + spark_version=${{ inputs.spark_version }} + scala_version=${{ inputs.scala_version }} + java_version=${{ inputs.java_version }} + python_version=${{ inputs.python_version }} + + git_commit_sha=${{ env.GIT_COMMIT_SHA }} + git_commit_short_sha=${{ env.GIT_COMMIT_SHORT_SHA }} + git_commit_short_sha=${{ env.GIT_COMMIT_SHORT_SHA }} + + + ### Outputs - Parse: .build/images.yml + PARENT_IMAGE_NAME=$(yq '(.images[] | select(.name == "${{ inputs.image }}").dependsOn)' .build/images.yml) + PARENT_IMAGE_NAME=$(eval echo ${PARENT_IMAGE_NAME}) + + PARENT_IMAGE_TAG=$(yq -oc "(.images[] | select(.name == \"${PARENT_IMAGE_NAME}\").tags[0])" .build/images.yml) + PARENT_IMAGE_TAG=$(eval echo ${PARENT_IMAGE_TAG}) + PARENT_IMAGE_NAME="${PARENT_IMAGE_NAME}:${PARENT_IMAGE_TAG}" + + LATEST_TAG=$(yq -oc '(.images[] | select(.name == "${{ inputs.image }}").tags[0])' 
.build/images.yml) + LATEST_TAG=$(eval echo ${LATEST_TAG}) + + # The image can inherit from a community image like docker.io/eclipse-temurin, ... + if [[ "${PARENT_IMAGE_NAME}" != *"/"* ]] + then + PARENT_IMAGE_NAME="${{ inputs.ci_repo }}/${PARENT_IMAGE_NAME}" + fi + + # Logging + echo "parent_image=${PARENT_IMAGE_NAME}" + echo "latest_tag=${LATEST_TAG}" + # Set outputs + echo "parent_image=${PARENT_IMAGE_NAME}" >> $GITHUB_OUTPUT + echo "latest_tag=${LATEST_TAG}" >> $GITHUB_OUTPUT + + shell: bash diff --git a/.github/workflows/build-image-template.yml b/.github/workflows/build-image-template.yml new file mode 100644 index 0000000..ced1f46 --- /dev/null +++ b/.github/workflows/build-image-template.yml @@ -0,0 +1,124 @@ + +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: Spark build single image template + +on: + workflow_call: + inputs: + image: + description: The spark image name (ex. 
spark-base, spark, spark-py, spark-r, etc) + required: true + type: string + spark_version: + description: Spark version + required: true + type: string + scala_version: + description: Scala version + required: true + type: string + java_version: + description: Java version + required: true + type: string + hadoop_version: + description: Hadoop version + required: true + type: string + python_version: + description: Python version + required: true + type: string + ci_registry: + description: "The registry used to push ci images" + required: false + type: string + default: "ghcr.io" + git_latest_release_tag: + description: The latest remote release tag + required: false + type: string + default: "" + runs-on: + description: GitHub Actions Runner image + required: true + type: string + +jobs: + + build-test-push: + name: ${{ inputs.image }} (scala-${{ inputs.scala_version }}, java-${{ inputs.java_version }}, python-${{ inputs.python_version }}) + runs-on: ${{ inputs.runs-on }} + steps: + ### The CI is based on the main branch + - name: Checkout Repo ⚡️ + uses: actions/checkout@v4 + + ### Common steps between CI and Publish + - name: Free up disk space 📦 + uses: ./.github/actions/free-disk-space + + - name: Set up QEMU and Docker Buildx 📦 + uses: ./.github/actions/setup-buildx + + - name: Set up CI and Publish registries 📦 + id: registry-repos + run: | + echo "repo_owner=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_OUTPUT + echo "ci_repo=${{ inputs.ci_registry }}/${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_OUTPUT + shell: bash + + - name: Generate image tags 📦 + id: image-tags + uses: ./.github/actions/spark-image-tag + with: + image: ${{ inputs.image }} + spark_version: ${{ inputs.spark_version}} + scala_version: ${{ inputs.scala_version }} + java_version: ${{ inputs.java_version }} + python_version: ${{ inputs.python_version}} + ci_repo: ${{ steps.registry-repos.outputs.ci_repo }} + + - name: Login to the CI registry 🔐 + if: (!startsWith(inputs.spark_version, '2.')) + 
uses: docker/login-action@v3 + with: + registry: ${{ inputs.ci_registry }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push to ci registry + if: (!startsWith(inputs.spark_version, '2.')) + uses: docker/build-push-action@v5 + with: + context: ${{ inputs.image }} + platforms: linux/amd64,linux/arm64 + push: true + build-args: | + SPARK_VERSION=${{ inputs.spark_version}} + SCALA_VERSION=${{ inputs.scala_version }} + JAVA_VERSION=${{ inputs.java_version }} + PYTHON_VERSION=${{ inputs.python_version }} + HADOOP_VERSION=${{ inputs.hadoop_version }} + BASE_IMAGE=${{ steps.image-tags.outputs.parent_image }} + tags: | + ${{ steps.registry-repos.outputs.ci_repo }}/${{ inputs.image }}:${{ steps.image-tags.outputs.latest_tag }} + + + + diff --git a/.github/workflows/build-images-template.yml b/.github/workflows/build-images-template.yml new file mode 100644 index 0000000..0b79e80 --- /dev/null +++ b/.github/workflows/build-images-template.yml @@ -0,0 +1,99 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: spark build multiple images template + +on: + workflow_call: + inputs: + spark_version: + description: Spark version + required: true + type: string + scala_version: + description: Scala version + required: true + type: string + java_version: + description: Java version + required: true + type: string + hadoop_version: + description: Hadoop version + required: true + type: string + python_version: + description: Python version + required: true + type: string + runs-on: + description: GitHub Actions Runner image + required: false + type: string + default: "ubuntu-latest" + +jobs: + + spark-base: + uses: ./.github/workflows/build-image-template.yml + with: + image: spark-base + python_version: ${{ inputs.python_version }} + spark_version: ${{ inputs.spark_version }} + java_version: ${{ inputs.java_version }} + scala_version: ${{ inputs.scala_version }} + hadoop_version: ${{ inputs.hadoop_version }} + runs-on: ${{ inputs.runs-on }} + secrets: inherit + + spark: + uses: ./.github/workflows/build-image-template.yml + needs: [spark-base] + with: + image: spark + python_version: ${{ inputs.python_version }} + spark_version: ${{ inputs.spark_version }} + java_version: ${{ inputs.java_version }} + scala_version: ${{ inputs.scala_version }} + hadoop_version: ${{ inputs.hadoop_version }} + runs-on: ${{ inputs.runs-on }} + secrets: inherit + + spark-py: + uses: ./.github/workflows/build-image-template.yml + needs: [spark] + with: + image: spark-py + python_version: ${{ inputs.python_version }} + spark_version: ${{ inputs.spark_version }} + java_version: ${{ inputs.java_version }} + scala_version: ${{ inputs.scala_version }} + hadoop_version: ${{ inputs.hadoop_version }} + runs-on: ${{ inputs.runs-on }} + secrets: inherit + + spark-r: + uses: ./.github/workflows/build-image-template.yml + needs: [spark] + with: + image: spark-r + python_version: ${{ inputs.python_version }} + spark_version: ${{ inputs.spark_version }} + java_version: ${{ inputs.java_version }} + 
scala_version: ${{ inputs.scala_version }} + hadoop_version: ${{ inputs.hadoop_version }} + runs-on: ${{ inputs.runs-on }} + secrets: inherit diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..5569799 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,70 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: ci + +on: + pull_request: + paths: + - ".github/workflows/**" + - ".github/actions/**" + + - "spark/**" + - "spark-*/**" + + - "!README.md" + + push: + branches: + - main + paths: + - ".github/workflows/**" + - ".github/actions/**" + + - "spark/**" + - "spark-*/**" + + - "!README.md" + + workflow_dispatch: + +# https://docs.github.com/en/actions/using-jobs/using-concurrency +concurrency: + # Only cancel in-progress jobs or runs for the current workflow - matches against branch & tags + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + packages: write + +jobs: + + spark-ci: + name: spark-ci (spark-${{ matrix.spark_version }}) + strategy: + matrix: + python_version: [3.11] + spark_version: [3.4.2, 3.3.4] + java_version: [17] + scala_version: [2.12, 2.13] + hadoop_version: [3] + uses: ./.github/workflows/build-images-template.yml + with: + python_version: ${{ matrix.python_version }} + spark_version: ${{ matrix.spark_version }} + java_version: ${{ matrix.java_version }} + scala_version: ${{ matrix.scala_version }} + hadoop_version: 
${{ matrix.hadoop_version }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..94d918d --- /dev/null +++ b/.gitignore @@ -0,0 +1,32 @@ +### IntelliJ IDEA ### +.idea +*.iml + +### Eclipse ### +.apt_generated +.classpath +.factorypath +.project +.settings +.springBeans +.sts4-cache +bin/ +!**/src/main/**/bin/ +!**/src/test/**/bin/ + +### NetBeans ### +/nbproject/private/ +/nbbuild/ +/dist/ +/nbdist/ +/.nb-gradle/ + + +### Mac OS ### +.DS_Store + +### vscode ### +.vscode/ + +# Other +tmp/ \ No newline at end of file From cbabf51fc59df227dff9e472edc1e9a88178ad7d Mon Sep 17 00:00:00 2001 From: iizitounene Date: Thu, 28 Mar 2024 14:05:26 +0100 Subject: [PATCH 06/17] Build pipeline - Add scala 2.12/2.13, python and r integration tests for images (spark > 3.3.0) --- .github/actions/setup-kind/action.yaml | 37 ++++++ .../actions/spark-tests-prepare/action.yml | 63 +++++++++ .github/actions/spark-tests-run/action.yml | 125 ++++++++++++++++++ .github/workflows/build-image-template.yml | 37 +++++- .github/workflows/ci.yml | 2 +- 5 files changed, 261 insertions(+), 3 deletions(-) create mode 100644 .github/actions/setup-kind/action.yaml create mode 100644 .github/actions/spark-tests-prepare/action.yml create mode 100644 .github/actions/spark-tests-run/action.yml diff --git a/.github/actions/setup-kind/action.yaml b/.github/actions/setup-kind/action.yaml new file mode 100644 index 0000000..e16035f --- /dev/null +++ b/.github/actions/setup-kind/action.yaml @@ -0,0 +1,37 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: Setup kind +description: Deploy kind cluster + +runs: + using: composite + steps: + - name: Create k8s Kind Cluster + uses: helm/kind-action@v1 + with: + # https://github.com/helm/kind-action?tab=readme-ov-file#inputs + verbosity: 10 + cluster_name: "kind-ci-${{ github.job }}" + ignore_failed_clean: true # Ignore the post delete cluster action failing + wait: "180s" # Max timeout to wait Kind becomes ready + + - name: Print Kind cluster state + run: | + kubectl cluster-info + kubectl get pods -A + kubectl describe node + shell: bash \ No newline at end of file diff --git a/.github/actions/spark-tests-prepare/action.yml b/.github/actions/spark-tests-prepare/action.yml new file mode 100644 index 0000000..96242af --- /dev/null +++ b/.github/actions/spark-tests-prepare/action.yml @@ -0,0 +1,63 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: Prepare integration tests +description: Prepare integration tests + +inputs: + spark_version: + description: Spark version + required: true + scala_version: + description: Scala version + required: true + java_version: + description: Java version + required: true + +outputs: + git_tag_checkout_dir: + description: "Git checkout tag local source directory" + value: ${{ steps.git-checkout-tag.outputs.git_tag_checkout_dir }} + +runs: + using: composite + # https://github.com/apache/spark/blob/master/.github/workflows/build_and_test.yml + steps: + - name: Set up Java ${{ inputs.java_version }} + uses: actions/setup-java@v4 + with: + distribution: 'zulu' + java-version: ${{ inputs.java_version }} + + - name: Cache Scala, SBT and Maven + uses: actions/cache@v4 + with: + path: | + build/apache-maven-* + build/scala-* + build/*.jar + ~/.sbt + key: build-${{ inputs.spark_version }}-scala${{ inputs.scala_version }}-java${{ inputs.java_version }} + + - name: Cache Coursier local repository + uses: actions/cache@v4 + with: + path: ~/.cache/coursier + key: build-${{ inputs.spark_version }}-scala${{ inputs.scala_version }}-java${{ inputs.java_version }}-coursier + + + diff --git a/.github/actions/spark-tests-run/action.yml b/.github/actions/spark-tests-run/action.yml new file mode 100644 index 0000000..9063b9c --- /dev/null +++ b/.github/actions/spark-tests-run/action.yml @@ -0,0 +1,125 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: Run integration tests +description: Run integration tests + +inputs: + ci-repo: + description: The CI registry repo URL + required: true + image: + description: Spark image name to test (ex. spark) + required: true + image-tag: + description: Spark image tag to test (ex. latest) + required: true + scala_version: + description: Scala version + required: true + git_checkout_tag_dir: + description: Git checkout tag directory + required: true + +runs: + using: composite + # https://github.com/apache/spark/tree/master/resource-managers/kubernetes/integration-tests + # https://github.com/apache/spark/blob/master/.github/workflows/build_and_test.yml + # https://github.com/apache/spark/pull/35830 + steps: + - name: Load image ${{ inputs.image }} into Kind and setup Spark RBACs + run: | + kubectl create clusterrolebinding serviceaccounts-cluster-admin \ + --clusterrole=cluster-admin \ + --group=system:serviceaccounts || true + # Pull and Load the image into all kind nodes (current setup mono node) for fast executors startup + docker pull ${{ inputs.ci-repo}}/${{ inputs.image }}:${{ inputs.image-tag }} + kind load docker-image ${{ inputs.ci-repo}}/${{ inputs.image }}:${{ inputs.image-tag }} --name kind-ci-${{ github.job }} + shell: bash + + - name: Change Scala version to ${{ inputs.scala_version }} + run: | + ./dev/change-scala-version.sh ${{ inputs.scala_version }} + echo "SCALA_PROFILE=scala-${{ inputs.scala_version }}" >> $GITHUB_ENV + + working-directory: ${{ inputs.git_checkout_tag_dir }} + shell: bash + + - name: Run base integration tests (${{ inputs.image }}) + if: inputs.image == 'spark-base' || inputs.image == 'spark' + run: | + build/sbt -P${{ env.SCALA_PROFILE }} -Pkubernetes -Pkubernetes-integration-tests \ + -Dspark.kubernetes.test.driverRequestCores=0.5 -Dspark.kubernetes.test.executorRequestCores=0.2 \ + -Dspark.kubernetes.test.deployMode=cloud \ + -Dspark.kubernetes.test.imageRepo=${{ inputs.ci-repo}} -Dspark.kubernetes.test.imageTag=${{ 
inputs.image-tag }} \ + -Dspark.kubernetes.test.jvmImage=${{ inputs.image }} \ + -Dspark.kubernetes.test.pythonImage=${{ inputs.image }} \ + -Dspark.kubernetes.test.pythonImage=${{ inputs.image }} \ + -Dspark.kubernetes.test.rImage=${{ inputs.image }} \ + 'kubernetes-integration-tests/testOnly -- -z "Run SparkPi"' + + working-directory: ${{ inputs.git_checkout_tag_dir }} + shell: bash + + - name: Run spark-py integration tests (${{ inputs.image }}) + if: inputs.image == 'spark-py' + run: | + build/sbt -P${{ env.SCALA_PROFILE }} -Pkubernetes -Pkubernetes-integration-tests \ + -Dspark.kubernetes.test.driverRequestCores=0.5 -Dspark.kubernetes.test.executorRequestCores=0.2 \ + -Dspark.kubernetes.test.deployMode=cloud \ + -Dspark.kubernetes.test.imageRepo=${{ inputs.ci-repo}} -Dspark.kubernetes.test.imageTag=${{ inputs.image-tag }} \ + -Dspark.kubernetes.test.jvmImage=${{ inputs.image }} \ + -Dspark.kubernetes.test.pythonImage=${{ inputs.image }} \ + -Dspark.kubernetes.test.pythonImage=${{ inputs.image }} \ + -Dspark.kubernetes.test.rImage=${{ inputs.image }} \ + 'kubernetes-integration-tests/testOnly -- -z "Run PySpark"' + + working-directory: ${{ inputs.git_checkout_tag_dir }} + shell: bash + + - name: Run spark-r integration tests (${{ inputs.image }}) + if: inputs.image == 'spark-r' + run: | + build/sbt -P${{ env.SCALA_PROFILE }} -Pkubernetes -Pkubernetes-integration-tests \ + -Dspark.kubernetes.test.driverRequestCores=0.5 -Dspark.kubernetes.test.executorRequestCores=0.2 \ + -Dspark.kubernetes.test.deployMode=cloud \ + -Dspark.kubernetes.test.imageRepo=${{ inputs.ci-repo}} -Dspark.kubernetes.test.imageTag=${{ inputs.image-tag }} \ + -Dspark.kubernetes.test.jvmImage=${{ inputs.image }} \ + -Dspark.kubernetes.test.pythonImage=${{ inputs.image }} \ + -Dspark.kubernetes.test.pythonImage=${{ inputs.image }} \ + -Dspark.kubernetes.test.rImage=${{ inputs.image }} \ + -Psparkr -Dtest.include.tags=r \ + 'kubernetes-integration-tests/testOnly' + + working-directory: ${{ 
inputs.git_checkout_tag_dir }} + shell: bash + + # - name: Run All integration tests (${{ inputs.image }}) + # if: inputs.image == 'spark-py-r' + # run: | + # build/sbt -P${{ env.SCALA_PROFILE }} -Pkubernetes -Pkubernetes-integration-tests \ + # -Dspark.kubernetes.test.driverRequestCores=0.5 -Dspark.kubernetes.test.executorRequestCores=0.2 \ + # -Dspark.kubernetes.test.deployMode=cloud \ + # -Dspark.kubernetes.test.imageRepo=${{ inputs.ci-repo}} -Dspark.kubernetes.test.imageTag=${{ inputs.image-tag }} \ + # -Dspark.kubernetes.test.jvmImage=${{ inputs.image }} \ + # -Dspark.kubernetes.test.pythonImage=${{ inputs.image }} \ + # -Dspark.kubernetes.test.pythonImage=${{ inputs.image }} \ + # -Dspark.kubernetes.test.rImage=${{ inputs.image }} \ + # 'kubernetes-integration-tests/testOnly' + + # working-directory: ${{ inputs.git_checkout_tag_dir }} + # shell: bash + diff --git a/.github/workflows/build-image-template.yml b/.github/workflows/build-image-template.yml index ced1f46..e054a22 100644 --- a/.github/workflows/build-image-template.yml +++ b/.github/workflows/build-image-template.yml @@ -95,7 +95,6 @@ jobs: ci_repo: ${{ steps.registry-repos.outputs.ci_repo }} - name: Login to the CI registry 🔐 - if: (!startsWith(inputs.spark_version, '2.')) uses: docker/login-action@v3 with: registry: ${{ inputs.ci_registry }} @@ -103,7 +102,6 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - name: Build and push to ci registry - if: (!startsWith(inputs.spark_version, '2.')) uses: docker/build-push-action@v5 with: context: ${{ inputs.image }} @@ -119,6 +117,41 @@ jobs: tags: | ${{ steps.registry-repos.outputs.ci_repo }}/${{ inputs.image }}:${{ steps.image-tags.outputs.latest_tag }} + # https://github.com/nektos/act/issues/678 + # https://github.com/apache/spark/pull/35830 + - name: Checkout integration tests tag v${{ inputs.spark_version }} (${{ inputs.spark_version}} > 3.3.0) ⚡️ + if: (!startsWith(inputs.spark_version, '3.1') && !startsWith(inputs.spark_version, '3.2') && 
!startsWith(inputs.spark_version, '3.3.0')) + id: git-checkout-tag + run: | + CHECKOUT_TAG_DIR="$(mktemp -d)/spark" + git clone https://github.com/apache/spark.git ${CHECKOUT_TAG_DIR} + cd ${CHECKOUT_TAG_DIR} + git checkout v${{ inputs.spark_version }} + echo "checkout_directory=${CHECKOUT_TAG_DIR}" >> $GITHUB_OUTPUT + shell: bash + + - name: Prepare integration tests env (${{ inputs.spark_version}} > 3.3.0) 📦 + if: (!startsWith(inputs.spark_version, '3.1') && !startsWith(inputs.spark_version, '3.2') && !startsWith(inputs.spark_version, '3.3.0')) + uses: ./.github/actions/spark-tests-prepare + with: + spark_version: ${{ inputs.spark_version}} + scala_version: ${{ inputs.scala_version }} + java_version: ${{ inputs.java_version }} + + - name: Set up Kind integration tests cluster (${{ inputs.spark_version}} > 3.3.0) 📦 + if: (!startsWith(inputs.spark_version, '3.1') && !startsWith(inputs.spark_version, '3.2') && !startsWith(inputs.spark_version, '3.3.0')) + uses: ./.github/actions/setup-kind + + - name: Run integration tests (${{ inputs.spark_version}} > 3.3.0) ✅ + if: (!startsWith(inputs.spark_version, '3.1') && !startsWith(inputs.spark_version, '3.2') && !startsWith(inputs.spark_version, '3.3.0')) + uses: ./.github/actions/spark-tests-run + with: + ci-repo: ${{ steps.registry-repos.outputs.ci_repo }} + image: ${{ inputs.image }} + image-tag: ${{ steps.image-tags.outputs.latest_tag }} + spark_version: ${{ inputs.spark_version }} + scala_version: ${{ inputs.scala_version }} + git_checkout_tag_dir: ${{ steps.git-checkout-tag.outputs.checkout_directory }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5569799..c0ee4fc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -57,7 +57,7 @@ jobs: strategy: matrix: python_version: [3.11] - spark_version: [3.4.2, 3.3.4] + spark_version: [3.3.4, 3.4.2] java_version: [17] scala_version: [2.12, 2.13] hadoop_version: [3] From 22ffda26ccc7f4023488164a70977b4b832fa158 Mon Sep 17 00:00:00 
2001 From: iizitounene Date: Thu, 28 Mar 2024 14:45:02 +0100 Subject: [PATCH 07/17] Build pipeline - Publish to official registry --- .github/actions/spark-image-tag/action.yaml | 21 +++++- .github/workflows/build-image-template.yml | 56 ++++++++++++--- .github/workflows/build-images-template.yml | 17 +++++ .github/workflows/ci.yml | 1 + .github/workflows/publish.yml | 76 +++++++++++++++++++++ 5 files changed, 159 insertions(+), 12 deletions(-) create mode 100644 .github/workflows/publish.yml diff --git a/.github/actions/spark-image-tag/action.yaml b/.github/actions/spark-image-tag/action.yaml index eaf56a9..247c765 100644 --- a/.github/actions/spark-image-tag/action.yaml +++ b/.github/actions/spark-image-tag/action.yaml @@ -36,6 +36,12 @@ inputs: ci_repo: description: The CI registry repo required: false + publish_repo: + description: The official registry repo + required: false + publish_to_registry: + description: Whether to push or not to the official registry repo + required: true outputs: parent_image: @@ -44,6 +50,9 @@ outputs: latest_tag: description: "CI image tags (ex.: spark-3.3.4....)" value: ${{ steps.tags.outputs.latest_tag }} + publish_tags: + description: "Image tags to push into registry (ex.: quay.io/spark-r:spark-3.3.4...)" + value: ${{ steps.tags.outputs.publish_tags }} runs: using: composite @@ -83,17 +92,27 @@ runs: LATEST_TAG=$(yq -oc '(.images[] | select(.name == "${{ inputs.image }}").tags[0])' .build/images.yml) LATEST_TAG=$(eval echo ${LATEST_TAG}) + PUBLISH_TAGS=$(yq -oc '[.images[] | select(.name == "${{ inputs.image }}").tags | .[] |"${{ inputs.publish_repo }}/${{ inputs.image }}:" + .]' .build/images.yml) + PUBLISH_TAGS=$(eval echo ${PUBLISH_TAGS}) + # The image can inherit from a community image like docker.io/eclipse-temurin, ... 
if [[ "${PARENT_IMAGE_NAME}" != *"/"* ]] then - PARENT_IMAGE_NAME="${{ inputs.ci_repo }}/${PARENT_IMAGE_NAME}" + if [[ "${{ inputs.publish_to_registry }}" == "true" ]] + then + PARENT_IMAGE_NAME="${{ inputs.publish_repo }}/${PARENT_IMAGE_NAME}" + else + PARENT_IMAGE_NAME="${{ inputs.ci_repo }}/${PARENT_IMAGE_NAME}" + fi fi # Logging echo "parent_image=${PARENT_IMAGE_NAME}" echo "latest_tag=${LATEST_TAG}" + echo "publish_tags=${PUBLISH_TAGS}" # Set outputs echo "parent_image=${PARENT_IMAGE_NAME}" >> $GITHUB_OUTPUT echo "latest_tag=${LATEST_TAG}" >> $GITHUB_OUTPUT + echo "publish_tags=${PUBLISH_TAGS}" >> $GITHUB_OUTPUT shell: bash diff --git a/.github/workflows/build-image-template.yml b/.github/workflows/build-image-template.yml index e054a22..974feac 100644 --- a/.github/workflows/build-image-template.yml +++ b/.github/workflows/build-image-template.yml @@ -44,16 +44,20 @@ on: description: Python version required: true type: string + publish_to_registry: + description: Wheter to push to the registry + required: false + type: string + default: "false" + registry: + description: The container registry + required: false + type: string ci_registry: description: "The registry used to push ci images" required: false type: string default: "ghcr.io" - git_latest_release_tag: - description: The latest remote release tag - required: false - type: string - default: "" runs-on: description: GitHub Actions Runner image required: true @@ -65,6 +69,7 @@ jobs: name: ${{ inputs.image }} (scala-${{ inputs.scala_version }}, java-${{ inputs.java_version }}, python-${{ inputs.python_version }}) runs-on: ${{ inputs.runs-on }} steps: + ### The CI is based on the main branch - name: Checkout Repo ⚡️ uses: actions/checkout@v4 @@ -76,11 +81,12 @@ jobs: - name: Set up QEMU and Docker Buildx 📦 uses: ./.github/actions/setup-buildx - - name: Set up CI and Publish registries 📦 + - name: Set up CI and official registries 📦 id: registry-repos run: | echo "repo_owner=${GITHUB_REPOSITORY_OWNER@L}" 
>> $GITHUB_OUTPUT echo "ci_repo=${{ inputs.ci_registry }}/${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_OUTPUT + echo "publish_repo=${{ inputs.registry }}/${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_OUTPUT shell: bash - name: Generate image tags 📦 @@ -93,8 +99,11 @@ jobs: java_version: ${{ inputs.java_version }} python_version: ${{ inputs.python_version}} ci_repo: ${{ steps.registry-repos.outputs.ci_repo }} + publish_repo: ${{ steps.registry-repos.outputs.publish_repo }} + publish_to_registry: ${{ inputs.publish_to_registry }} - name: Login to the CI registry 🔐 + if: inputs.publish_to_registry == 'false' uses: docker/login-action@v3 with: registry: ${{ inputs.ci_registry }} @@ -102,6 +111,7 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - name: Build and push to ci registry + if: inputs.publish_to_registry == 'false' uses: docker/build-push-action@v5 with: context: ${{ inputs.image }} @@ -117,10 +127,11 @@ jobs: tags: | ${{ steps.registry-repos.outputs.ci_repo }}/${{ inputs.image }}:${{ steps.image-tags.outputs.latest_tag }} + ### CI Steps # https://github.com/nektos/act/issues/678 # https://github.com/apache/spark/pull/35830 - name: Checkout integration tests tag v${{ inputs.spark_version }} (${{ inputs.spark_version}} > 3.3.0) ⚡️ - if: (!startsWith(inputs.spark_version, '3.1') && !startsWith(inputs.spark_version, '3.2') && !startsWith(inputs.spark_version, '3.3.0')) + if: inputs.publish_to_registry == 'false' && !(startsWith(inputs.spark_version, '3.1') || startsWith(inputs.spark_version, '3.2') || startsWith(inputs.spark_version, '3.3.0')) id: git-checkout-tag run: | CHECKOUT_TAG_DIR="$(mktemp -d)/spark" @@ -131,7 +142,7 @@ jobs: shell: bash - name: Prepare integration tests env (${{ inputs.spark_version}} > 3.3.0) 📦 - if: (!startsWith(inputs.spark_version, '3.1') && !startsWith(inputs.spark_version, '3.2') && !startsWith(inputs.spark_version, '3.3.0')) + if: inputs.publish_to_registry == 'false' && !(startsWith(inputs.spark_version, '3.1') || 
startsWith(inputs.spark_version, '3.2') || startsWith(inputs.spark_version, '3.3.0')) uses: ./.github/actions/spark-tests-prepare with: spark_version: ${{ inputs.spark_version}} @@ -139,19 +150,42 @@ jobs: java_version: ${{ inputs.java_version }} - name: Set up Kind integration tests cluster (${{ inputs.spark_version}} > 3.3.0) 📦 - if: (!startsWith(inputs.spark_version, '3.1') && !startsWith(inputs.spark_version, '3.2') && !startsWith(inputs.spark_version, '3.3.0')) + if: inputs.publish_to_registry == 'false' && !(startsWith(inputs.spark_version, '3.1') || startsWith(inputs.spark_version, '3.2') || startsWith(inputs.spark_version, '3.3.0')) uses: ./.github/actions/setup-kind - name: Run integration tests (${{ inputs.spark_version}} > 3.3.0) ✅ - if: (!startsWith(inputs.spark_version, '3.1') && !startsWith(inputs.spark_version, '3.2') && !startsWith(inputs.spark_version, '3.3.0')) + if: inputs.publish_to_registry == 'false' && !(startsWith(inputs.spark_version, '3.1') || startsWith(inputs.spark_version, '3.2') || startsWith(inputs.spark_version, '3.3.0')) uses: ./.github/actions/spark-tests-run with: ci-repo: ${{ steps.registry-repos.outputs.ci_repo }} image: ${{ inputs.image }} image-tag: ${{ steps.image-tags.outputs.latest_tag }} - spark_version: ${{ inputs.spark_version }} scala_version: ${{ inputs.scala_version }} git_checkout_tag_dir: ${{ steps.git-checkout-tag.outputs.checkout_directory }} + ### Publish steps + - name: Login into official registry 🔐 + if: inputs.publish_to_registry == 'true' + uses: docker/login-action@v3 + with: + registry: ${{ inputs.registry }} + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.REGISTRY_ROBOT_TOKEN }} + + - name: Build and push to official registry 📤 + if: inputs.publish_to_registry == 'true' + uses: docker/build-push-action@v5 + with: + context: ${{ inputs.image }} + platforms: linux/amd64,linux/arm64 + push: true + build-args: | + SPARK_VERSION=${{ inputs.spark_version}} + SCALA_VERSION=${{ 
inputs.scala_version }} + JAVA_VERSION=${{ inputs.java_version }} + PYTHON_VERSION=${{ inputs.python_version }} + HADOOP_VERSION=${{ inputs.hadoop_version }} + BASE_IMAGE=${{ steps.image-tags.outputs.parent_image }} + tags: ${{ steps.image-tags.outputs.publish_tags }} diff --git a/.github/workflows/build-images-template.yml b/.github/workflows/build-images-template.yml index 0b79e80..071ce6e 100644 --- a/.github/workflows/build-images-template.yml +++ b/.github/workflows/build-images-template.yml @@ -39,6 +39,15 @@ on: description: Python version required: true type: string + registry: + description: The container registry + required: false + type: string + publish_to_registry: + description: Wheter to push to the registry + required: false + type: string + default: "false" runs-on: description: GitHub Actions Runner image required: false @@ -56,6 +65,8 @@ jobs: java_version: ${{ inputs.java_version }} scala_version: ${{ inputs.scala_version }} hadoop_version: ${{ inputs.hadoop_version }} + registry: ${{ inputs.registry }} + publish_to_registry: ${{ inputs.publish_to_registry }} runs-on: ${{ inputs.runs-on }} secrets: inherit @@ -69,6 +80,8 @@ jobs: java_version: ${{ inputs.java_version }} scala_version: ${{ inputs.scala_version }} hadoop_version: ${{ inputs.hadoop_version }} + registry: ${{ inputs.registry }} + publish_to_registry: ${{ inputs.publish_to_registry }} runs-on: ${{ inputs.runs-on }} secrets: inherit @@ -82,6 +95,8 @@ jobs: java_version: ${{ inputs.java_version }} scala_version: ${{ inputs.scala_version }} hadoop_version: ${{ inputs.hadoop_version }} + registry: ${{ inputs.registry }} + publish_to_registry: ${{ inputs.publish_to_registry }} runs-on: ${{ inputs.runs-on }} secrets: inherit @@ -95,5 +110,7 @@ jobs: java_version: ${{ inputs.java_version }} scala_version: ${{ inputs.scala_version }} hadoop_version: ${{ inputs.hadoop_version }} + registry: ${{ inputs.registry }} + publish_to_registry: ${{ inputs.publish_to_registry }} runs-on: ${{ 
inputs.runs-on }} secrets: inherit diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c0ee4fc..87eec28 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -68,3 +68,4 @@ jobs: java_version: ${{ matrix.java_version }} scala_version: ${{ matrix.scala_version }} hadoop_version: ${{ matrix.hadoop_version }} + publish_to_registry: "false" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..1ac73da --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,76 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: publish + +on: + # pull_request: + # paths: + # - ".github/workflows/**" + # - ".github/actions/**" + + # - "spark/**" + # - "spark-*/**" + + # - "!README.md" + + # push: + # branches: + # - main + # paths: + # - ".github/workflows/**" + # - ".github/actions/**" + + # - "spark/**" + # - "spark-*/**" + + # - "!README.md" + + workflow_dispatch: + +# https://docs.github.com/en/actions/using-jobs/using-concurrency +concurrency: + # Only cancel in-progress jobs or runs for the current workflow - matches against branch & tags + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + packages: write + +jobs: + + spark-publish: + if: github.repository_owner == 'OKDP' + name: spark-publish (spark-${{ matrix.spark_version }}) + strategy: + matrix: + python_version: [3.11] + spark_version: [3.3.3, 3.3.4] + java_version: [17] + scala_version: [2.12] + hadoop_version: [3] + uses: ./.github/workflows/build-images-template.yml + with: + python_version: ${{ matrix.python_version }} + spark_version: ${{ matrix.spark_version }} + java_version: ${{ matrix.java_version }} + scala_version: ${{ matrix.scala_version }} + hadoop_version: ${{ matrix.hadoop_version }} + registry: ${{ vars.REGISTRY || 'quay.io' }} + publish_to_registry: "true" + secrets: inherit + + \ No newline at end of file From fc9d948d990deb0756c2d6c93dee347a5a61372d Mon Sep 17 00:00:00 2001 From: iizitounene Date: Thu, 28 Mar 2024 14:50:53 +0100 Subject: [PATCH 08/17] Build pipeline - Publish to official registry repo based on the latest Github release/pre-release tag --- .build/images.yml | 15 ++++++++++ .github/actions/spark-image-tag/action.yaml | 5 ++++ .github/workflows/build-image-template.yml | 15 ++++++++++ .github/workflows/build-images-template.yml | 9 ++++++ .github/workflows/publish.yml | 31 ++++++++++++++++++++- 5 files changed, 74 insertions(+), 1 deletion(-) diff --git a/.build/images.yml b/.build/images.yml index 4124759..da019f2 100644 --- 
a/.build/images.yml +++ b/.build/images.yml @@ -23,23 +23,38 @@ images: tags: - spark-${spark_version}-scala-${scala_version}-java-${java_version} - spark-${spark_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') + - spark-${spark_version}-scala-${scala_version}-java-${java_version}-${git_release_version} + - spark-${spark_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d')-${git_release_version} + #- spark-${spark_version}-scala-${scala_version}-java-${java_version}-${git_commit_short_sha} - name: spark dependsOn: spark-base tags: - spark-${spark_version}-scala-${scala_version}-java-${java_version} - spark-${spark_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') + - spark-${spark_version}-scala-${scala_version}-java-${java_version}-${git_release_version} + - spark-${spark_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d')-${git_release_version} + #- spark-${spark_version}-scala-${scala_version}-java-${java_version}-${git_commit_short_sha} - name: spark-py dependsOn: spark tags: - spark-${spark_version}-python-${python_version}-scala-${scala_version}-java-${java_version} - spark-${spark_version}-python-${python_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') + - spark-${spark_version}-python-${python_version}-scala-${scala_version}-java-${java_version}-${git_release_version} + - spark-${spark_version}-python-${python_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d')-${git_release_version} + #- spark-${spark_version}-python-${python_version}-scala-${scala_version}-java-${java_version}-${git_commit_short_sha} - name: spark-r dependsOn: spark tags: - spark-${spark_version}-r-${r_version}-scala-${scala_version}-java-${java_version} - spark-${spark_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') + - 
spark-${spark_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-${git_release_version} + - spark-${spark_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d')-${git_release_version} + #- spark-${spark_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-${git_commit_short_sha} - name: spark-py-r dependsOn: spark-py tags: - spark-${spark_version}-python-${python_version}-r-${r_version}-scala-${scala_version}-java-${java_version} - spark-${spark_version}-python-${python_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') + - spark-${spark_version}-python-${python_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-${git_release_version} + - spark-${spark_version}-python-${python_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d')-${git_release_version} + #- spark-${spark_version}-python-${python_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-${git_commit_short_sha} diff --git a/.github/actions/spark-image-tag/action.yaml b/.github/actions/spark-image-tag/action.yaml index 247c765..ba0cac7 100644 --- a/.github/actions/spark-image-tag/action.yaml +++ b/.github/actions/spark-image-tag/action.yaml @@ -36,6 +36,9 @@ inputs: ci_repo: description: The CI registry repo required: false + git_tag_name: + description: The Git remote latest tag name + required: false publish_repo: description: The official registry repo required: false @@ -75,6 +78,8 @@ runs: scala_version=${{ inputs.scala_version }} java_version=${{ inputs.java_version }} python_version=${{ inputs.python_version }} + git_tag_name=${{ inputs.git_tag_name }} + git_release_version=$(echo '${{ inputs.git_tag_name }}' | tr -d 'v') git_commit_sha=${{ env.GIT_COMMIT_SHA }} git_commit_short_sha=${{ env.GIT_COMMIT_SHORT_SHA }} diff --git a/.github/workflows/build-image-template.yml b/.github/workflows/build-image-template.yml index 
974feac..78c45dd 100644 --- a/.github/workflows/build-image-template.yml +++ b/.github/workflows/build-image-template.yml @@ -58,6 +58,11 @@ on: required: false type: string default: "ghcr.io" + git_latest_release_tag: + description: The latest remote release tag + required: false + type: string + default: "" runs-on: description: GitHub Actions Runner image required: true @@ -70,8 +75,16 @@ jobs: runs-on: ${{ inputs.runs-on }} steps: + ### The publish and periodic rebuilds are based on the latest stable github release tag + - name: Checkout latest Github Release tag (${{ inputs.git_latest_release_tag }}) ⚡️ + if: inputs.publish_to_registry == 'true' + uses: actions/checkout@v4 + with: + ref: ${{ inputs.git_latest_release_tag }} + ### The CI is based on the main branch - name: Checkout Repo ⚡️ + if: inputs.publish_to_registry == 'false' uses: actions/checkout@v4 ### Common steps between CI and Publish @@ -101,6 +114,7 @@ jobs: ci_repo: ${{ steps.registry-repos.outputs.ci_repo }} publish_repo: ${{ steps.registry-repos.outputs.publish_repo }} publish_to_registry: ${{ inputs.publish_to_registry }} + git_tag_name: ${{ inputs.git_latest_release_tag }} - name: Login to the CI registry 🔐 if: inputs.publish_to_registry == 'false' @@ -164,6 +178,7 @@ jobs: git_checkout_tag_dir: ${{ steps.git-checkout-tag.outputs.checkout_directory }} ### Publish steps + ### The publish and periodic rebuilds are based on the latest stable github release tag - name: Login into official registry 🔐 if: inputs.publish_to_registry == 'true' uses: docker/login-action@v3 diff --git a/.github/workflows/build-images-template.yml b/.github/workflows/build-images-template.yml index 071ce6e..5e5593a 100644 --- a/.github/workflows/build-images-template.yml +++ b/.github/workflows/build-images-template.yml @@ -48,6 +48,11 @@ on: required: false type: string default: "false" + git_latest_release_tag: + description: The latest remote release tag + required: false + type: string + default: "" runs-on: 
description: GitHub Actions Runner image required: false @@ -67,6 +72,7 @@ jobs: hadoop_version: ${{ inputs.hadoop_version }} registry: ${{ inputs.registry }} publish_to_registry: ${{ inputs.publish_to_registry }} + git_latest_release_tag: ${{ inputs.git_latest_release_tag }} runs-on: ${{ inputs.runs-on }} secrets: inherit @@ -82,6 +88,7 @@ jobs: hadoop_version: ${{ inputs.hadoop_version }} registry: ${{ inputs.registry }} publish_to_registry: ${{ inputs.publish_to_registry }} + git_latest_release_tag: ${{ inputs.git_latest_release_tag }} runs-on: ${{ inputs.runs-on }} secrets: inherit @@ -97,6 +104,7 @@ jobs: hadoop_version: ${{ inputs.hadoop_version }} registry: ${{ inputs.registry }} publish_to_registry: ${{ inputs.publish_to_registry }} + git_latest_release_tag: ${{ inputs.git_latest_release_tag }} runs-on: ${{ inputs.runs-on }} secrets: inherit @@ -112,5 +120,6 @@ jobs: hadoop_version: ${{ inputs.hadoop_version }} registry: ${{ inputs.registry }} publish_to_registry: ${{ inputs.publish_to_registry }} + git_latest_release_tag: ${{ inputs.git_latest_release_tag }} runs-on: ${{ inputs.runs-on }} secrets: inherit diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 1ac73da..93ae43e 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -52,9 +52,37 @@ permissions: jobs: - spark-publish: + latest-github-release: if: github.repository_owner == 'OKDP' + runs-on: "ubuntu-latest" + outputs: + tag_name: ${{ steps.git-release-tag.outputs.tag_name }} + steps: + - name: Checkout Repo ⚡️ + uses: actions/checkout@v4 + + - name: Get latest GitHub Release tag name 📥 + id: git-release-tag + uses: InsonusK/get-latest-release@v1.0.1 + with: + myToken: ${{ github.token }} + exclude_types: "draft" + view_top: 1 + + - name: Info - Found latest release tag + run: | + echo "id: ${{ steps.git-release-tag.outputs.id }}" + echo "name: ${{ steps.git-release-tag.outputs.name }}" + echo "tag_name: ${{ 
steps.git-release-tag.outputs.tag_name }}" + echo "created_at: ${{ steps.git-release-tag.outputs.created_at }}" + echo "draft: ${{ steps.git-release-tag.outputs.draft }}" + echo "prerelease: ${{ steps.git-release-tag.outputs.prerelease }}" + shell: bash + + spark-publish: + if: github.repository_owner == 'OKDP' && needs.latest-github-release.outputs.tag_name != '' name: spark-publish (spark-${{ matrix.spark_version }}) + needs: [latest-github-release] strategy: matrix: python_version: [3.11] @@ -71,6 +99,7 @@ jobs: hadoop_version: ${{ matrix.hadoop_version }} registry: ${{ vars.REGISTRY || 'quay.io' }} publish_to_registry: "true" + git_latest_release_tag: ${{ needs.latest-github-release.outputs.tag_name }} secrets: inherit \ No newline at end of file From c76825cf062884e4ade004e2a572f94bd5dcb692 Mon Sep 17 00:00:00 2001 From: iizitounene Date: Thu, 28 Mar 2024 17:30:32 +0100 Subject: [PATCH 09/17] Spark image - Remove Control M characters from spark parent pom.xml --- spark/Dockerfile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/spark/Dockerfile b/spark/Dockerfile index dc9a504..8f92fb9 100644 --- a/spark/Dockerfile +++ b/spark/Dockerfile @@ -29,7 +29,7 @@ ARG SCALA_VERSION=2.12 RUN set -ex; \ apt-get update; \ - apt install -y --no-install-recommends maven + apt install -y --no-install-recommends maven dos2unix WORKDIR /workspace @@ -38,11 +38,13 @@ COPY okdp-addons.pom deps/pom.xml # The setup consumes less space compare to inheriting from the parent pom # Handles the transitive dependencies versions through the pom # Manage Java AWS SDK v1 (hadoop <3.4)/V2 (hadoop >=3.4) -RUN mvn dependency:get -DgroupId=org.apache.spark -DartifactId=spark-parent_${SCALA_VERSION} -Dversion=${SPARK_VERSION} -Dpackaging=pom; \ - mvn dependency:copy -Dartifact=org.apache.spark:spark-parent_${SCALA_VERSION}:${SPARK_VERSION}:pom -Dproject.basedir=./ -DoutputDirectory=./; \ - HADOOP_VERSION=$(grep "" spark-parent_${SCALA_VERSION}-${SPARK_VERSION}.pom 
| sed -e 's/^ *\(.*\)<\/hadoop.version> *$/\1/'|head -1); \ +# Some pom.xml versions comes with Control M +RUN mvn -ntp dependency:get -DgroupId=org.apache.spark -DartifactId=spark-parent_${SCALA_VERSION} -Dversion=${SPARK_VERSION} -Dpackaging=pom; \ + mvn -ntp dependency:copy -Dartifact=org.apache.spark:spark-parent_${SCALA_VERSION}:${SPARK_VERSION}:pom -Dproject.basedir=./ -DoutputDirectory=./; \ + dos2unix spark-parent_${SCALA_VERSION}-${SPARK_VERSION}.pom; \ + HADOOP_VERSION=$(grep "" spark-parent_${SCALA_VERSION}-${SPARK_VERSION}.pom | tr -d ' ' | sed -e 's/^ *\(.*\)<\/hadoop.version> *$/\1/' | sort -rn | head -n 1); \ mv ./deps/pom.xml .; \ - mvn clean dependency:copy-dependencies \ + mvn -ntp clean dependency:copy-dependencies \ -Dspark.version=${SPARK_VERSION} \ -Dscala.version=${SCALA_VERSION} \ -Dhadoop.version=${HADOOP_VERSION} \ From 03828a4c54481f43e518094e478089f4275c5661 Mon Sep 17 00:00:00 2001 From: iizitounene Date: Fri, 29 Mar 2024 10:56:34 +0100 Subject: [PATCH 10/17] Build pipeline - Add action to run CI and publish against versions and reference versions matrices --- .build/ci-versions.yml | 50 ++++++++++++++++ .build/reference-versions.yml | 50 ++++++++++++++++ .build/release-versions.yml | 50 ++++++++++++++++ .../actions/spark-version-matrix/action.yml | 59 +++++++++++++++++++ .github/workflows/build-image-template.yml | 2 +- .github/workflows/ci.yml | 38 ++++++++---- .github/workflows/publish.yml | 59 ++++++++----------- 7 files changed, 260 insertions(+), 48 deletions(-) create mode 100644 .build/ci-versions.yml create mode 100644 .build/reference-versions.yml create mode 100644 .build/release-versions.yml create mode 100644 .github/actions/spark-version-matrix/action.yml diff --git a/.build/ci-versions.yml b/.build/ci-versions.yml new file mode 100644 index 0000000..d6118b7 --- /dev/null +++ b/.build/ci-versions.yml @@ -0,0 +1,50 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you 
may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +########### ~~~ CI TEST MATRIX VERSIONS ~~~ ############################################################################ +#### PUT THE SPARK VERSIONS TO TEST IN CORRESPONDANCE WITH 'reference-versions.yml' FILE ############################### +#### !!! ANY DECLARED TEST VERSION WHICH IS NOT PRESENT IN 'reference-versions.yml' FILE IS SKIPPED DURING BUILD !!! ### +#### REMOVE, UPDATE OR ADD VERSIONS TO TEST ############################################################################ +versions: + # Maximum python version supported by spark-3.2.x: 3.9 + # Java support: 8/11 + - python_version: 3.9 + spark_version: [3.2.4] + java_version: [11] + scala_version: [2.12] + hadoop_version: 3.2 + # Maximum python version supported by spark-3.3.x: 3.10 + # Java support: 8/11/17 + - python_version: '3.10' + spark_version: [3.3.4] + java_version: [17] + scala_version: [2.12, 2.13] + hadoop_version: 3 + # Maximum python version supported by spark-3.4.x: 3.11 + # Java support: 8/11/17 + - python_version: 3.11 + spark_version: [3.4.2] + java_version: [17] + scala_version: [2.12, 2.13] + hadoop_version: 3 + # https://spark.apache.org/releases/spark-release-3-5-0.html + # Minimum supported java version: 17/21 + - python_version: 3.11 + spark_version: [3.5.0] + java_version: [17] + scala_version: [2.13] + hadoop_version: 3 + diff --git a/.build/reference-versions.yml b/.build/reference-versions.yml new file mode 100644 index 0000000..3369ece --- /dev/null +++ b/.build/reference-versions.yml @@ -0,0 +1,50 @@ +# 
+# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +### REFERENCE MATRIX VERSIONS ############################## +#### !!! DOT NOT DELETE ANY ELEMENT !!! #################### +######## APPEND ONLY WHEN NEW SPARK VERSION IS REALEASED ### +############ USED AS REFERENCE DURING BUILD ################ +versions: + # Maximum python version supported by spark-3.2.x: 3.9 + # Java support: 8/11 + - python_version: 3.9 + spark_version: [3.2.1, 3.2.2, 3.2.3, 3.2.4] + java_version: [11] + scala_version: [2.12, 2.13] + hadoop_version: 3.2 + # Maximum python version supported by spark-3.3.x: 3.10 + # Java support: 8/11/17 + - python_version: '3.10' + spark_version: [3.3.1, 3.3.2, 3.3.3, 3.3.4] + java_version: [17] + scala_version: [2.12, 2.13] + hadoop_version: 3 + # Maximum python version supported by spark-3.4.x: 3.11 + # Java support: 8/11/17 + - python_version: 3.11 + spark_version: [3.4.1, 3.4.2] + java_version: [17] + scala_version: [2.12, 2.13] + hadoop_version: 3 + # https://spark.apache.org/releases/spark-release-3-5-0.html + # Minimum supported java version: 17/21 + - python_version: 3.11 + spark_version: [3.5.0] + java_version: [17] + scala_version: [2.12, 2.13] + hadoop_version: 3 + diff --git a/.build/release-versions.yml b/.build/release-versions.yml new file mode 100644 index 0000000..6bfc534 --- /dev/null +++ b/.build/release-versions.yml @@ -0,0 +1,50 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 
(the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +########### CURRENT MATRIX VERSIONS ################################################################################ +#### PUT THE SPARK VERSIONS TO BUILD IN CORRESPONDANCE WITH 'reference-versions.yml' FILE ########################## +#### !!! ANY DECLARED VERSION WHICH IS NOT PRESENT IN 'reference-versions.yml' FILE IS SKIPPED DURING BUILD !!! #### +#### REMOVE, UPDATE OR ADD VERSIONS ################################################################################ +versions: + # Maximum python version supported by spark-3.2.x: 3.9 + # Java support: 8/11 + - python_version: 3.9 + spark_version: [3.2.1, 3.2.2, 3.2.3, 3.2.4] + java_version: [11] + scala_version: [2.12, 2.13] + hadoop_version: 3.2 + # Maximum python version supported by spark-3.3.x: 3.10 + # Java support: 8/11/17 + - python_version: '3.10' + spark_version: [3.3.1, 3.3.2, 3.3.3, 3.3.4] + java_version: [17] + scala_version: [2.12, 2.13] + hadoop_version: 3 + # Maximum python version supported by spark-3.4.x: 3.11 + # Java support: 8/11/17 + - python_version: 3.11 + spark_version: [3.4.1, 3.4.2] + java_version: [17] + scala_version: [2.12, 2.13] + hadoop_version: 3 + # https://spark.apache.org/releases/spark-release-3-5-0.html + # Minimum supported java version: 17/21 + - python_version: 3.11 + spark_version: [3.5.0] + java_version: [17] + scala_version: [2.12, 2.13] + hadoop_version: 3 + diff --git a/.github/actions/spark-version-matrix/action.yml b/.github/actions/spark-version-matrix/action.yml new file 
mode 100644 index 0000000..782b21c --- /dev/null +++ b/.github/actions/spark-version-matrix/action.yml @@ -0,0 +1,59 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: Build Spark versions matrix +description: Build Spark versions matrix from '.build/versions.yml' and '.build/reference-versions.yml' files + +inputs: + use_matrix: + description: The matrix version file to use + required: true + +outputs: + matrix: + description: "Spark versions matrix" + value: ${{ steps.generate-matrix.outputs.matrix }} + +runs: + using: composite + steps: + - name: Generate Matrix + id: generate-matrix + run: | + + INPUT_MATRIX=$(yq -oj ${{ inputs.use_matrix }} | jq '.versions | .[] | + {python_version: .python_version, + hadoop_version: .hadoop_version} + + (.spark_version[] | {spark_version: .}) + + (.scala_version[] | {scala_version: .}) + + (.java_version[] | {java_version: .})' | jq -c --slurp '.') + REF_MATRIX=$(yq -oj .build/reference-versions.yml | jq '.versions | .[] | + {python_version: .python_version, + hadoop_version: .hadoop_version} + + (.spark_version[] | {spark_version: .}) + + (.scala_version[] | {scala_version: .}) + + (.java_version[] | {java_version: .})' | jq -c --slurp '.') + + ### Intersection between the versions matrix and the reference versions matrix + ### When the intersection is empty, the jobs are skipped! 
+ MATRIX=$(jq --argjson IN ${INPUT_MATRIX} --argjson REF ${REF_MATRIX} -cn '$IN - ($IN- $REF)') + + LENGHT=$(echo ${MATRIX} | jq '. | length') + echo "${MATRIX}" + echo "Found ${LENGHT} compatible version combinations" + echo "matrix=${MATRIX}" >> $GITHUB_OUTPUT + + shell: bash diff --git a/.github/workflows/build-image-template.yml b/.github/workflows/build-image-template.yml index 78c45dd..ad39c32 100644 --- a/.github/workflows/build-image-template.yml +++ b/.github/workflows/build-image-template.yml @@ -71,7 +71,7 @@ on: jobs: build-test-push: - name: ${{ inputs.image }} (scala-${{ inputs.scala_version }}, java-${{ inputs.java_version }}, python-${{ inputs.python_version }}) + name: ${{ inputs.image }} (scala-${{ inputs.scala_version }}, java-${{ inputs.java_version }}, python-${{ inputs.python_version }}, hadoop-${{ inputs.hadoop_version }}) runs-on: ${{ inputs.runs-on }} steps: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 87eec28..2b0a449 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,6 +21,7 @@ on: paths: - ".github/workflows/**" - ".github/actions/**" + - ".build/**" - "spark/**" - "spark-*/**" @@ -33,6 +34,7 @@ on: paths: - ".github/workflows/**" - ".github/actions/**" + - ".build/**" - "spark/**" - "spark-*/**" @@ -51,21 +53,33 @@ permissions: packages: write jobs: - + + get-ci-versions: + runs-on: "ubuntu-latest" + outputs: + matrix: ${{ steps.ci-versions.outputs.matrix }} + steps: + - name: Checkout Repo ⚡️ + uses: actions/checkout@v4 + + - name: Get CI versions matrix 📥 + id: ci-versions + uses: ./.github/actions/spark-version-matrix + with: + use_matrix: ".build/ci-versions.yml" + spark-ci: - name: spark-ci (spark-${{ matrix.spark_version }}) + name: spark-ci (spark-${{ matrix.version.spark_version }}) + needs: [get-ci-versions] strategy: + fail-fast: false matrix: - python_version: [3.11] - spark_version: [3.3.4, 3.4.2] - java_version: [17] - scala_version: [2.12, 2.13] - hadoop_version: [3] + 
version: ${{ fromJson(needs.get-ci-versions.outputs.matrix) }} uses: ./.github/workflows/build-images-template.yml with: - python_version: ${{ matrix.python_version }} - spark_version: ${{ matrix.spark_version }} - java_version: ${{ matrix.java_version }} - scala_version: ${{ matrix.scala_version }} - hadoop_version: ${{ matrix.hadoop_version }} + python_version: ${{ matrix.version.python_version }} + spark_version: ${{ matrix.version.spark_version }} + java_version: ${{ matrix.version.java_version }} + scala_version: ${{ matrix.version.scala_version }} + hadoop_version: ${{ matrix.version.hadoop_version }} publish_to_registry: "false" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 93ae43e..5a670f6 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -17,27 +17,6 @@ name: publish on: - # pull_request: - # paths: - # - ".github/workflows/**" - # - ".github/actions/**" - - # - "spark/**" - # - "spark-*/**" - - # - "!README.md" - - # push: - # branches: - # - main - # paths: - # - ".github/workflows/**" - # - ".github/actions/**" - - # - "spark/**" - # - "spark-*/**" - - # - "!README.md" workflow_dispatch: @@ -79,27 +58,37 @@ jobs: echo "prerelease: ${{ steps.git-release-tag.outputs.prerelease }}" shell: bash + get-release-versions: + if: github.repository_owner == 'OKDP' + runs-on: "ubuntu-latest" + outputs: + matrix: ${{ steps.release-versions.outputs.matrix }} + steps: + - name: Checkout Repo ⚡️ + uses: actions/checkout@v4 + + - name: Get release versions matrix 📥 + id: release-versions + uses: ./.github/actions/spark-version-matrix + with: + use_matrix: ".build/release-versions.yml" + spark-publish: if: github.repository_owner == 'OKDP' && needs.latest-github-release.outputs.tag_name != '' - name: spark-publish (spark-${{ matrix.spark_version }}) - needs: [latest-github-release] + name: spark-publish (${{ needs.latest-github-release.outputs.tag_name }}/spark-${{ matrix.version.spark_version }}) + 
needs: [latest-github-release, get-release-versions] strategy: + fail-fast: false matrix: - python_version: [3.11] - spark_version: [3.3.3, 3.3.4] - java_version: [17] - scala_version: [2.12] - hadoop_version: [3] + version: ${{ fromJson(needs.get-release-versions.outputs.matrix) }} uses: ./.github/workflows/build-images-template.yml with: - python_version: ${{ matrix.python_version }} - spark_version: ${{ matrix.spark_version }} - java_version: ${{ matrix.java_version }} - scala_version: ${{ matrix.scala_version }} - hadoop_version: ${{ matrix.hadoop_version }} + python_version: ${{ matrix.version.python_version }} + spark_version: ${{ matrix.version.spark_version }} + java_version: ${{ matrix.version.java_version }} + scala_version: ${{ matrix.version.scala_version }} + hadoop_version: ${{ matrix.version.hadoop_version }} registry: ${{ vars.REGISTRY || 'quay.io' }} publish_to_registry: "true" git_latest_release_tag: ${{ needs.latest-github-release.outputs.tag_name }} secrets: inherit - - \ No newline at end of file From f0e6088740d6ddb5eca9bf1d525a96345d958230 Mon Sep 17 00:00:00 2001 From: iizitounene Date: Fri, 29 Mar 2024 12:51:34 +0100 Subject: [PATCH 11/17] Build pipeline - Add branch name as suffix for CI images latest tag --- .github/actions/spark-image-tag/action.yaml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/actions/spark-image-tag/action.yaml b/.github/actions/spark-image-tag/action.yaml index ba0cac7..0b2b0fe 100644 --- a/.github/actions/spark-image-tag/action.yaml +++ b/.github/actions/spark-image-tag/action.yaml @@ -69,6 +69,10 @@ runs: - name: Expose git commit sha as env variable uses: rlespinasse/git-commit-data-action@v1.5.0 + - name: Get current branch 📦 + id: git-branch + uses: tj-actions/branch-names@v8 + - name: Generate spark image tags 📦 id: tags run: | @@ -100,6 +104,16 @@ runs: PUBLISH_TAGS=$(yq -oc '[.images[] | select(.name == "${{ inputs.image }}").tags | .[] |"${{ inputs.publish_repo 
}}/${{ inputs.image }}:" + .]' .build/images.yml) PUBLISH_TAGS=$(eval echo ${PUBLISH_TAGS}) + ### For pull request branchs merge, suffix the CI tag with the branch name + #### The tag is pushed in the CI registry only + CI_GIT_BRANCH_SUFFIX="${{ steps.git-branch.outputs.current_branch }}" + CI_GIT_BRANCH_SUFFIX=${CI_GIT_BRANCH_SUFFIX//\//-} + + if [[ "${{ inputs.publish_to_registry }}" == "false" ]] + then + LATEST_TAG="${LATEST_TAG}-${CI_GIT_BRANCH_SUFFIX}" + fi + # The image can inherit from a community image like docker.io/eclipse-temurin, ... if [[ "${PARENT_IMAGE_NAME}" != *"/"* ]] then @@ -107,7 +121,7 @@ runs: then PARENT_IMAGE_NAME="${{ inputs.publish_repo }}/${PARENT_IMAGE_NAME}" else - PARENT_IMAGE_NAME="${{ inputs.ci_repo }}/${PARENT_IMAGE_NAME}" + PARENT_IMAGE_NAME="${{ inputs.ci_repo }}/${PARENT_IMAGE_NAME}-${CI_GIT_BRANCH_SUFFIX}" fi fi From fd92972023701223f225aad7821d215fa9ddb753 Mon Sep 17 00:00:00 2001 From: iizitounene Date: Fri, 29 Mar 2024 13:35:49 +0100 Subject: [PATCH 12/17] release-please - automate release/publish process and periodic images rebuild --- .github/workflows/publish.yml | 11 +++++ .github/workflows/release-please.yml | 70 ++++++++++++++++++++++++++++ .release-please-manifest.json | 1 + release-please-config.json | 19 ++++++++ 4 files changed, 101 insertions(+) create mode 100644 .github/workflows/release-please.yml create mode 100644 .release-please-manifest.json create mode 100644 release-please-config.json diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 5a670f6..0a5ffd6 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -17,6 +17,17 @@ name: publish on: + ### Periodically rebuild all the images to fix os security vulnerabilities + schedule: + # At 05:00 AM, only on Tuesday + #- cron: "0 5 * * 2" + # At 05:00 AM, only on Friday + - cron: "0 5 * * 5" + # The release should be created manually (or with user token=pr approval/merge) in order to trigger the event + 
### https://github.com/orgs/community/discussions/25281 + ### Instead of using the event, we call the workflow from release-please workflow (more secure) + #release: + # types: [published] workflow_dispatch: diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml new file mode 100644 index 0000000..f73f3c2 --- /dev/null +++ b/.github/workflows/release-please.yml @@ -0,0 +1,70 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: release-please + +on: + pull_request: + types: + - closed + branches: + - main + +permissions: + contents: write + pull-requests: write + +# https://docs.github.com/en/actions/using-jobs/using-concurrency +concurrency: + # Only cancel in-progress jobs or runs for the current workflow - matches against branch & tags + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash + +jobs: + release-please: + runs-on: ubuntu-latest + outputs: + release_created: ${{ steps.release-please.outputs.release_created }} + tag_name: ${{ steps.release-please.outputs.tag_name }} + # Skip the release process in the fork + # The pull request should come from the same repo (github_token from the fork does not have write permissions) + if: github.repository_owner == 'OKDP' && github.event.pull_request.merged == true && github.event.pull_request.head.repo.full_name == github.repository + steps: + - uses: 
google-github-actions/release-please-action@v4 + id: release-please + + publish: + runs-on: ubuntu-latest + needs: [release-please] + if: needs.release-please.outputs.release_created == 'true' + permissions: + contents: write + actions: write + packages: write + steps: + - name: "Publish images to official registry" + env: + GH_REPO: ${{ github.repository }} + GH_TOKEN: ${{ github.token }} + GH_DEBUG: api + run: | + gh workflow run publish.yml + shell: bash + \ No newline at end of file diff --git a/.release-please-manifest.json b/.release-please-manifest.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/.release-please-manifest.json @@ -0,0 +1 @@ +{} diff --git a/release-please-config.json b/release-please-config.json new file mode 100644 index 0000000..c7275fa --- /dev/null +++ b/release-please-config.json @@ -0,0 +1,19 @@ +{ + "extra-files": [ + "README.md" + ], + "packages": { + ".": { + "changelog-path": "CHANGELOG.md", + "release-type": "simple", + "changelog-type": "default", + "bump-minor-pre-major": false, + "bump-patch-for-minor-pre-major": false, + "draft": false, + "prerelease": false, + "skip-snapshot": false + } + }, + "$schema": "https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json" +} + From 980b0011181f23ed6867b1e4c32489f8904b3543 Mon Sep 17 00:00:00 2001 From: iizitounene Date: Fri, 29 Mar 2024 15:31:09 +0100 Subject: [PATCH 13/17] fix(spark-base): Add missing gpg keys in the spark project release keys --- spark-base/Dockerfile | 8 +++++++- spark-base/MISSING-GPG-KEYS.yml | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 spark-base/MISSING-GPG-KEYS.yml diff --git a/spark-base/Dockerfile b/spark-base/Dockerfile index 78326f5..35e96e6 100644 --- a/spark-base/Dockerfile +++ b/spark-base/Dockerfile @@ -32,6 +32,9 @@ ENV SPARK_VERSION ${SPARK_VERSION} ENV HADOOP_VERSION ${HADOOP_VERSION} ENV SCALA_VERSION ${SCALA_VERSION} +## Add missing gpg keys from 
https://downloads.apache.org/spark/KEYS +COPY MISSING-GPG-KEYS.yml . + RUN groupadd --system --gid=${spark_uid} spark && \ useradd --system --uid=${spark_uid} --gid=spark spark @@ -58,8 +61,11 @@ RUN set -ex;\ curl --retry 3 --retry-all-errors -k ${SPARK_DIST_DOWNLOAD_URL}/${DIST}.tgz -o ${WORK_DIR}/spark.tgz; \ curl --retry 3 --retry-all-errors -k ${SPARK_DIST_DOWNLOAD_URL}/${DIST}.tgz.asc -o ${WORK_DIR}/spark.tgz.asc; \ curl --retry 3 --retry-all-errors -k https://downloads.apache.org/spark/KEYS -o ${WORK_DIR}/KEYS; \ + MISSING_KEYS=($(cat MISSING-GPG-KEYS.yml | grep "keys:" -A300 | awk -F: '{ print $2 }' | tr -d '\n' | tr -d \"\" )); \ export GNUPGHOME="$(mktemp -d)"; \ gpg --batch --import ${WORK_DIR}/KEYS; \ + gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys ${MISSING_KEYS} || true; \ + gpg --batch --keyserver hkps://keyserver.ubuntu.com --recv-keys ${MISSING_KEYS} || true; \ gpg --batch --verify ${WORK_DIR}/spark.tgz.asc ${WORK_DIR}/spark.tgz; \ tar --strip-components=1 -zxvf ${WORK_DIR}/spark.tgz -C ${SPARK_HOME}/; \ chown -R spark:spark ${SPARK_HOME}/; \ @@ -67,7 +73,7 @@ RUN set -ex;\ mv ${SPARK_HOME}/kubernetes/tests ${SPARK_HOME}/; \ chmod a+x /opt/decom.sh; \ gpgconf --kill all; \ - rm -rf ${GNUPGHOME} ${WORK_DIR}; \ + rm -rf ${GNUPGHOME} ${WORK_DIR} MISSING-GPG-KEYS.yml; \ rm -fr ${SPARK_HOME}/conf rm -fr ${SPARK_HOME}/yarn rm -fr ${SPARK_HOME}/kubernetes COPY entrypoint.sh /opt/entrypoint.sh diff --git a/spark-base/MISSING-GPG-KEYS.yml b/spark-base/MISSING-GPG-KEYS.yml new file mode 100644 index 0000000..241021b --- /dev/null +++ b/spark-base/MISSING-GPG-KEYS.yml @@ -0,0 +1,23 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Some gpg keys are missing in the spark project release key https://downloads.apache.org/spark/KEYS +## We add them manually thanks to apache/spark-docker official images repo: +#### https://github.com/apache/spark-docker/blob/master/tools/template.py +keys: + # issuer "yumwang@apache.org" + - "3.3.1": "86727D43E73A415F67A0B1A14E68B3E6CD473653" From d81faae2ea1ae023ed7478f7b1c20287c94a702c Mon Sep 17 00:00:00 2001 From: iizitounene Date: Tue, 2 Apr 2024 13:49:09 +0200 Subject: [PATCH 14/17] Add license file --- LICENSE | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c16bed3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2024 tosit.io + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. From 8f552459c840b2e3d880dfb71062fe0785c12008 Mon Sep 17 00:00:00 2001 From: iizitounene Date: Tue, 2 Apr 2024 13:49:22 +0200 Subject: [PATCH 15/17] Update documentation --- README.md | 57 +++++++++- docs/images/spark-images.drawio.svg | 156 ++++++++++++++++++++++++++++ 2 files changed, 211 insertions(+), 2 deletions(-) create mode 100644 docs/images/spark-images.drawio.svg diff --git a/README.md b/README.md index b9d63e1..85dcdd5 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,55 @@ -# spark-images -Collection of Spark docker images for OKDP +[![ci](https://github.com/okdp/spark-images/actions/workflows/ci.yml/badge.svg)](https://github.com/okdp/spark-images/actions/workflows/ci.yml) +[![Release](https://img.shields.io/github/v/release/okdp/spark-images)](https://github.com/okdp/spark-images/releases/latest) +[![License Apache2](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0) + + +Collection of [Apache Spark](https://spark.apache.org/) docker images for [OKDP platform](https://okdp.io/). + +Currently, the images are built from the [Apache Spark project distribution](https://archive.apache.org/dist/spark) and the requirement may evolve to produce them from the [source code](https://github.com/apache/spark). + +The image relashionship is described by the following diagram: + +

+ +

+ + + + +| Image | Description | +|:---------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `JRE` | The JRE LTS base image supported by Apache Spark depending on the version. This includes Java 11/17/21. Please, check the [reference versions](.build/reference-versions.yml) or [Apache Spark website](https://spark.apache.org/docs/latest/) for more information. | +| `spark-base` | The Apache Spark base image with official spark binaries (scala/java) and without OKDP extensions. | +| `spark` | The Apache Spark image with official spark binaries (scala/java) and OKDP extensions. | +| `spark-py` | The Apache Spark image with official spark binaries (scala/java), OKDP extensions and python support. | +| `spark-r` | The Apache Spark image with official spark binaries (scala/java), OKDP extensions and R support. | + +# Tagging + +The project builds the images with a long format tags. Each tag combines multiple compatible versions combinations. + +There are multiple tags levels and the format to use is depending on your convenience in term of stability and reproducibility. 
+ +The images are pushed to [OKDP quay.io](https://quay.io/organization/okdp) repository with the following [tags](.build/images.yml): + +| Images | Tags | +|:--------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| spark-base, spark | spark--scala--java-

spark--scala--java--

spark--scala--java--

spark--scala--java--- | +| spark-py | spark--python--scala--java-

spark--python--scala--java--

spark--python--scala--java--

spark--python--scala--java--- | +| spark-r | spark--r--scala--java-

spark--r--scala--java--

spark--r--scala--java--

spark--r--scala--java--- | + +> [!NOTE] +> 1. `` corresponds to the Github [release version](https://github.com/okdp/spark-images/releases) or [git tag](https://github.com/okdp/spark-images/tags) without the leading `v`. +> Ex.: 1.0.0 +> +> 2. `` corresponds to the images build date with the `YYYY-MM-DD` format. The latest release tag is built every week. +> +> An example of a `py-spark` image with a long form tag including `spark/java/scala/python` compatible versions and a `build date` with a `release version` is: +> +> `quay.io/okdp/spark-py:spark-3.3.4-python-3.10-scala-2.12-java-17-2024-03-29-1.0.0`. +> + +# Alternatives + +- [Official images](https://github.com/apache/spark-docker) + diff --git a/docs/images/spark-images.drawio.svg b/docs/images/spark-images.drawio.svg new file mode 100644 index 0000000..f284035 --- /dev/null +++ b/docs/images/spark-images.drawio.svg @@ -0,0 +1,156 @@ + + + + + + + + + + + + + +
+
+
+ + + eclipse-temurin:jre (LTS) + + +
+
+
+
+ + eclipse-temurin:jre... + +
+
+ + + + + + + + + + + +
+
+
+ + + spark-base + + +
+
+
+
+ + spark-base + +
+
+ + + + + + +
+
+
+ + + spark + + +
+
+
+
+ + spark + +
+
+ + + + + + + + + + + +
+
+
+ + + spark-py + + +
+
+
+
+ + spark-py + +
+
+ + + + + + + + + + + +
+
+
+ + + spark-r + + +
+
+
+
+ + spark-r + +
+
+ + + + + + + +
+ + + + + Text is not SVG - cannot display + + + +
\ No newline at end of file From 5da5b8a734c40d178e35d7981434afc66fc29604 Mon Sep 17 00:00:00 2001 From: iizitounene Date: Thu, 4 Apr 2024 09:18:47 +0200 Subject: [PATCH 16/17] feat(spark): Minimize minio/aws sdk v1/v2 depedendencies to reduce spark image size --- spark/Dockerfile | 17 +++- spark/minio.pom | 209 ++++++++++++++++++++++++++++++++++++++++++ spark/okdp-addons.pom | 42 ++++++++- 3 files changed, 262 insertions(+), 6 deletions(-) create mode 100644 spark/minio.pom diff --git a/spark/Dockerfile b/spark/Dockerfile index 8f92fb9..d50d6ad 100644 --- a/spark/Dockerfile +++ b/spark/Dockerfile @@ -33,22 +33,32 @@ RUN set -ex; \ WORKDIR /workspace -COPY okdp-addons.pom deps/pom.xml +COPY okdp-addons.pom . +COPY minio.pom . # The setup consumes less space compare to inheriting from the parent pom # Handles the transitive dependencies versions through the pom # Manage Java AWS SDK v1 (hadoop <3.4)/V2 (hadoop >=3.4) # Some pom.xml versions comes with Control M +# Minio and AWS profiles are mutually exclusive: aws includes minio RUN mvn -ntp dependency:get -DgroupId=org.apache.spark -DartifactId=spark-parent_${SCALA_VERSION} -Dversion=${SPARK_VERSION} -Dpackaging=pom; \ mvn -ntp dependency:copy -Dartifact=org.apache.spark:spark-parent_${SCALA_VERSION}:${SPARK_VERSION}:pom -Dproject.basedir=./ -DoutputDirectory=./; \ dos2unix spark-parent_${SCALA_VERSION}-${SPARK_VERSION}.pom; \ HADOOP_VERSION=$(grep "" spark-parent_${SCALA_VERSION}-${SPARK_VERSION}.pom | tr -d ' ' | sed -e 's/^ *\(.*\)<\/hadoop.version> *$/\1/' | sort -rn | head -n 1); \ - mv ./deps/pom.xml .; \ + mvn -ntp dependency:get -DgroupId=org.apache.hadoop -DartifactId=hadoop-aws -Dversion=${HADOOP_VERSION} -Dpackaging=pom; \ + mvn -ntp dependency:copy -Dartifact=org.apache.hadoop:hadoop-aws:${HADOOP_VERSION}:pom -Dproject.basedir=./ -DoutputDirectory=./; \ + AWS_JAVA_SDK_VERSION=$(mvn -ntp dependency:tree -f hadoop-aws-${HADOOP_VERSION}.pom | grep -E 
"(com.amazonaws|software.amazon.awssdk):(aws-java-sdk-bundle|bundle):jar:.*:compile" | awk '{ print $NF }' | awk -F: '{ print $4 }'); \ + mvn -ntp clean install \ + -Daws-java-sdk.version=${AWS_JAVA_SDK_VERSION} \ + -Daws-sdk-profile.version=v$(echo ${AWS_JAVA_SDK_VERSION} | cut -d '.' -f 1) \ + -f minio.pom; \ mvn -ntp clean dependency:copy-dependencies \ -Dspark.version=${SPARK_VERSION} \ -Dscala.version=${SCALA_VERSION} \ -Dhadoop.version=${HADOOP_VERSION} \ - -Paws + -Daws-java-sdk.version=${AWS_JAVA_SDK_VERSION} \ + -Pminio \ + -f okdp-addons.pom FROM $BASE_IMAGE @@ -56,7 +66,6 @@ ENV JMX_CONF_DIR /etc/metrics/conf/ # OKDP addons COPY --from=okdp_addons --chown=spark:spark /workspace/target/dependency/* $SPARK_HOME/jars -RUN chown -R spark:spark ${SPARK_HOME}/jars/ # Jmx prometheus metrics COPY --chown=spark:spark metrics.properties ${JMX_CONF_DIR}/metrics.properties diff --git a/spark/minio.pom b/spark/minio.pom new file mode 100644 index 0000000..e2c37ca --- /dev/null +++ b/spark/minio.pom @@ -0,0 +1,209 @@ + + + + + 4.0.0 + OKDP AWS SDK for Java - Minio Bundle + com.amazonaws + okdp-minio-aws-s3-bundle + ${aws-java-sdk.version} + jar + + OKDP AWS SDK for Java - Minio Bundle + The bundle contains S3 service only with around 6.5MB instead of +350MB (+540MB in v2 bundle) + + + UTF-8 + + + + + + minio-aws-java-sdk-s3-v1 + + + aws-sdk-profile.version + v1 + + + + + com.amazonaws + aws-java-sdk-s3 + ${aws-java-sdk.version} + + + + com.amazonaws + aws-java-sdk-dynamodb + ${aws-java-sdk.version} + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + package + + shade + + + false + true + + + joda-time:joda-time + com.fasterxml.jackson.core:* + com.fasterxml.jackson.dataformat:jackson-dataformat-cbor + org.apache.httpcomponents:* + commons-codec:commons-codec + commons-logging:commons-logging + io.netty:* + com.amazonaws:* + + + + + org.joda + com.amazonaws.thirdparty.joda + + + com.fasterxml.jackson + com.amazonaws.thirdparty.jackson + + + org.apache.http 
+ com.amazonaws.thirdparty.apache.http + + + org.apache.commons.codec + com.amazonaws.thirdparty.apache.codec + + + org.apache.commons.logging + com.amazonaws.thirdparty.apache.logging + + + io.netty + com.amazonaws.thirdparty.io.netty + + + + + + + + + + + + + minio-aws-java-sdk-s3-v2 + + + aws-sdk-profile.version + v2 + + + + + software.amazon.awssdk + s3 + ${aws-java-sdk.version} + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + package + + shade + + + false + true + + + com.fasterxml.jackson.jr:* + io.netty:* + org.apache.httpcomponents:* + org.reactivestreams:* + org.slf4j:* + commons-codec:commons-codec + software.amazon.awssdk:* + software.amazon:* + software.amazon.s3.accessgrants:* + com.github.ben-manes.caffeine:* + commons-logging:* + + + + + org.apache + software.amazon.awssdk.thirdparty.org.apache + + org.apache.log4j.* + + + + io.netty + software.amazon.awssdk.thirdparty.io.netty + + + org.slf4j + software.amazon.awssdk.thirdparty.org.slf4j + + + + + + + + + + + \ No newline at end of file diff --git a/spark/okdp-addons.pom b/spark/okdp-addons.pom index eb94065..ef663c6 100644 --- a/spark/okdp-addons.pom +++ b/spark/okdp-addons.pom @@ -82,8 +82,46 @@ - + + minio + + + org.apache.hadoop + hadoop-aws + ${hadoop.version} + + + * + * + + + + + + com.amazonaws + okdp-minio-aws-s3-bundle + ${aws-java-sdk.version} + + + + org.apache.spark + spark-hadoop-cloud_${scala.version} + ${spark.version} + + + * + * + + + + + + + aws From 47968145005b129f517643e2e2ac0adfea92a985 Mon Sep 17 00:00:00 2001 From: iizitounene Date: Thu, 4 Apr 2024 09:24:50 +0200 Subject: [PATCH 17/17] Prepare first official release --- .build/ci-versions.yml | 11 ++++++----- .build/reference-versions.yml | 2 +- .build/release-versions.yml | 2 +- .github/workflows/build-image-template.yml | 15 ++++++++++++++- .github/workflows/ci.yml | 2 ++ .github/workflows/publish.yml | 5 +++-- README.md | 22 +++++++++++++++------- 7 files changed, 42 insertions(+), 17 deletions(-) diff 
--git a/.build/ci-versions.yml b/.build/ci-versions.yml index d6118b7..c36821f 100644 --- a/.build/ci-versions.yml +++ b/.build/ci-versions.yml @@ -14,10 +14,11 @@ # limitations under the License. # -########### ~~~ CI TEST MATRIX VERSIONS ~~~ ############################################################################ -#### PUT THE SPARK VERSIONS TO TEST IN CORRESPONDANCE WITH 'reference-versions.yml' FILE ############################### -#### !!! ANY DECLARED TEST VERSION WHICH IS NOT PRESENT IN 'reference-versions.yml' FILE IS SKIPPED DURING BUILD !!! ### -#### REMOVE, UPDATE OR ADD VERSIONS TO TEST ############################################################################ +########### ~~~ CI TEST MATRIX VERSIONS ~~~ ###################################################################################### +########### DEFINE A CI REFERENCE COMBINATIONS TO TEST AND PREVENT TESTING ALL THE COMBINATION WITCH LEADS TO TAKE A LOT OF TIME # +#### PUT THE SPARK VERSIONS TO TEST IN CORRESPONDANCE WITH 'reference-versions.yml' FILE ######################################### +#### !!! ANY DECLARED TEST VERSION WHICH IS NOT PRESENT IN 'reference-versions.yml' FILE IS SKIPPED DURING BUILD !!! 
############# +#### REMOVE, UPDATE OR ADD VERSIONS TO TEST ###################################################################################### versions: # Maximum python version supported by spark-3.2.x: 3.9 # Java support: 8/11 @@ -43,7 +44,7 @@ versions: # https://spark.apache.org/releases/spark-release-3-5-0.html # Minimum supported java version: 17/21 - python_version: 3.11 - spark_version: [3.5.0] + spark_version: [3.5.1] java_version: [17] scala_version: [2.13] hadoop_version: 3 diff --git a/.build/reference-versions.yml b/.build/reference-versions.yml index 3369ece..450ca54 100644 --- a/.build/reference-versions.yml +++ b/.build/reference-versions.yml @@ -43,7 +43,7 @@ versions: # https://spark.apache.org/releases/spark-release-3-5-0.html # Minimum supported java version: 17/21 - python_version: 3.11 - spark_version: [3.5.0] + spark_version: [3.5.1] java_version: [17] scala_version: [2.12, 2.13] hadoop_version: 3 diff --git a/.build/release-versions.yml b/.build/release-versions.yml index 6bfc534..fe814fe 100644 --- a/.build/release-versions.yml +++ b/.build/release-versions.yml @@ -43,7 +43,7 @@ versions: # https://spark.apache.org/releases/spark-release-3-5-0.html # Minimum supported java version: 17/21 - python_version: 3.11 - spark_version: [3.5.0] + spark_version: [3.5.1] java_version: [17] scala_version: [2.12, 2.13] hadoop_version: 3 diff --git a/.github/workflows/build-image-template.yml b/.github/workflows/build-image-template.yml index ad39c32..3615a1f 100644 --- a/.github/workflows/build-image-template.yml +++ b/.github/workflows/build-image-template.yml @@ -140,6 +140,13 @@ jobs: BASE_IMAGE=${{ steps.image-tags.outputs.parent_image }} tags: | ${{ steps.registry-repos.outputs.ci_repo }}/${{ inputs.image }}:${{ steps.image-tags.outputs.latest_tag }} + labels: | + org.opencontainers.image.title="${{ inputs.image }}" + org.opencontainers.image.version="${{ inputs.spark_version}}" + org.opencontainers.image.description="Spark image" + 
org.opencontainers.image.base.name="${{ steps.image-tags.outputs.parent_image }}" + org.opencontainers.image.source="https://github.com/${{ github.repository }}" + org.opencontainers.image.licenses="Apache-2.0" ### CI Steps # https://github.com/nektos/act/issues/678 @@ -202,5 +209,11 @@ jobs: HADOOP_VERSION=${{ inputs.hadoop_version }} BASE_IMAGE=${{ steps.image-tags.outputs.parent_image }} tags: ${{ steps.image-tags.outputs.publish_tags }} - + labels: | + org.opencontainers.image.title="${{ inputs.image }}" + org.opencontainers.image.version="${{ inputs.spark_version}}" + org.opencontainers.image.description="Spark image" + org.opencontainers.image.base.name="${{ steps.image-tags.outputs.parent_image }}" + org.opencontainers.image.source="https://github.com/${{ github.repository }}" + org.opencontainers.image.licenses="Apache-2.0" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2b0a449..0da6ca0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,6 +18,8 @@ name: ci on: pull_request: + branches: + - main paths: - ".github/workflows/**" - ".github/actions/**" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 0a5ffd6..4d8f7d4 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,8 +21,9 @@ on: schedule: # At 05:00 AM, only on Tuesday #- cron: "0 5 * * 2" - # At 05:00 AM, only on Friday - - cron: "0 5 * * 5" + # https://crontab.cronhub.io/ + # At 05:"0 AM, only on Tuesday + - cron: "0 5 * * 2" # The release should be created manually (or with user token=pr approval/merge) in order to trigger the event ### https://github.com/orgs/community/discussions/25281 ### Instead of using the event, we call the workflow from release-please workflow (more secure) diff --git a/README.md b/README.md index 85dcdd5..8b01d7e 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![License 
Apache2](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0) -Collection of [Apache Spark](https://spark.apache.org/) docker images for [OKDP platform](https://okdp.io/). +Collection of [Apache Spark](https://spark.apache.org/) docker images for [OKDP Platform](https://okdp.io/). Currently, the images are built from the [Apache Spark project distribution](https://archive.apache.org/dist/spark) and the requirement may evolve to produce them from the [source code](https://github.com/apache/spark). @@ -28,9 +28,9 @@ The image relashionship is described by the following diagram: The project builds the images with a long format tags. Each tag combines multiple compatible versions combinations. -There are multiple tags levels and the format to use is depending on your convenience in term of stability and reproducibility. +There are multiple tags levels and the format to use depends on your convenience in term of stability and reproducibility. -The images are pushed to [OKDP quay.io](https://quay.io/organization/okdp) repository with the following [tags](.build/images.yml): +The images are pushed to [quay.io/okdp](https://quay.io/organization/okdp) repository with the following [tags](.build/images.yml): | Images | Tags | |:--------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| @@ -40,13 +40,21 @@ The images are pushed to [OKDP quay.io](https://quay.io/organization/okdp) repos > [!NOTE] > 1. 
`` corresponds to the Github [release version](https://github.com/okdp/spark-images/releases) or [git tag](https://github.com/okdp/spark-images/tags) without the leading `v`. -> Ex.: 1.0.0 +> Ex.: 1.0.0 > -> 2. `` corresponds to the images build date with the `YYYY-MM-DD` format. The latest release tag is built every week. +> 2. `` corresponds to the images build date with the `YYYY-MM-DD` format. The latest release tag is rebuilt every week to ensure the OS image is up to date against the latest security updates. > -> An example of a `py-spark` image with a long form tag including `spark/java/scala/python` compatible versions and a `build date` with a `release version` is: +> You may need to switch to the latest release version if your are using the long form tag image with a ``. Please, check the [changelog](https://github.com/okdp/spark-images/releases) to see the notable impacts. +> +> An example of `py-spark` image with a long form tag including `spark/java/scala/python` compatible versions and a `` with a `` is: +> +> `quay.io/okdp/spark-py:spark-3.5.1-python-3.11-scala-2.13-java-17-2024-04-04-1.0.0`. +> +> The corresponding changelog is [releases/tag/v1.0.0](https://github.com/okdp/spark-images/releases/tag/v1.0.0). +> +> 3. You can also use the latest tag without `` and `` which is always up to date with the latest security updates. > -> `quay.io/okdp/spark-py:spark-3.3.4-python-3.10-scala-2.12-java-17-2024-03-29-1.0.0`. +> An example of `py-spark` image with the latest tag is: `quay.io/okdp/spark-py:spark-3.5.1-python-3.11-scala-2.13-java-17` > # Alternatives