diff --git a/.build/ci-versions.yml b/.build/ci-versions.yml
new file mode 100644
index 0000000..c36821f
--- /dev/null
+++ b/.build/ci-versions.yml
@@ -0,0 +1,51 @@
+#
+# Copyright 2024 tosit.io
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+########### ~~~ CI TEST MATRIX VERSIONS ~~~ ######################################################################################
+########### DEFINE THE CI REFERENCE COMBINATIONS TO TEST, TO AVOID TESTING EVERY COMBINATION, WHICH TAKES A LOT OF TIME ##########
+#### PUT THE SPARK VERSIONS TO TEST IN CORRESPONDENCE WITH THE 'reference-versions.yml' FILE #####################################
+#### !!! ANY DECLARED TEST VERSION WHICH IS NOT PRESENT IN 'reference-versions.yml' FILE IS SKIPPED DURING BUILD !!! #############
+#### REMOVE, UPDATE OR ADD VERSIONS TO TEST ######################################################################################
+versions:
+  # Maximum python version supported by spark-3.2.x: 3.9
+  # Java support: 8/11
+  - python_version: 3.9
+    spark_version: [3.2.4]
+    java_version: [11]
+    scala_version: [2.12]
+    hadoop_version: 3.2
+  # Maximum python version supported by spark-3.3.x: 3.10
+  # Java support: 8/11/17
+  - python_version: '3.10'
+    spark_version: [3.3.4]
+    java_version: [17]
+    scala_version: [2.12, 2.13]
+    hadoop_version: 3
+  # Maximum python version supported by spark-3.4.x: 3.11
+  # Java support: 8/11/17
+  - python_version: 3.11
+    spark_version: [3.4.2]
+    java_version: [17]
+    scala_version: [2.12, 2.13]
+    hadoop_version: 3
+  # https://spark.apache.org/releases/spark-release-3-5-0.html
+  # Minimum supported java version: 17/21
+  - python_version: 3.11
+    spark_version: [3.5.1]
+    java_version: [17]
+    scala_version: [2.13]
+    hadoop_version: 3
+
diff --git a/.build/images.yml b/.build/images.yml
new file mode 100644
index 0000000..da019f2
--- /dev/null
+++ b/.build/images.yml
@@ -0,0 +1,60 @@
+#
+# Copyright 2024 tosit.io
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +images: + - name: docker.io/eclipse-temurin + tags: + - ${java_version}-jre-jammy + - name: spark-base + dependsOn: docker.io/eclipse-temurin + tags: + - spark-${spark_version}-scala-${scala_version}-java-${java_version} + - spark-${spark_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') + - spark-${spark_version}-scala-${scala_version}-java-${java_version}-${git_release_version} + - spark-${spark_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d')-${git_release_version} + #- spark-${spark_version}-scala-${scala_version}-java-${java_version}-${git_commit_short_sha} + - name: spark + dependsOn: spark-base + tags: + - spark-${spark_version}-scala-${scala_version}-java-${java_version} + - spark-${spark_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') + - spark-${spark_version}-scala-${scala_version}-java-${java_version}-${git_release_version} + - spark-${spark_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d')-${git_release_version} + #- spark-${spark_version}-scala-${scala_version}-java-${java_version}-${git_commit_short_sha} + - name: spark-py + dependsOn: spark + tags: + - spark-${spark_version}-python-${python_version}-scala-${scala_version}-java-${java_version} + - spark-${spark_version}-python-${python_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') + - spark-${spark_version}-python-${python_version}-scala-${scala_version}-java-${java_version}-${git_release_version} + - spark-${spark_version}-python-${python_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d')-${git_release_version} + #- spark-${spark_version}-python-${python_version}-scala-${scala_version}-java-${java_version}-${git_commit_short_sha} + - name: spark-r + dependsOn: spark + tags: + - spark-${spark_version}-r-${r_version}-scala-${scala_version}-java-${java_version} + - spark-${spark_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') + - spark-${spark_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-${git_release_version} + - spark-${spark_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d')-${git_release_version} + #- spark-${spark_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-${git_commit_short_sha} + - name: spark-py-r + dependsOn: spark-py + tags: + - spark-${spark_version}-python-${python_version}-r-${r_version}-scala-${scala_version}-java-${java_version} + - spark-${spark_version}-python-${python_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d') + - spark-${spark_version}-python-${python_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-${git_release_version} + - spark-${spark_version}-python-${python_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-$(date '+%Y-%m-%d')-${git_release_version} + #- spark-${spark_version}-python-${python_version}-r-${r_version}-scala-${scala_version}-java-${java_version}-${git_commit_short_sha} diff --git a/.build/reference-versions.yml b/.build/reference-versions.yml new file mode 100644 index 0000000..450ca54 --- /dev/null +++ b/.build/reference-versions.yml @@ -0,0 +1,50 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+### REFERENCE MATRIX VERSIONS ##############################
+#### !!! DO NOT DELETE ANY ELEMENT !!! #####################
+######## APPEND ONLY WHEN A NEW SPARK VERSION IS RELEASED ##
+############ USED AS REFERENCE DURING BUILD ################
+versions:
+  # Maximum python version supported by spark-3.2.x: 3.9
+  # Java support: 8/11
+  - python_version: 3.9
+    spark_version: [3.2.1, 3.2.2, 3.2.3, 3.2.4]
+    java_version: [11]
+    scala_version: [2.12, 2.13]
+    hadoop_version: 3.2
+  # Maximum python version supported by spark-3.3.x: 3.10
+  # Java support: 8/11/17
+  - python_version: '3.10'
+    spark_version: [3.3.1, 3.3.2, 3.3.3, 3.3.4]
+    java_version: [17]
+    scala_version: [2.12, 2.13]
+    hadoop_version: 3
+  # Maximum python version supported by spark-3.4.x: 3.11
+  # Java support: 8/11/17
+  - python_version: 3.11
+    spark_version: [3.4.1, 3.4.2]
+    java_version: [17]
+    scala_version: [2.12, 2.13]
+    hadoop_version: 3
+  # https://spark.apache.org/releases/spark-release-3-5-0.html
+  # Minimum supported java version: 17/21
+  - python_version: 3.11
+    spark_version: [3.5.1]
+    java_version: [17]
+    scala_version: [2.12, 2.13]
+    hadoop_version: 3
+
diff --git a/.build/release-versions.yml b/.build/release-versions.yml
new file mode 100644
index 0000000..fe814fe
--- /dev/null
+++ b/.build/release-versions.yml
@@ -0,0 +1,50 @@
+#
+# Copyright 2024 tosit.io
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+########### CURRENT MATRIX VERSIONS ################################################################################
+#### PUT THE SPARK VERSIONS TO BUILD IN CORRESPONDENCE WITH THE 'reference-versions.yml' FILE ######################
+#### !!! ANY DECLARED VERSION WHICH IS NOT PRESENT IN 'reference-versions.yml' FILE IS SKIPPED DURING BUILD !!! ####
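+#### EXAMPLE (ASSUMED VALUES): DECLARING spark_version [9.9.9] BELOW WOULD BE A NO-OP, BECAUSE 9.9.9 IS NOT #######
+#### IN 'reference-versions.yml'; THE VERSION-MATRIX ACTION ONLY KEEPS THE INTERSECTION OF THE TWO FILES ##########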
+#### REMOVE, UPDATE OR ADD VERSIONS ################################################################################
+versions:
+  # Maximum python version supported by spark-3.2.x: 3.9
+  # Java support: 8/11
+  - python_version: 3.9
+    spark_version: [3.2.1, 3.2.2, 3.2.3, 3.2.4]
+    java_version: [11]
+    scala_version: [2.12, 2.13]
+    hadoop_version: 3.2
+  # Maximum python version supported by spark-3.3.x: 3.10
+  # Java support: 8/11/17
+  - python_version: '3.10'
+    spark_version: [3.3.1, 3.3.2, 3.3.3, 3.3.4]
+    java_version: [17]
+    scala_version: [2.12, 2.13]
+    hadoop_version: 3
+  # Maximum python version supported by spark-3.4.x: 3.11
+  # Java support: 8/11/17
+  - python_version: 3.11
+    spark_version: [3.4.1, 3.4.2]
+    java_version: [17]
+    scala_version: [2.12, 2.13]
+    hadoop_version: 3
+  # https://spark.apache.org/releases/spark-release-3-5-0.html
+  # Minimum supported java version: 17/21
+  - python_version: 3.11
+    spark_version: [3.5.1]
+    java_version: [17]
+    scala_version: [2.12, 2.13]
+    hadoop_version: 3
+
diff --git a/.github/actions/free-disk-space/action.yml b/.github/actions/free-disk-space/action.yml
new file mode 100644
index 0000000..4f661da
--- /dev/null
+++ b/.github/actions/free-disk-space/action.yml
@@ -0,0 +1,40 @@
+#
+# Copyright 2024 tosit.io
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+name: Free disk space
+description: Free GitHub runner disk space
+
+runs:
+  using: composite
+  steps:
+    - name: Free Disk Space (Ubuntu)
+      uses: jlumbroso/free-disk-space@main
+      with:
+        # setting this to "true" frees about 6 GB,
+        # but it might remove tools that are actually needed
+        tool-cache: false
+
+        # all of these default to true, but feel free to set to
+        # "false" if necessary for your workflow
+        android: true
+        dotnet: true
+        haskell: true
+        large-packages: true
+        docker-images: true
+        swap-storage: true
+
+
diff --git a/.github/actions/setup-buildx/action.yaml b/.github/actions/setup-buildx/action.yaml
new file mode 100644
index 0000000..6de6665
--- /dev/null
+++ b/.github/actions/setup-buildx/action.yaml
@@ -0,0 +1,29 @@
+#
+# Copyright 2024 tosit.io
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+name: Set up QEMU and Docker Buildx
+description: Set up QEMU and Docker Buildx
+
+runs:
+  using: composite
+  steps:
+    - name: Set up QEMU 📦
+      uses: docker/setup-qemu-action@v3
+
+    - name: Set up Docker Buildx 📦
+      uses: docker/setup-buildx-action@v3
+      with:
+        driver-opts: network=host
\ No newline at end of file
diff --git a/.github/actions/setup-kind/action.yaml b/.github/actions/setup-kind/action.yaml
new file mode 100644
index 0000000..e16035f
--- /dev/null
+++ b/.github/actions/setup-kind/action.yaml
@@ -0,0 +1,37 @@
+#
+# Copyright 2024 tosit.io
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+name: Setup kind
+description: Deploy a kind cluster
+
+runs:
+  using: composite
+  steps:
+    - name: Create k8s Kind Cluster
+      uses: helm/kind-action@v1
+      with:
+        # https://github.com/helm/kind-action?tab=readme-ov-file#inputs
+        verbosity: 10
+        cluster_name: "kind-ci-${{ github.job }}"
+        ignore_failed_clean: true # Ignore failures of the post-run delete cluster action
+        wait: "180s" # Maximum time to wait for Kind to become ready
+
+    - name: Print Kind cluster state
+      run: |
+        kubectl cluster-info
+        kubectl get pods -A
+        kubectl describe node
+      shell: bash
\ No newline at end of file
diff --git a/.github/actions/spark-image-tag/action.yaml b/.github/actions/spark-image-tag/action.yaml
new file mode 100644
index 0000000..0b2b0fe
--- /dev/null
+++ b/.github/actions/spark-image-tag/action.yaml
@@ -0,0 +1,137 @@
+#
+# Copyright 2024 tosit.io
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+name: Generate spark image tags
+description: Generate spark image tags
+
+inputs:
+  image:
+    description: Image name
+    required: true
+  spark_version:
+    description: Spark version
+    required: true
+  scala_version:
+    description: Scala version
+    required: true
+  java_version:
+    description: Java version
+    required: true
+  python_version:
+    description: Python version
+    required: true
+  ci_repo:
+    description: The CI registry repo
+    required: false
+  git_tag_name:
+    description: The latest remote Git tag name
+    required: false
+  publish_repo:
+    description: The official registry repo
+    required: false
+  publish_to_registry:
+    description: Whether or not to push to the official registry repo
+    required: true
+
+outputs:
+  parent_image:
+    description: "Parent image (name:tag)"
+    value: ${{ steps.tags.outputs.parent_image }}
+  latest_tag:
+    description: "CI image tag (ex.: spark-3.3.4....)"
+    value: ${{ steps.tags.outputs.latest_tag }}
+  publish_tags:
+    description: "Image tags to push to the registry (ex.: quay.io/spark-r:spark-3.3.4...)"
+    value: ${{ steps.tags.outputs.publish_tags }}
+
+runs:
+  using: composite
+  steps:
+    - name: Install yq
+      run: |
+        sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/v4.42.1/yq_linux_amd64
+        sudo chmod a+x /usr/local/bin/yq
+      shell: bash
+
+    - name: Expose git commit sha as env variable
+      uses: rlespinasse/git-commit-data-action@v1.5.0
+
+    - name: Get current branch 📦
+      id: git-branch
+      uses: tj-actions/branch-names@v8
+
+    - name: Generate spark image tags 📦
+      id: tags
+      run: |
+        ### Inputs
+        ### Variables substitution used in the '.build/images.yml' file
+        spark_version=${{ inputs.spark_version }}
+        scala_version=${{ inputs.scala_version }}
+        java_version=${{ inputs.java_version }}
+        python_version=${{ inputs.python_version }}
+        git_tag_name=${{ inputs.git_tag_name }}
+        git_release_version=$(echo '${{ inputs.git_tag_name }}' | tr -d 'v')
+
+        git_commit_sha=${{ env.GIT_COMMIT_SHA }}
+        git_commit_short_sha=${{ env.GIT_COMMIT_SHORT_SHA }}
+
+
+        ### Outputs - Parse: .build/images.yml
+        PARENT_IMAGE_NAME=$(yq '(.images[] | select(.name == "${{ inputs.image }}").dependsOn)' .build/images.yml)
+        PARENT_IMAGE_NAME=$(eval echo ${PARENT_IMAGE_NAME})
+
+        PARENT_IMAGE_TAG=$(yq -oc "(.images[] | select(.name == \"${PARENT_IMAGE_NAME}\").tags[0])" .build/images.yml)
+        PARENT_IMAGE_TAG=$(eval echo ${PARENT_IMAGE_TAG})
+        PARENT_IMAGE_NAME="${PARENT_IMAGE_NAME}:${PARENT_IMAGE_TAG}"
+
+        LATEST_TAG=$(yq -oc '(.images[] | select(.name == "${{ inputs.image }}").tags[0])' .build/images.yml)
+        LATEST_TAG=$(eval echo ${LATEST_TAG})
+
+        PUBLISH_TAGS=$(yq -oc '[.images[] | select(.name == "${{ inputs.image }}").tags | .[] |"${{ inputs.publish_repo }}/${{ inputs.image }}:" + .]' .build/images.yml)
+        PUBLISH_TAGS=$(eval echo ${PUBLISH_TAGS})
+
+        ### For pull request branch merges, suffix the CI tag with the branch name
+        #### The tag is pushed to the CI registry only
+        CI_GIT_BRANCH_SUFFIX="${{ steps.git-branch.outputs.current_branch }}"
+        CI_GIT_BRANCH_SUFFIX=${CI_GIT_BRANCH_SUFFIX//\//-}
+
+        if [[ "${{ inputs.publish_to_registry }}" == "false" ]]
+        then
+          LATEST_TAG="${LATEST_TAG}-${CI_GIT_BRANCH_SUFFIX}"
+        fi
+
+        # The image can inherit from a community image like docker.io/eclipse-temurin, ...
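+        # Worked example under assumed values (image=spark, publish_to_registry=false,
+        # branch 'feat/x', ci_repo=ghcr.io/okdp): dependsOn resolves to 'spark-base' and the
+        # if-block below yields 'ghcr.io/okdp/spark-base:spark-3.4.2-scala-2.12-java-17-feat-x'.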
+ if [[ "${PARENT_IMAGE_NAME}" != *"/"* ]] + then + if [[ "${{ inputs.publish_to_registry }}" == "true" ]] + then + PARENT_IMAGE_NAME="${{ inputs.publish_repo }}/${PARENT_IMAGE_NAME}" + else + PARENT_IMAGE_NAME="${{ inputs.ci_repo }}/${PARENT_IMAGE_NAME}-${CI_GIT_BRANCH_SUFFIX}" + fi + fi + + # Logging + echo "parent_image=${PARENT_IMAGE_NAME}" + echo "latest_tag=${LATEST_TAG}" + echo "publish_tags=${PUBLISH_TAGS}" + # Set outputs + echo "parent_image=${PARENT_IMAGE_NAME}" >> $GITHUB_OUTPUT + echo "latest_tag=${LATEST_TAG}" >> $GITHUB_OUTPUT + echo "publish_tags=${PUBLISH_TAGS}" >> $GITHUB_OUTPUT + + shell: bash diff --git a/.github/actions/spark-tests-prepare/action.yml b/.github/actions/spark-tests-prepare/action.yml new file mode 100644 index 0000000..96242af --- /dev/null +++ b/.github/actions/spark-tests-prepare/action.yml @@ -0,0 +1,63 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: Prepare integration tests +description: Prepare integration tests + +inputs: + spark_version: + description: Spark version + required: true + scala_version: + description: Scala version + required: true + java_version: + description: Java version + required: true + +outputs: + git_tag_checkout_dir: + description: "Git checkout tag local source directory" + value: ${{ steps.git-checkout-tag.outputs.git_tag_checkout_dir }} + +runs: + using: composite + # https://github.com/apache/spark/blob/master/.github/workflows/build_and_test.yml + steps: + - name: Set up Java ${{ inputs.java_version }} + uses: actions/setup-java@v4 + with: + distribution: 'zulu' + java-version: ${{ inputs.java_version }} + + - name: Cache Scala, SBT and Maven + uses: actions/cache@v4 + with: + path: | + build/apache-maven-* + build/scala-* + build/*.jar + ~/.sbt + key: build-${{ inputs.spark_version }}-scala${{ inputs.scala_version }}-java${{ inputs.java_version }} + + - name: Cache Coursier local repository + uses: actions/cache@v4 + with: + path: ~/.cache/coursier + key: build-${{ inputs.spark_version }}-scala${{ inputs.scala_version }}-java${{ inputs.java_version }}-coursier + + + diff --git a/.github/actions/spark-tests-run/action.yml b/.github/actions/spark-tests-run/action.yml new file mode 100644 index 0000000..9063b9c --- /dev/null +++ b/.github/actions/spark-tests-run/action.yml @@ -0,0 +1,125 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+name: Run integration tests
+description: Run integration tests
+
+inputs:
+  ci-repo:
+    description: The CI registry repo URL
+    required: true
+  image:
+    description: Spark image name to test (ex. spark)
+    required: true
+  image-tag:
+    description: Spark image tag to test (ex. latest)
+    required: true
+  scala_version:
+    description: Scala version
+    required: true
+  git_checkout_tag_dir:
+    description: Git checkout tag directory
+    required: true
+
+runs:
+  using: composite
+  # https://github.com/apache/spark/tree/master/resource-managers/kubernetes/integration-tests
+  # https://github.com/apache/spark/blob/master/.github/workflows/build_and_test.yml
+  # https://github.com/apache/spark/pull/35830
+  steps:
+    - name: Load image ${{ inputs.image }} into Kind and setup Spark RBACs
+      run: |
+        kubectl create clusterrolebinding serviceaccounts-cluster-admin \
+          --clusterrole=cluster-admin \
+          --group=system:serviceaccounts || true
+        # Pull and load the image into all Kind nodes (currently a single node) for fast executor startup
+        docker pull ${{ inputs.ci-repo }}/${{ inputs.image }}:${{ inputs.image-tag }}
+        kind load docker-image ${{ inputs.ci-repo }}/${{ inputs.image }}:${{ inputs.image-tag }} --name kind-ci-${{ github.job }}
+      shell: bash
+
+    - name: Change Scala version to ${{ inputs.scala_version }}
+      run: |
+        ./dev/change-scala-version.sh ${{ inputs.scala_version }}
+        echo "SCALA_PROFILE=scala-${{ inputs.scala_version }}" >> $GITHUB_ENV
+
+      working-directory: ${{ inputs.git_checkout_tag_dir }}
+      shell: bash
+
+    - name: Run base integration tests (${{ inputs.image }})
+      if: inputs.image == 'spark-base' || inputs.image == 'spark'
+      run: |
+        build/sbt -P${{ env.SCALA_PROFILE }} -Pkubernetes -Pkubernetes-integration-tests \
+          -Dspark.kubernetes.test.driverRequestCores=0.5 -Dspark.kubernetes.test.executorRequestCores=0.2 \
+          -Dspark.kubernetes.test.deployMode=cloud \
+          -Dspark.kubernetes.test.imageRepo=${{ inputs.ci-repo }} -Dspark.kubernetes.test.imageTag=${{ inputs.image-tag }} \
+          -Dspark.kubernetes.test.jvmImage=${{ inputs.image }} \
+          -Dspark.kubernetes.test.pythonImage=${{ inputs.image }} \
+          -Dspark.kubernetes.test.rImage=${{ inputs.image }} \
+          'kubernetes-integration-tests/testOnly -- -z "Run SparkPi"'
+
+      working-directory: ${{ inputs.git_checkout_tag_dir }}
+      shell: bash
+
+    - name: Run spark-py integration tests (${{ inputs.image }})
+      if: inputs.image == 'spark-py'
+      run: |
+        build/sbt -P${{ env.SCALA_PROFILE }} -Pkubernetes -Pkubernetes-integration-tests \
+          -Dspark.kubernetes.test.driverRequestCores=0.5 -Dspark.kubernetes.test.executorRequestCores=0.2 \
+          -Dspark.kubernetes.test.deployMode=cloud \
+          -Dspark.kubernetes.test.imageRepo=${{ inputs.ci-repo }} -Dspark.kubernetes.test.imageTag=${{ inputs.image-tag }} \
+          -Dspark.kubernetes.test.jvmImage=${{ inputs.image }} \
+          -Dspark.kubernetes.test.pythonImage=${{ inputs.image }} \
+          -Dspark.kubernetes.test.rImage=${{ inputs.image }} \
+          'kubernetes-integration-tests/testOnly -- -z "Run PySpark"'
+
+      working-directory: ${{ inputs.git_checkout_tag_dir }}
+      shell: bash
+
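+    # Note: '-z "<substring>"' above is ScalaTest's test-name substring filter: only tests
+    # whose name contains "Run SparkPi" / "Run PySpark" are executed.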
+    - name: Run spark-r integration tests (${{ inputs.image }})
+      if: inputs.image == 'spark-r'
+      run: |
+        build/sbt -P${{ env.SCALA_PROFILE }} -Pkubernetes -Pkubernetes-integration-tests \
+          -Dspark.kubernetes.test.driverRequestCores=0.5 -Dspark.kubernetes.test.executorRequestCores=0.2 \
+          -Dspark.kubernetes.test.deployMode=cloud \
+          -Dspark.kubernetes.test.imageRepo=${{ inputs.ci-repo }} -Dspark.kubernetes.test.imageTag=${{ inputs.image-tag }} \
+          -Dspark.kubernetes.test.jvmImage=${{ inputs.image }} \
+          -Dspark.kubernetes.test.pythonImage=${{ inputs.image }} \
+          -Dspark.kubernetes.test.rImage=${{ inputs.image }} \
+          -Psparkr -Dtest.include.tags=r \
+          'kubernetes-integration-tests/testOnly'
+
+      working-directory: ${{ inputs.git_checkout_tag_dir }}
+      shell: bash
+
+    # - name: Run All integration tests (${{ inputs.image }})
+    #   if: inputs.image == 'spark-py-r'
+    #   run: |
+    #     build/sbt -P${{ env.SCALA_PROFILE }} -Pkubernetes -Pkubernetes-integration-tests \
+    #       -Dspark.kubernetes.test.driverRequestCores=0.5 -Dspark.kubernetes.test.executorRequestCores=0.2 \
+    #       -Dspark.kubernetes.test.deployMode=cloud \
+    #       -Dspark.kubernetes.test.imageRepo=${{ inputs.ci-repo }} -Dspark.kubernetes.test.imageTag=${{ inputs.image-tag }} \
+    #       -Dspark.kubernetes.test.jvmImage=${{ inputs.image }} \
+    #       -Dspark.kubernetes.test.pythonImage=${{ inputs.image }} \
+    #       -Dspark.kubernetes.test.rImage=${{ inputs.image }} \
+    #       'kubernetes-integration-tests/testOnly'
+
+    #   working-directory: ${{ inputs.git_checkout_tag_dir }}
+    #   shell: bash
+
diff --git a/.github/actions/spark-version-matrix/action.yml b/.github/actions/spark-version-matrix/action.yml
new file mode 100644
index 0000000..782b21c
--- /dev/null
+++ b/.github/actions/spark-version-matrix/action.yml
@@ -0,0 +1,59 @@
+#
+# Copyright 2024 tosit.io
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+name: Build Spark versions matrix
+description: Build the Spark versions matrix from the 'use_matrix' input file (ex. '.build/ci-versions.yml') and the '.build/reference-versions.yml' file
+
+inputs:
+  use_matrix:
+    description: The matrix version file to use
+    required: true
+
+outputs:
+  matrix:
+    description: "Spark versions matrix"
+    value: ${{ steps.generate-matrix.outputs.matrix }}
+
+runs:
+  using: composite
+  steps:
+    - name: Generate Matrix
+      id: generate-matrix
+      run: |
+
+        INPUT_MATRIX=$(yq -oj ${{ inputs.use_matrix }} | jq '.versions | .[] |
+          {python_version: .python_version,
+          hadoop_version: .hadoop_version}
+          + (.spark_version[] | {spark_version: .})
+          + (.scala_version[] | {scala_version: .})
+          + (.java_version[] | {java_version: .})' | jq -c --slurp '.')
+        REF_MATRIX=$(yq -oj .build/reference-versions.yml | jq '.versions | .[] |
+          {python_version: .python_version,
+          hadoop_version: .hadoop_version}
+          + (.spark_version[] | {spark_version: .})
+          + (.scala_version[] | {scala_version: .})
+          + (.java_version[] | {java_version: .})' | jq -c --slurp '.')
+
+        ### Intersection between the versions matrix and the reference versions matrix
+        ### When the intersection is empty, the jobs are skipped!
+        MATRIX=$(jq --argjson IN ${INPUT_MATRIX} --argjson REF ${REF_MATRIX} -cn '$IN - ($IN - $REF)')
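+        # A minimal sketch of the jq set-intersection idiom above (assumed toy values):
+        #   IN='[{"v":1},{"v":9}]' REF='[{"v":1}]'
+        #   jq -cn --argjson IN "$IN" --argjson REF "$REF" '$IN - ($IN - $REF)'   # -> [{"v":1}]
+        # i.e. input-matrix entries that are absent from the reference matrix are dropped.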
+        LENGTH=$(echo ${MATRIX} | jq '. | length')
+        echo "${MATRIX}"
+        echo "Found ${LENGTH} compatible version combinations"
+        echo "matrix=${MATRIX}" >> $GITHUB_OUTPUT
+
+      shell: bash
diff --git a/.github/workflows/build-image-template.yml b/.github/workflows/build-image-template.yml
new file mode 100644
index 0000000..3615a1f
--- /dev/null
+++ b/.github/workflows/build-image-template.yml
@@ -0,0 +1,219 @@
+
+#
+# Copyright 2024 tosit.io
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+name: Spark build single image template
+
+on:
+  workflow_call:
+    inputs:
+      image:
+        description: The spark image name (ex. spark-base, spark, spark-py, spark-r, etc)
+        required: true
+        type: string
+      spark_version:
+        description: Spark version
+        required: true
+        type: string
+      scala_version:
+        description: Scala version
+        required: true
+        type: string
+      java_version:
+        description: Java version
+        required: true
+        type: string
+      hadoop_version:
+        description: Hadoop version
+        required: true
+        type: string
+      python_version:
+        description: Python version
+        required: true
+        type: string
+      publish_to_registry:
+        description: Whether to push to the registry
+        required: false
+        type: string
+        default: "false"
+      registry:
+        description: The container registry
+        required: false
+        type: string
+      ci_registry:
+        description: "The registry used to push ci images"
+        required: false
+        type: string
+        default: "ghcr.io"
+      git_latest_release_tag:
+        description: The latest remote release tag
+        required: false
+        type: string
+        default: ""
+      runs-on:
+        description: GitHub Actions Runner image
+        required: true
+        type: string
+
+jobs:
+
+  build-test-push:
+    name: ${{ inputs.image }} (scala-${{ inputs.scala_version }}, java-${{ inputs.java_version }}, python-${{ inputs.python_version }}, hadoop-${{ inputs.hadoop_version }})
+    runs-on: ${{ inputs.runs-on }}
+    steps:
+
+      ### The publish and periodic rebuilds are based on the latest stable GitHub release tag
+      - name: Checkout latest GitHub Release tag (${{ inputs.git_latest_release_tag }}) ⚡️
+        if: inputs.publish_to_registry == 'true'
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.git_latest_release_tag }}
+
+      ### The CI is based on the main branch
+      - name: Checkout Repo ⚡️
+        if: inputs.publish_to_registry == 'false'
+        uses: actions/checkout@v4
+
+      ### Common steps between CI and Publish
+      - name: Free up disk space 📦
+        uses: ./.github/actions/free-disk-space
+
+      - name: Set up QEMU and Docker Buildx 📦
+        uses: ./.github/actions/setup-buildx
+
+      - name: Set up CI and official registries 📦
+        id: registry-repos
+        run: |
+          # ${GITHUB_REPOSITORY_OWNER@L} (bash 5.1+) lowercases the owner; registry repo paths must be lowercase
+          echo "repo_owner=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_OUTPUT
+          echo "ci_repo=${{ inputs.ci_registry }}/${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_OUTPUT
+          echo "publish_repo=${{ inputs.registry }}/${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_OUTPUT
+        shell: bash
+
+      - name: Generate image tags 📦
+        id: image-tags
+        uses: ./.github/actions/spark-image-tag
+        with:
+          image: ${{ inputs.image }}
+          spark_version: ${{ inputs.spark_version }}
+          scala_version: ${{ inputs.scala_version }}
+          java_version: ${{
inputs.java_version }} + python_version: ${{ inputs.python_version}} + ci_repo: ${{ steps.registry-repos.outputs.ci_repo }} + publish_repo: ${{ steps.registry-repos.outputs.publish_repo }} + publish_to_registry: ${{ inputs.publish_to_registry }} + git_tag_name: ${{ inputs.git_latest_release_tag }} + + - name: Login to the CI registry 🔐 + if: inputs.publish_to_registry == 'false' + uses: docker/login-action@v3 + with: + registry: ${{ inputs.ci_registry }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push to ci registry + if: inputs.publish_to_registry == 'false' + uses: docker/build-push-action@v5 + with: + context: ${{ inputs.image }} + platforms: linux/amd64,linux/arm64 + push: true + build-args: | + SPARK_VERSION=${{ inputs.spark_version}} + SCALA_VERSION=${{ inputs.scala_version }} + JAVA_VERSION=${{ inputs.java_version }} + PYTHON_VERSION=${{ inputs.python_version }} + HADOOP_VERSION=${{ inputs.hadoop_version }} + BASE_IMAGE=${{ steps.image-tags.outputs.parent_image }} + tags: | + ${{ steps.registry-repos.outputs.ci_repo }}/${{ inputs.image }}:${{ steps.image-tags.outputs.latest_tag }} + labels: | + org.opencontainers.image.title="${{ inputs.image }}" + org.opencontainers.image.version="${{ inputs.spark_version}}" + org.opencontainers.image.description="Spark image" + org.opencontainers.image.base.name="${{ steps.image-tags.outputs.parent_image }}" + org.opencontainers.image.source="https://github.com/${{ github.repository }}" + org.opencontainers.image.licenses="Apache-2.0" + + ### CI Steps + # https://github.com/nektos/act/issues/678 + # https://github.com/apache/spark/pull/35830 + - name: Checkout integration tests tag v${{ inputs.spark_version }} (${{ inputs.spark_version}} > 3.3.0) ⚡️ + if: inputs.publish_to_registry == 'false' && !(startsWith(inputs.spark_version, '3.1') || startsWith(inputs.spark_version, '3.2') || startsWith(inputs.spark_version, '3.3.0')) + id: git-checkout-tag + run: | + CHECKOUT_TAG_DIR="$(mktemp -d)/spark" + git clone https://github.com/apache/spark.git ${CHECKOUT_TAG_DIR} + cd ${CHECKOUT_TAG_DIR} + git checkout v${{ inputs.spark_version }} + echo "checkout_directory=${CHECKOUT_TAG_DIR}" >> $GITHUB_OUTPUT + shell: bash + + - name: Prepare integration tests env (${{ inputs.spark_version}} > 3.3.0) 📦 + if: inputs.publish_to_registry == 'false' && !(startsWith(inputs.spark_version, '3.1') || startsWith(inputs.spark_version, '3.2') || startsWith(inputs.spark_version, '3.3.0')) + uses: ./.github/actions/spark-tests-prepare + with: + spark_version: ${{ inputs.spark_version}} + scala_version: ${{ inputs.scala_version }} + java_version: ${{ inputs.java_version }} + + - name: Set up Kind integration tests cluster (${{ inputs.spark_version}} > 3.3.0) 📦 + if: inputs.publish_to_registry == 'false' && !(startsWith(inputs.spark_version, '3.1') || startsWith(inputs.spark_version, '3.2') || startsWith(inputs.spark_version, '3.3.0')) + uses: ./.github/actions/setup-kind + + - name: Run integration tests (${{ inputs.spark_version}} > 3.3.0) ✅ + if: inputs.publish_to_registry == 'false' && !(startsWith(inputs.spark_version, '3.1') || startsWith(inputs.spark_version, '3.2') || startsWith(inputs.spark_version, '3.3.0')) + uses: ./.github/actions/spark-tests-run + with: + ci-repo: ${{ steps.registry-repos.outputs.ci_repo }} + image: ${{ inputs.image }} + image-tag: ${{ steps.image-tags.outputs.latest_tag }} + scala_version: ${{ inputs.scala_version }} + git_checkout_tag_dir: ${{ steps.git-checkout-tag.outputs.checkout_directory }} 
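+      # Example of the CI flow above (assumed values): a pull-request run on branch 'feat/x'
+      # builds, pushes and tests '<ci_registry>/<owner>/spark:spark-3.5.1-scala-2.13-java-17-feat-x';
+      # the publish steps below push the full tag list generated from '.build/images.yml' instead.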
+
+      ### Publish steps
+      ### The publish and periodic rebuilds are based on the latest stable GitHub release tag
+      - name: Login to the official registry 🔐
+        if: inputs.publish_to_registry == 'true'
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ inputs.registry }}
+          username: ${{ secrets.REGISTRY_USERNAME }}
+          password: ${{ secrets.REGISTRY_ROBOT_TOKEN }}
+
+      - name: Build and push to the official registry 📤
+        if: inputs.publish_to_registry == 'true'
+        uses: docker/build-push-action@v5
+        with:
+          context: ${{ inputs.image }}
+          platforms: linux/amd64,linux/arm64
+          push: true
+          build-args: |
+            SPARK_VERSION=${{ inputs.spark_version }}
+            SCALA_VERSION=${{ inputs.scala_version }}
+            JAVA_VERSION=${{ inputs.java_version }}
+            PYTHON_VERSION=${{ inputs.python_version }}
+            HADOOP_VERSION=${{ inputs.hadoop_version }}
+            BASE_IMAGE=${{ steps.image-tags.outputs.parent_image }}
+          tags: ${{ steps.image-tags.outputs.publish_tags }}
+          labels: |
+            org.opencontainers.image.title="${{ inputs.image }}"
+            org.opencontainers.image.version="${{ inputs.spark_version }}"
+            org.opencontainers.image.description="Spark image"
+            org.opencontainers.image.base.name="${{ steps.image-tags.outputs.parent_image }}"
+            org.opencontainers.image.source="https://github.com/${{ github.repository }}"
+            org.opencontainers.image.licenses="Apache-2.0"
+
diff --git a/.github/workflows/build-images-template.yml b/.github/workflows/build-images-template.yml
new file mode 100644
index 0000000..5e5593a
--- /dev/null
+++ b/.github/workflows/build-images-template.yml
@@ -0,0 +1,125 @@
+#
+# Copyright 2024 tosit.io
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+name: Spark build multiple images template
+
+on:
+  workflow_call:
+    inputs:
+      spark_version:
+        description: Spark version
+        required: true
+        type: string
+      scala_version:
+        description: Scala version
+        required: true
+        type: string
+      java_version:
+        description: Java version
+        required: true
+        type: string
+      hadoop_version:
+        description: Hadoop version
+        required: true
+        type: string
+      python_version:
+        description: Python version
+        required: true
+        type: string
+      registry:
+        description: The container registry
+        required: false
+        type: string
+      publish_to_registry:
+        description: Whether to push to the registry
+        required: false
+        type: string
+        default: "false"
+      git_latest_release_tag:
+        description: The latest remote release tag
+        required: false
+        type: string
+        default: ""
+      runs-on:
+        description: GitHub Actions Runner image
+        required: false
+        type: string
+        default: "ubuntu-latest"
+
+jobs:
+
+  spark-base:
+    uses: ./.github/workflows/build-image-template.yml
+    with:
+      image: spark-base
+      python_version: ${{ inputs.python_version }}
+      spark_version: ${{ inputs.spark_version }}
+      java_version: ${{ inputs.java_version }}
+      scala_version: ${{ inputs.scala_version }}
+      hadoop_version: ${{ inputs.hadoop_version }}
+      registry: ${{ inputs.registry }}
+      publish_to_registry: ${{ inputs.publish_to_registry }}
+      git_latest_release_tag: ${{ inputs.git_latest_release_tag }}
+      runs-on: ${{ inputs.runs-on }}
+    secrets: inherit
+
+  spark:
+    uses: ./.github/workflows/build-image-template.yml
+    needs: [spark-base]
+    with:
+      image: spark
+      python_version: ${{ inputs.python_version }}
+      spark_version: ${{ inputs.spark_version }}
+      java_version: ${{ inputs.java_version }}
+      scala_version: ${{ inputs.scala_version }}
+      hadoop_version: ${{ inputs.hadoop_version }}
+      registry: ${{ inputs.registry }}
+      publish_to_registry: ${{ inputs.publish_to_registry }}
+      git_latest_release_tag: ${{ inputs.git_latest_release_tag }}
+      runs-on: ${{ inputs.runs-on }}
+    secrets: inherit
+
+  spark-py:
+    uses: ./.github/workflows/build-image-template.yml
+    needs: [spark]
+    with:
+      image: spark-py
+      python_version: ${{ inputs.python_version }}
+      spark_version: ${{ inputs.spark_version }}
+      java_version: ${{ inputs.java_version }}
+      scala_version: ${{ inputs.scala_version }}
+      hadoop_version: ${{ inputs.hadoop_version }}
+      registry: ${{ inputs.registry }}
+      publish_to_registry: ${{ inputs.publish_to_registry }}
+      git_latest_release_tag: ${{ inputs.git_latest_release_tag }}
+      runs-on: ${{ inputs.runs-on }}
+    secrets: inherit
+
+  spark-r:
+    uses: ./.github/workflows/build-image-template.yml
+    needs: [spark]
+    with:
+      image: spark-r
+      python_version: ${{ inputs.python_version }}
+      spark_version: ${{ inputs.spark_version }}
+      java_version: ${{ inputs.java_version }}
+      scala_version: ${{ inputs.scala_version }}
+      hadoop_version: ${{ inputs.hadoop_version }}
+      registry: ${{ inputs.registry }}
+      publish_to_registry: ${{ inputs.publish_to_registry }}
+      git_latest_release_tag: ${{ inputs.git_latest_release_tag }}
+      runs-on: ${{ inputs.runs-on }}
+    secrets: inherit
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..0da6ca0
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,87 @@
+#
+# Copyright 2024 tosit.io
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: ci + +on: + pull_request: + branches: + - main + paths: + - ".github/workflows/**" + - ".github/actions/**" + - ".build/**" + + - "spark/**" + - "spark-*/**" + + - "!README.md" + + push: + branches: + - main + paths: + - ".github/workflows/**" + - ".github/actions/**" + - ".build/**" + + - "spark/**" + - "spark-*/**" + + - "!README.md" + + workflow_dispatch: + +# https://docs.github.com/en/actions/using-jobs/using-concurrency +concurrency: + # Only cancel in-progress jobs or runs for the current workflow - matches against branch & tags + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + packages: write + +jobs: + + get-ci-versions: + runs-on: "ubuntu-latest" + outputs: + matrix: ${{ steps.ci-versions.outputs.matrix }} + steps: + - name: Checkout Repo ⚡️ + uses: actions/checkout@v4 + + - name: Get CI versions matrix 📥 + id: ci-versions + uses: ./.github/actions/spark-version-matrix + with: + use_matrix: ".build/ci-versions.yml" + + spark-ci: + name: spark-ci (spark-${{ matrix.version.spark_version }}) + needs: [get-ci-versions] + strategy: + fail-fast: false + matrix: + version: ${{ fromJson(needs.get-ci-versions.outputs.matrix) }} + uses: ./.github/workflows/build-images-template.yml + with: + python_version: ${{ matrix.version.python_version }} + spark_version: ${{ matrix.version.spark_version }} + java_version: ${{ matrix.version.java_version }} + scala_version: ${{ matrix.version.scala_version }} + hadoop_version: ${{ matrix.version.hadoop_version }} + publish_to_registry: "false" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..4d8f7d4 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,106 @@ +# +# Copyright 2024 tosit.io +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+name: publish
+
+on:
+  ### Periodically rebuild all the images to fix OS security vulnerabilities
+  schedule:
+    # https://crontab.cronhub.io/
+    # At 05:00 AM, only on Tuesday
+    - cron: "0 5 * * 2"
+  # In order to trigger the release event, the release should be created manually (or with a user token, e.g. PR approval/merge)
+  ### https://github.com/orgs/community/discussions/25281
+  ### Instead of using the event, we call this workflow from the release-please workflow (more secure)
+  #release:
+  #  types: [published]
+
+  workflow_dispatch:
+
+# https://docs.github.com/en/actions/using-jobs/using-concurrency
+concurrency:
+  # Only cancel in-progress jobs or runs for the current workflow - matches against branch & tags
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  packages: write
+
+jobs:
+
+  latest-github-release:
+    if: github.repository_owner == 'OKDP'
+    runs-on: "ubuntu-latest"
+    outputs:
+      tag_name: ${{ steps.git-release-tag.outputs.tag_name }}
+    steps:
+      - name: Checkout Repo ⚡️
+        uses: actions/checkout@v4
+
+      - name: Get latest GitHub Release tag name 📥
+        id: git-release-tag
+        uses: InsonusK/get-latest-release@v1.0.1
+        with:
+          myToken: ${{ github.token }}
+          exclude_types: "draft"
+          view_top: 1
+
+      - name: Info - Found latest release tag
+        run: |
+          echo "id: ${{ steps.git-release-tag.outputs.id }}"
+          echo "name: ${{ steps.git-release-tag.outputs.name }}"
+          echo "tag_name: ${{ steps.git-release-tag.outputs.tag_name }}"
+          echo "created_at: ${{ steps.git-release-tag.outputs.created_at }}"
+          echo "draft: ${{ steps.git-release-tag.outputs.draft }}"
+          echo "prerelease: ${{ steps.git-release-tag.outputs.prerelease }}"
+        shell: bash
+
+  get-release-versions:
+    if: github.repository_owner == 'OKDP'
+    runs-on: "ubuntu-latest"
+    outputs:
+      matrix: ${{ steps.release-versions.outputs.matrix }}
+    steps:
+      - name: Checkout Repo ⚡️
+        uses: actions/checkout@v4
+
+      - name: Get release versions matrix 📥
+        id: release-versions
+        uses: ./.github/actions/spark-version-matrix
+        with:
+          use_matrix: ".build/release-versions.yml"
+
+  spark-publish:
+    if: github.repository_owner == 'OKDP' && needs.latest-github-release.outputs.tag_name != ''
+    name: spark-publish (${{ needs.latest-github-release.outputs.tag_name }}/spark-${{ matrix.version.spark_version }})
+    needs: [latest-github-release, get-release-versions]
+    strategy:
+      fail-fast: false
+      matrix:
+        version: ${{ fromJson(needs.get-release-versions.outputs.matrix) }}
+    uses: ./.github/workflows/build-images-template.yml
+    with:
+      python_version: ${{ matrix.version.python_version }}
+      spark_version: ${{ matrix.version.spark_version }}
+      java_version: ${{ matrix.version.java_version }}
+      scala_version: ${{ matrix.version.scala_version }}
+      hadoop_version: ${{ matrix.version.hadoop_version }}
+      registry: ${{ vars.REGISTRY || 'quay.io' }}
+      publish_to_registry: "true"
+      git_latest_release_tag: ${{ needs.latest-github-release.outputs.tag_name }}
+    secrets: inherit
diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml
new file mode 100644
index 0000000..f73f3c2
--- /dev/null
+++ b/.github/workflows/release-please.yml
@@ -0,0 +1,70 @@
+#
+# Copyright 2024 tosit.io
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: release-please + +on: + pull_request: + types: + - closed + branches: + - main + +permissions: + contents: write + pull-requests: write + +# https://docs.github.com/en/actions/using-jobs/using-concurrency +concurrency: + # Only cancel in-progress jobs or runs for the current workflow - matches against branch & tags + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash + +jobs: + release-please: + runs-on: ubuntu-latest + outputs: + release_created: ${{ steps.release-please.outputs.release_created }} + tag_name: ${{ steps.release-please.outputs.tag_name }} + # Skip the release process in the fork + # The pull request should come from the same repo (github_token from the fork does not have write permissions) + if: github.repository_owner == 'OKDP' && github.event.pull_request.merged == true && github.event.pull_request.head.repo.full_name == github.repository + steps: + - uses: google-github-actions/release-please-action@v4 + id: release-please + + publish: + runs-on: ubuntu-latest + needs: [release-please] + if: needs.release-please.outputs.release_created == 'true' + permissions: + contents: write + actions: write + packages: write + steps: + - name: "Publish images to official registry" + env: + GH_REPO: ${{ github.repository }} + GH_TOKEN: ${{ github.token }} + GH_DEBUG: api + run: | + gh workflow run publish.yml + shell: bash + \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..94d918d --- /dev/null +++ b/.gitignore @@ -0,0 +1,32 @@ +### IntelliJ IDEA ### +.idea +*.iml + +### Eclipse ### +.apt_generated +.classpath +.factorypath +.project +.settings +.springBeans +.sts4-cache +bin/ +!**/src/main/**/bin/ +!**/src/test/**/bin/ + +### NetBeans ### +/nbproject/private/ +/nbbuild/ +/dist/ +/nbdist/ +/.nb-gradle/ + + +### Mac OS ### +.DS_Store + +### vscode ### +.vscode/ + +# Other +tmp/ \ No newline at end of file diff --git a/.release-please-manifest.json b/.release-please-manifest.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/.release-please-manifest.json @@ -0,0 +1 @@ +{} diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c16bed3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2024 tosit.io + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/README.md b/README.md
index b9d63e1..8b01d7e 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,63 @@
-# spark-images
-Collection of Spark docker images for OKDP
+[![ci](https://github.com/okdp/spark-images/actions/workflows/ci.yml/badge.svg)](https://github.com/okdp/spark-images/actions/workflows/ci.yml)
+[![Release](https://img.shields.io/github/v/release/okdp/spark-images)](https://github.com/okdp/spark-images/releases/latest)
+[![License Apache2](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0)
+
+
+Collection of [Apache Spark](https://spark.apache.org/) docker images for [OKDP Platform](https://okdp.io/).
+
+Currently, the images are built from the [Apache Spark project distribution](https://archive.apache.org/dist/spark); this may evolve to building them directly from the [source code](https://github.com/apache/spark).
+
+The image relationship is described by the following diagram:
+
+<p align="center">
+  <img src="docs/images/spark-images.drawio.svg"/>
+</p>
+
+
+| Image          | Description |
+|:---------------|-------------|
+| `JRE`          | The JRE LTS base image supported by Apache Spark, depending on the Spark version: Java 11/17/21. Please check the [reference versions](.build/reference-versions.yml) or the [Apache Spark website](https://spark.apache.org/docs/latest/) for more information. |
+| `spark-base`   | The Apache Spark base image with the official Spark binaries (scala/java), without OKDP extensions. |
+| `spark`        | The Apache Spark image with the official Spark binaries (scala/java) and OKDP extensions. |
+| `spark-py`     | The Apache Spark image with the official Spark binaries (scala/java), OKDP extensions and Python support. |
+| `spark-r`      | The Apache Spark image with the official Spark binaries (scala/java), OKDP extensions and R support. |
+
+# Tagging
+
+The project builds the images with long-format tags. Each tag encodes a combination of compatible component versions.
+
+There are multiple tag levels; the format to use depends on the stability and reproducibility guarantees you need.
+
+The images are pushed to the [quay.io/okdp](https://quay.io/organization/okdp) repository with the following [tags](.build/images.yml):
+
+| Images              | Tags |
+|:--------------------|------|
+| spark-base, spark   | `spark-<spark_version>-scala-<scala_version>-java-<java_version>`<br> `spark-<spark_version>-scala-<scala_version>-java-<java_version>-<build_date>`<br> `spark-<spark_version>-scala-<scala_version>-java-<java_version>-<github_release_version>`<br> `spark-<spark_version>-scala-<scala_version>-java-<java_version>-<build_date>-<github_release_version>` |
+| spark-py            | `spark-<spark_version>-python-<python_version>-scala-<scala_version>-java-<java_version>`<br> `spark-<spark_version>-python-<python_version>-scala-<scala_version>-java-<java_version>-<build_date>`<br> `spark-<spark_version>-python-<python_version>-scala-<scala_version>-java-<java_version>-<github_release_version>`<br> `spark-<spark_version>-python-<python_version>-scala-<scala_version>-java-<java_version>-<build_date>-<github_release_version>` |
+| spark-r             | `spark-<spark_version>-r-<r_version>-scala-<scala_version>-java-<java_version>`<br> `spark-<spark_version>-r-<r_version>-scala-<scala_version>-java-<java_version>-<build_date>`<br> `spark-<spark_version>-r-<r_version>-scala-<scala_version>-java-<java_version>-<github_release_version>`<br> `spark-<spark_version>-r-<r_version>-scala-<scala_version>-java-<java_version>-<build_date>-<github_release_version>` |
+
+> [!NOTE]
+> 1. `<github_release_version>` corresponds to the Github [release version](https://github.com/okdp/spark-images/releases) or [git tag](https://github.com/okdp/spark-images/tags) without the leading `v`.
+> Ex.: 1.0.0
+>
+> 2. `<build_date>` corresponds to the image build date in `YYYY-MM-DD` format. The latest release tag is rebuilt every week to keep the OS image up to date with the latest security updates.
+>
+> You may need to switch to the latest release version if you are using a long-form tag with a `<github_release_version>`. Please check the [changelog](https://github.com/okdp/spark-images/releases) to see the notable impacts.
+>
+> An example of a `spark-py` image with a long-form tag including the compatible `spark/java/scala/python` versions, a `<build_date>` and a `<github_release_version>` is:
+>
+> `quay.io/okdp/spark-py:spark-3.5.1-python-3.11-scala-2.13-java-17-2024-04-04-1.0.0`.
+>
+> The corresponding changelog is [releases/tag/v1.0.0](https://github.com/okdp/spark-images/releases/tag/v1.0.0).
+>
+> 3. You can also use the latest tag, without `<build_date>` and `<github_release_version>`, which is always up to date with the latest security updates.
+>
+> An example of a `spark-py` image with the latest tag is: `quay.io/okdp/spark-py:spark-3.5.1-python-3.11-scala-2.13-java-17`
+>
+
+# Alternatives
+
+- [Official images](https://github.com/apache/spark-docker)
+
diff --git a/docs/images/spark-images.drawio.svg b/docs/images/spark-images.drawio.svg
new file mode 100644
index 0000000..f284035
--- /dev/null
+++ b/docs/images/spark-images.drawio.svg
@@ -0,0 +1,156 @@
+[156 lines of drawio-generated SVG markup; the diagram boxes read: "eclipse-temurin:jre (LTS)" → "spark-base" → "spark" → "spark-py" / "spark-r", with fallback text "Text is not SVG - cannot display"]
\ No newline at end of file diff --git a/release-please-config.json b/release-please-config.json new file mode 100644 index 0000000..c7275fa --- /dev/null +++ b/release-please-config.json @@ -0,0 +1,19 @@ +{ + "extra-files": [ + "README.md" + ], + "packages": { + ".": { + "changelog-path": "CHANGELOG.md", + "release-type": "simple", + "changelog-type": "default", + "bump-minor-pre-major": false, + "bump-patch-for-minor-pre-major": false, + "draft": false, + "prerelease": false, + "skip-snapshot": false + } + }, + "$schema": "https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json" +} + diff --git a/spark-base/Dockerfile b/spark-base/Dockerfile new file mode 100644 index 0000000..35e96e6 --- /dev/null +++ b/spark-base/Dockerfile @@ -0,0 +1,87 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +ARG JAVA_VERSION=11 +ARG BASE_IMAGE=eclipse-temurin:${JAVA_VERSION}-jre-jammy +FROM $BASE_IMAGE + +ARG spark_uid=185 + +ARG SPARK_VERSION=3.2.1 +ARG HADOOP_VERSION=3.2 +ARG SCALA_VERSION=2.12 +ARG SPARK_DIST_DOWNLOAD_URL=https://archive.apache.org/dist/spark + +ENV SPARK_HOME /opt/spark +ENV SPARK_CONF_DIR ${SPARK_HOME}/conf + +ENV SPARK_VERSION ${SPARK_VERSION} +ENV HADOOP_VERSION ${HADOOP_VERSION} +ENV SCALA_VERSION ${SCALA_VERSION} + +## Add missing gpg keys from https://downloads.apache.org/spark/KEYS +COPY MISSING-GPG-KEYS.yml . 
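+# The key IDs listed in MISSING-GPG-KEYS.yml are imported below together with the
+# official KEYS file to verify the Spark tarball signature; the YAML file is
+# deleted once verification succeeds.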
+
+RUN groupadd --system --gid=${spark_uid} spark && \
+    useradd --system --uid=${spark_uid} --gid=spark spark
+
+RUN set -ex; \
+    apt-get update; \
+    ln -s /lib /lib64; \
+    apt-get install -y --no-install-recommends gnupg2 bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu libnss-wrapper curl; \
+    mkdir -p ${SPARK_HOME}; \
+    mkdir -p ${SPARK_HOME}/work-dir; \
+    chmod g+w ${SPARK_HOME}/work-dir; \
+    chown -R spark:spark ${SPARK_HOME}; \
+    rm /bin/sh; \
+    ln -sv /bin/bash /bin/sh; \
+    echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su; \
+    chgrp root /etc/passwd && chmod ug+rw /etc/passwd; \
+    rm -rf /var/cache/apt/* && rm -rf /var/lib/apt/lists/*
+
+RUN set -ex; \
+    export WORK_DIR="$(mktemp -d)"; \
+    DIST=spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}; \
+    if [ "${SCALA_VERSION}" = "2.13" ]; then \
+      DIST+=-scala${SCALA_VERSION}; \
+    fi; \
+    curl --retry 3 --retry-all-errors -k ${SPARK_DIST_DOWNLOAD_URL}/${DIST}.tgz -o ${WORK_DIR}/spark.tgz; \
+    curl --retry 3 --retry-all-errors -k ${SPARK_DIST_DOWNLOAD_URL}/${DIST}.tgz.asc -o ${WORK_DIR}/spark.tgz.asc; \
+    curl --retry 3 --retry-all-errors -k https://downloads.apache.org/spark/KEYS -o ${WORK_DIR}/KEYS; \
+    MISSING_KEYS=($(cat MISSING-GPG-KEYS.yml | grep "keys:" -A300 | awk -F: '{ print $2 }' | tr -d '\n' | tr -d \"\" )); \
+    export GNUPGHOME="$(mktemp -d)"; \
+    gpg --batch --import ${WORK_DIR}/KEYS; \
+    gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys ${MISSING_KEYS} || true; \
+    gpg --batch --keyserver hkps://keyserver.ubuntu.com --recv-keys ${MISSING_KEYS} || true; \
+    gpg --batch --verify ${WORK_DIR}/spark.tgz.asc ${WORK_DIR}/spark.tgz; \
+    tar --strip-components=1 -zxvf ${WORK_DIR}/spark.tgz -C ${SPARK_HOME}/; \
+    chown -R spark:spark ${SPARK_HOME}/; \
+    mv ${SPARK_HOME}/kubernetes/dockerfiles/spark/decom.sh /opt/; \
+    mv ${SPARK_HOME}/kubernetes/tests ${SPARK_HOME}/; \
+    chmod a+x /opt/decom.sh; \
+    gpgconf --kill all; \
+    rm -rf ${GNUPGHOME} ${WORK_DIR} MISSING-GPG-KEYS.yml; \
+    rm -fr ${SPARK_HOME}/conf ${SPARK_HOME}/yarn ${SPARK_HOME}/kubernetes
+
+COPY entrypoint.sh /opt/entrypoint.sh
+RUN chmod a+x /opt/entrypoint.sh
+
+WORKDIR ${SPARK_HOME}/work-dir
+
+USER spark
+
+ENTRYPOINT [ "/opt/entrypoint.sh" ]
+
diff --git a/spark-base/MISSING-GPG-KEYS.yml b/spark-base/MISSING-GPG-KEYS.yml
new file mode 100644
index 0000000..241021b
--- /dev/null
+++ b/spark-base/MISSING-GPG-KEYS.yml
@@ -0,0 +1,23 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Some GPG keys are missing from the Spark project release KEYS file (https://downloads.apache.org/spark/KEYS)
+## They are added manually here, based on the apache/spark-docker official images repo:
+#### https://github.com/apache/spark-docker/blob/master/tools/template.py
+keys:
+  # issuer "yumwang@apache.org"
+  - "3.3.1": "86727D43E73A415F67A0B1A14E68B3E6CD473653"
diff --git a/spark-base/entrypoint.sh b/spark-base/entrypoint.sh
new file mode 100644
index 0000000..75611a9
--- /dev/null
+++ b/spark-base/entrypoint.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Prevent any errors from being silently ignored
+set -eo pipefail
+
+attempt_setup_fake_passwd_entry() {
+  # Check whether there is a passwd entry for the container UID
+  local myuid; myuid="$(id -u)"
+  # If there is no passwd entry for the container UID, attempt to fake one.
+  # This handles the OpenShift random-UID case; see also:
+  # https://github.com/docker-library/official-images/pull/13089#issuecomment-1534706523
+  # https://github.com/docker-library/postgres/pull/448
+  if ! getent passwd "$myuid" &> /dev/null; then
+    local wrapper
+    for wrapper in {/usr,}/lib{/*,}/libnss_wrapper.so; do
+      if [ -s "$wrapper" ]; then
+        NSS_WRAPPER_PASSWD="$(mktemp)"
+        NSS_WRAPPER_GROUP="$(mktemp)"
+        export LD_PRELOAD="$wrapper" NSS_WRAPPER_PASSWD NSS_WRAPPER_GROUP
+        local mygid; mygid="$(id -g)"
+        printf 'spark:x:%s:%s:%s:%s:/bin/false\n' "$myuid" "$mygid" "${SPARK_USER_NAME:-anonymous uid}" "$SPARK_HOME" > "$NSS_WRAPPER_PASSWD"
+        printf 'spark:x:%s:\n' "$mygid" > "$NSS_WRAPPER_GROUP"
+        break
+      fi
+    done
+  fi
+}
+
+if [ -z "$JAVA_HOME" ]; then
+  JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}')
+fi
+
+SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*"
+for v in "${!SPARK_JAVA_OPT_@}"; do
+    SPARK_EXECUTOR_JAVA_OPTS+=( "${!v}" )
+done
+
+if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
+  SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
+fi
+
+if ! [ -z "${PYSPARK_PYTHON+x}" ]; then
+    export PYSPARK_PYTHON
+fi
+if ! [ -z "${PYSPARK_DRIVER_PYTHON+x}" ]; then
+    export PYSPARK_DRIVER_PYTHON
+fi
+
+# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor.
+# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s.
+if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
+  export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
+fi
+
+if ! [ -z "${HADOOP_CONF_DIR+x}" ]; then
+  SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
+fi
+
+if ! [ -z "${SPARK_CONF_DIR+x}" ]; then
+  SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH";
+elif ! [ -z "${SPARK_HOME+x}" ]; then
+  SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH";
+fi
+
+# SPARK-43540: add current working directory into executor classpath
+SPARK_CLASSPATH="$SPARK_CLASSPATH:$PWD"
+
+# Switch to spark if no USER specified (root by default) otherwise use USER directly
+switch_spark_if_root() {
+  if [ $(id -u) -eq 0 ]; then
+     echo gosu spark
+  fi
+}
+
+# Select the executor backend class: `sort -C -V` succeeds when $1 <= $2,
+# i.e. when the running Spark version is 3.2.4 or older.
+spark_3_2_support(){
+  if ! printf '%s\n%s' "$1" "$2" | sort -C -V
+  then
+    # Spark 3.3.0 and newer
+    echo "org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend --podName $SPARK_EXECUTOR_POD_NAME"
+  else
+    # Spark 3.2.x and older
+    echo "org.apache.spark.executor.CoarseGrainedExecutorBackend"
+  fi
+}
+
+KUBERNETES_EXECUTOR_BACKEND="$(spark_3_2_support $SPARK_VERSION '3.2.4')"
+
+case "$1" in
+  driver)
+    shift 1
+    CMD=(
+      "$SPARK_HOME/bin/spark-submit"
+      --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS"
+      --conf "spark.executorEnv.SPARK_DRIVER_POD_IP=$SPARK_DRIVER_BIND_ADDRESS"
+      --deploy-mode client
+      "$@"
+    )
+    attempt_setup_fake_passwd_entry
+    # Execute the container CMD under tini for better hygiene
+    exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}"
+    ;;
+  executor)
+    shift 1
+    CMD=(
+      ${JAVA_HOME}/bin/java
+      "${SPARK_EXECUTOR_JAVA_OPTS[@]}"
+      -Xms"$SPARK_EXECUTOR_MEMORY"
+      -Xmx"$SPARK_EXECUTOR_MEMORY"
+      -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH"
+      $KUBERNETES_EXECUTOR_BACKEND
+      --driver-url "$SPARK_DRIVER_URL"
+      --executor-id "$SPARK_EXECUTOR_ID"
+      --cores "$SPARK_EXECUTOR_CORES"
+      --app-id "$SPARK_APPLICATION_ID"
+      --hostname "$SPARK_EXECUTOR_POD_IP"
+      --resourceProfileId "$SPARK_RESOURCE_PROFILE_ID"
+    )
+    attempt_setup_fake_passwd_entry
+    # Execute the container CMD under tini for better hygiene
+    exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}"
+    ;;
+
+  *)
+    # Non-spark-on-k8s command provided, proceeding in pass-through mode...
+    exec "$@"
+    ;;
+esac
diff --git a/spark-py/Dockerfile b/spark-py/Dockerfile
new file mode 100644
index 0000000..ae9239f
--- /dev/null
+++ b/spark-py/Dockerfile
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+ARG SPARK_VERSION=3.2.1
+ARG HADOOP_VERSION=3.2
+ARG SCALA_VERSION=2.12
+ARG JAVA_VERSION=11
+
+ARG REGISTRY=quay.io
+ARG REPO=okdp
+ARG BASE_IMAGE=${REGISTRY}/${REPO}/spark:spark-${SPARK_VERSION}-scala-${SCALA_VERSION}-java-${JAVA_VERSION}
+
+FROM $BASE_IMAGE
+
+ARG PYTHON_VERSION=3.11
+
+USER root
+
+COPY requirements.txt .
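+# requirements.txt is empty by default; list PyPI packages there to bake them
+# into the image (they are installed with pip in the RUN step below).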
+
+RUN set -ex; \
+    apt-get update; \
+    apt-get install -y --no-install-recommends python${PYTHON_VERSION} python3-pip; \
+    pip install -r requirements.txt; \
+    rm -rf /var/lib/apt/lists/* requirements.txt
+
+USER spark
diff --git a/spark-py/requirements.txt b/spark-py/requirements.txt
new file mode 100644
index 0000000..e69de29
diff --git a/spark-r/Dockerfile b/spark-r/Dockerfile
new file mode 100644
index 0000000..a124df7
--- /dev/null
+++ b/spark-r/Dockerfile
@@ -0,0 +1,37 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+ARG SPARK_VERSION=3.2.1
+ARG HADOOP_VERSION=3.2
+ARG SCALA_VERSION=2.12
+ARG JAVA_VERSION=11
+
+ARG REGISTRY=quay.io
+ARG REPO=okdp
+ARG BASE_IMAGE=${REGISTRY}/${REPO}/spark:spark-${SPARK_VERSION}-scala-${SCALA_VERSION}-java-${JAVA_VERSION}
+
+FROM $BASE_IMAGE
+
+USER root
+
+RUN set -ex; \
+    apt-get update; \
+    apt-get install -y --no-install-recommends r-base r-base-dev; \
+    rm -rf /var/lib/apt/lists/*
+
+ENV R_HOME /usr/lib/R
+
+USER spark
diff --git a/spark/Dockerfile b/spark/Dockerfile
new file mode 100644
index 0000000..d50d6ad
--- /dev/null
+++ b/spark/Dockerfile
@@ -0,0 +1,75 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+ARG SPARK_VERSION=3.2.1
+ARG HADOOP_VERSION=3.2
+ARG SCALA_VERSION=2.12
+ARG JAVA_VERSION=11
+
+ARG REGISTRY=quay.io
+ARG REPO=okdp
+ARG BASE_IMAGE=${REGISTRY}/${REPO}/spark:base-spark-${SPARK_VERSION}-scala-${SCALA_VERSION}-java-${JAVA_VERSION}
+
+FROM eclipse-temurin:${JAVA_VERSION}-jre-jammy AS okdp_addons
+ARG SPARK_VERSION=3.2.1
+ARG SCALA_VERSION=2.12
+
+RUN set -ex; \
+    apt-get update; \
+    apt-get install -y --no-install-recommends maven dos2unix
+
+WORKDIR /workspace
+
+COPY okdp-addons.pom .
+COPY minio.pom .
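+# okdp-addons.pom declares the OKDP extension jars (auth filter, JMX exporter,
+# S3 support); minio.pom builds the slim S3-only AWS SDK bundle it pulls in
+# (see the descriptions in those files).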
+
+# This setup consumes less space compared to inheriting from the parent pom
+# Handles the transitive dependency versions through the pom
+# Manages the Java AWS SDK v1 (hadoop < 3.4) / v2 (hadoop >= 3.4)
+# Some pom.xml files come with Control-M (^M) characters, hence the dos2unix step
+# The minio and aws profiles are mutually exclusive: aws includes minio
+RUN mvn -ntp dependency:get -DgroupId=org.apache.spark -DartifactId=spark-parent_${SCALA_VERSION} -Dversion=${SPARK_VERSION} -Dpackaging=pom; \
+    mvn -ntp dependency:copy -Dartifact=org.apache.spark:spark-parent_${SCALA_VERSION}:${SPARK_VERSION}:pom -Dproject.basedir=./ -DoutputDirectory=./; \
+    dos2unix spark-parent_${SCALA_VERSION}-${SPARK_VERSION}.pom; \
+    HADOOP_VERSION=$(grep "<hadoop.version>" spark-parent_${SCALA_VERSION}-${SPARK_VERSION}.pom | tr -d ' ' | sed -e 's/^ *<hadoop.version>\(.*\)<\/hadoop.version> *$/\1/' | sort -rn | head -n 1); \
+    mvn -ntp dependency:get -DgroupId=org.apache.hadoop -DartifactId=hadoop-aws -Dversion=${HADOOP_VERSION} -Dpackaging=pom; \
+    mvn -ntp dependency:copy -Dartifact=org.apache.hadoop:hadoop-aws:${HADOOP_VERSION}:pom -Dproject.basedir=./ -DoutputDirectory=./; \
+    AWS_JAVA_SDK_VERSION=$(mvn -ntp dependency:tree -f hadoop-aws-${HADOOP_VERSION}.pom | grep -E "(com.amazonaws|software.amazon.awssdk):(aws-java-sdk-bundle|bundle):jar:.*:compile" | awk '{ print $NF }' | awk -F: '{ print $4 }'); \
+    mvn -ntp clean install \
+        -Daws-java-sdk.version=${AWS_JAVA_SDK_VERSION} \
+        -Daws-sdk-profile.version=v$(echo ${AWS_JAVA_SDK_VERSION} | cut -d '.' -f 1) \
+        -f minio.pom; \
+    mvn -ntp clean dependency:copy-dependencies \
+        -Dspark.version=${SPARK_VERSION} \
+        -Dscala.version=${SCALA_VERSION} \
+        -Dhadoop.version=${HADOOP_VERSION} \
+        -Daws-java-sdk.version=${AWS_JAVA_SDK_VERSION} \
+        -Pminio \
+        -f okdp-addons.pom
+
+FROM $BASE_IMAGE
+
+ENV JMX_CONF_DIR /etc/metrics/conf/
+
+# OKDP addons
+COPY --from=okdp_addons --chown=spark:spark /workspace/target/dependency/* $SPARK_HOME/jars/
+
+# Jmx prometheus metrics
+COPY --chown=spark:spark metrics.properties ${JMX_CONF_DIR}/metrics.properties
+COPY --chown=spark:spark prometheus.yaml ${JMX_CONF_DIR}/prometheus.yaml
+
+USER spark
+
diff --git a/spark/metrics.properties b/spark/metrics.properties
new file mode 100644
index 0000000..9640deb
--- /dev/null
+++ b/spark/metrics.properties
@@ -0,0 +1,19 @@
+#
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink +driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource +executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource \ No newline at end of file diff --git a/spark/minio.pom b/spark/minio.pom new file mode 100644 index 0000000..e2c37ca --- /dev/null +++ b/spark/minio.pom @@ -0,0 +1,209 @@ + + + + + 4.0.0 + OKDP AWS SDK for Java - Minio Bundle + com.amazonaws + okdp-minio-aws-s3-bundle + ${aws-java-sdk.version} + jar + + OKDP AWS SDK for Java - Minio Bundle + The bundle contains S3 service only with around 6.5MB instead of +350MB (+540MB in v2 bundle) + + + UTF-8 + + + + + + minio-aws-java-sdk-s3-v1 + + + aws-sdk-profile.version + v1 + + + + + com.amazonaws + aws-java-sdk-s3 + ${aws-java-sdk.version} + + + + com.amazonaws + aws-java-sdk-dynamodb + ${aws-java-sdk.version} + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + package + + shade + + + false + true + + + joda-time:joda-time + com.fasterxml.jackson.core:* + com.fasterxml.jackson.dataformat:jackson-dataformat-cbor + org.apache.httpcomponents:* + commons-codec:commons-codec + commons-logging:commons-logging + io.netty:* + com.amazonaws:* + + + + + org.joda + com.amazonaws.thirdparty.joda + + + com.fasterxml.jackson + com.amazonaws.thirdparty.jackson + + + org.apache.http + com.amazonaws.thirdparty.apache.http + + + org.apache.commons.codec + com.amazonaws.thirdparty.apache.codec + + + org.apache.commons.logging + com.amazonaws.thirdparty.apache.logging + + + io.netty + com.amazonaws.thirdparty.io.netty + + + + + + + + + + + + + minio-aws-java-sdk-s3-v2 + + + aws-sdk-profile.version + v2 + + + + + software.amazon.awssdk + s3 + ${aws-java-sdk.version} + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + package + + shade + + + false + true + + + com.fasterxml.jackson.jr:* + io.netty:* + org.apache.httpcomponents:* + org.reactivestreams:* + org.slf4j:* + commons-codec:commons-codec + software.amazon.awssdk:* + software.amazon:* + software.amazon.s3.accessgrants:* + com.github.ben-manes.caffeine:* + commons-logging:* + + + + + org.apache + software.amazon.awssdk.thirdparty.org.apache + + org.apache.log4j.* + + + + io.netty + software.amazon.awssdk.thirdparty.io.netty + + + org.slf4j + software.amazon.awssdk.thirdparty.org.slf4j + + + + + + + + + + + \ No newline at end of file diff --git a/spark/okdp-addons.pom b/spark/okdp-addons.pom new file mode 100644 index 0000000..ef663c6 --- /dev/null +++ b/spark/okdp-addons.pom @@ -0,0 +1,149 @@ + + + + + 4.0.0 + OKDP Addons + io.okdp + okdp-spark-docker-addons + ${spark.version} + pom + + OKDP extensions for spark docker images + + + UTF-8 + + + + + + io.okdp + okdp-spark-auth-filter + 1.1.0 + + + * + * + + + + + + io.prometheus.jmx + jmx_prometheus_javaagent + 0.20.0 + + + * + * + + + + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + + + * + * + + + + + + + + minio + + + org.apache.hadoop + hadoop-aws + ${hadoop.version} + + + * + * + + + + + + com.amazonaws + okdp-minio-aws-s3-bundle + ${aws-java-sdk.version} + + + + org.apache.spark + spark-hadoop-cloud_${scala.version} + ${spark.version} + + + * + * + + + + + + + + aws + + + org.apache.hadoop + hadoop-aws + ${hadoop.version} + + + + org.apache.spark + spark-hadoop-cloud_${scala.version} + ${spark.version} + + + * + * + + + + + + + \ No newline at end of file diff --git a/spark/prometheus.yaml b/spark/prometheus.yaml new file mode 100644 index 0000000..f054e28 --- /dev/null +++ b/spark/prometheus.yaml @@ -0,0 +1,123 @@ +# +# 
Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+---
+lowercaseOutputName: true
+attrNameSnakeCase: true
+rules:
+  # These come from the application driver if it's a streaming application
+  # Example: default/streaming.driver.com.example.ClassName.StreamingMetrics.streaming.lastCompletedBatch_schedulingDelay
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.(\S+)\.StreamingMetrics\.streaming\.(\S+)><>Value
+    name: spark_streaming_driver_$4
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # These come from the application driver if it's a structured streaming application
+  # Example: default/streaming.driver.spark.streaming.QueryName.inputRate-total
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.spark\.streaming\.(\S+)\.(\S+)><>Value
+    name: spark_structured_streaming_driver_$4
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      query_name: "$3"
+  # These come from the application executors
+  # Example: default/spark-pi.0.executor.threadpool.activeTasks
+  - pattern: metrics<name=(\S+)\.(\S+)\.(\S+)\.executor\.(\S+)><>Value
+    name: spark_executor_$4
+    type: GAUGE
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      executor_id: "$3"
+  # These come from the application driver
+  # Example: default/spark-pi.driver.DAGScheduler.stage.failedStages
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.(BlockManager|DAGScheduler|jvm)\.(\S+)><>Value
+    name: spark_driver_$3_$4
+    type: GAUGE
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # These come from the application driver
+  # Emulate timers for DAGScheduler like messageProcessingTime
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.DAGScheduler\.(.*)><>Count
+    name: spark_driver_DAGScheduler_$3_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # HiveExternalCatalog is of type counter
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.HiveExternalCatalog\.(.*)><>Count
+    name: spark_driver_HiveExternalCatalog_$3_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # These come from the application driver
+  # Emulate histograms for CodeGenerator
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.CodeGenerator\.(.*)><>Count
+    name: spark_driver_CodeGenerator_$3_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # These come from the application driver
+  # Emulate timer (keep only count attribute) plus counters for LiveListenerBus
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.LiveListenerBus\.(.*)><>Count
+    name: spark_driver_LiveListenerBus_$3_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # Get Gauge type metrics for LiveListenerBus
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.LiveListenerBus\.(\S+)><>Value
+    name: spark_driver_LiveListenerBus_$3
+    type: GAUGE
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # Executors counters
+  - pattern: metrics<name=(\S+)\.(\S+)\.(.*)\.executor\.(.*)><>Count
+    name: spark_executor_$4_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      executor_id: "$3"
+  # These come from the application executors
+  # Example: app-20160809000059-0000.0.jvm.threadpool.activeTasks
+  - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.(jvm|NettyBlockTransfer)\.(.*)><>Value
+    name: spark_executor_$4_$5
+    type: GAUGE
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      executor_id: "$3"
+  - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.HiveExternalCatalog\.(.*)><>Count
+    name: spark_executor_HiveExternalCatalog_$4_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      executor_id: "$3"
+  # These come from the application driver
+  # Emulate histograms for CodeGenerator
+  - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.CodeGenerator\.(.*)><>Count
+    name: spark_executor_CodeGenerator_$4_count
+
type: COUNTER + labels: + app_namespace: "$1" + app_id: "$2" + executor_id: "$3" \ No newline at end of file
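+# Usage sketch (assumes the agent jar keeps its default Maven file name): the
+# spark image ships this file at /etc/metrics/conf/prometheus.yaml together with
+# the JMX exporter agent (jmx_prometheus_javaagent 0.20.0) in $SPARK_HOME/jars,
+# so a driver can expose these metrics on, e.g., port 8090 with:
+#   --conf "spark.driver.extraJavaOptions=-javaagent:/opt/spark/jars/jmx_prometheus_javaagent-0.20.0.jar=8090:/etc/metrics/conf/prometheus.yaml"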