diff --git a/tensorflow/training/buildspec-2-16-ec2.yml b/tensorflow/training/buildspec-2-16-ec2.yml index ce1652ebc706..f522225c6b28 100644 --- a/tensorflow/training/buildspec-2-16-ec2.yml +++ b/tensorflow/training/buildspec-2-16-ec2.yml @@ -38,6 +38,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + # build_tag_override: "beta:2.16.2-cpu-py310-ubuntu20.04-ec2" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 enable_test_promotion: true @@ -55,6 +56,7 @@ images: tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + # build_tag_override: "beta:2.16.2-gpu-py310-cu123-ubuntu20.04-ec2" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 diff --git a/test/dlc_tests/container_tests/bin/efa/testEFA b/test/dlc_tests/container_tests/bin/efa/testEFA index 72d2ce159f91..f6af82c40a82 100755 --- a/test/dlc_tests/container_tests/bin/efa/testEFA +++ b/test/dlc_tests/container_tests/bin/efa/testEFA @@ -64,7 +64,6 @@ check_efa_nccl_all_reduce(){ # the container. Not using full-paths of mpirun and other executables because these paths can change across PyTorch # versions in DLC images. mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ - -x NCCL_TUNER_PLUGIN=/usr/local/lib/libnccl-ofi-tuner.so \ -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ diff --git a/test/dlc_tests/container_tests/bin/pytorch_tests/setupPyTorchBackendTest b/test/dlc_tests/container_tests/bin/pytorch_tests/setupPyTorchBackendTest index f2d3e1541f9a..4feffb3bd068 100755 --- a/test/dlc_tests/container_tests/bin/pytorch_tests/setupPyTorchBackendTest +++ b/test/dlc_tests/container_tests/bin/pytorch_tests/setupPyTorchBackendTest @@ -31,4 +31,12 @@ git clone https://github.com/pytorch/benchmark.git cd benchmark git checkout 350bb04 # pin commit for Python 3.11 support pip install -r requirements.txt + +# pin pynvml to avoid issues with 12.0.0 +# https://github.com/pytorch/benchmark/issues/2552 +# PR: https://github.com/pytorch/benchmark/pull/2553 +# when this issue is fixed can revert back +pip uninstall pynvml -y +pip install "pynvml<12" + pip install numba