aws · Yadan-Wei · Dec 5, 2024 · Nov 27, 2024 · Nov 27, 2024 · Nov 27, 2024
@@ -38,6 +38,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION,
                                 "-ec2" ]
+    # build_tag_override: "beta:2.16.2-cpu-py310-ubuntu20.04-ec2"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: ec2
     enable_test_promotion: true
@@ -55,6 +56,7 @@ images:
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-",
                                 *OS_VERSION, "-ec2" ]
+    # build_tag_override: "beta:2.16.2-gpu-py310-cu123-ubuntu20.04-ec2"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
                          *DEVICE_TYPE ]
     target: ec2

@@ -64,7 +64,6 @@ check_efa_nccl_all_reduce(){
     # the container. Not using full-paths of mpirun and other executables because these paths can change across PyTorch
     # versions in DLC images.
     mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \
-        -x NCCL_TUNER_PLUGIN=/usr/local/lib/libnccl-ofi-tuner.so \
         -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \
         -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \
         -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \

@@ -31,4 +31,12 @@ git clone https://github.com/pytorch/benchmark.git
 cd benchmark
 git checkout 350bb04  # pin commit for Python 3.11 support
 pip install -r requirements.txt
+
+# Pin pynvml below 12 to avoid issues with 12.0.0.
+# Issue: https://github.com/pytorch/benchmark/issues/2552
+# PR: https://github.com/pytorch/benchmark/pull/2553
+# Remove this pin once the upstream issue is fixed.
+pip uninstall pynvml -y
+pip install "pynvml<12"
+
 pip install numba