!! fixup / testing

NixOS · Dec 24, 2024 · 22f00e1 · 22f00e1
1 parent 4400b4b
commit 22f00e1
Show file tree

Hide file tree

Showing 17 changed files with 114 additions and 78 deletions.
diff --git a/pkgs/by-name/uc/ucx/package.nix b/pkgs/by-name/uc/ucx/package.nix
@@ -33,8 +33,10 @@ let
     paths = rocmList;
   };
 
+  # The ROCm build fails with the GCC stdenv due to an unrecognised argument `parallel-jobs`
+  stdenv' = if enableRocm then rocmPackages.llvm.rocmClangStdenv else stdenv;
 in
-stdenv.mkDerivation rec {
+stdenv'.mkDerivation rec {
   pname = "ucx";
   version = "1.17.0";
 

diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix
@@ -16,11 +16,10 @@
     if cudaSupport then
       magma-cuda-static
     else if rocmSupport then
-      magma-hip
+      null
     else
       magma,
   magma,
-  magma-hip,
   magma-cuda-static,
   # Use the system NCCL as long as we're targeting CUDA on a supported platform.
   useSystemNccl ? (cudaSupport && !cudaPackages.nccl.meta.unsupported || rocmSupport),
@@ -36,6 +35,7 @@
   symlinkJoin,
   which,
   pybind11,
+  pkg-config,
   removeReferencesTo,
 
   # Build inputs
@@ -54,6 +54,7 @@
   cffi,
   click,
   typing-extensions,
+  six,
   # ROCm build and `torch.compile` requires `triton`
   tritonSupport ? (!stdenv.hostPlatform.isDarwin),
   triton,
@@ -66,7 +67,13 @@
   #          (dependencies without cuda support).
   #          Instead we should rely on overlays and nixpkgsFun.
   # (@SomeoneSerge)
-  _tritonEffective ? if cudaSupport then triton-cuda else triton,
+  _tritonEffective ?
+    if cudaSupport then
+      triton-cuda
+    else if rocmSupport then
+      rocmPackages.triton
+    else
+      triton,
   triton-cuda,
 
   # Unit tests
@@ -86,14 +93,13 @@
 
   # dependencies for torch.utils.tensorboard
   pillow,
-  six,
   future,
   tensorboard,
   protobuf,
 
   # ROCm dependencies
   rocmSupport ? config.rocmSupport,
-  rocmPackages_5,
+  rocmPackages,
   gpuTargets ? [ ],
 
   vulkanSupport ? false,
@@ -113,8 +119,6 @@ let
 
   triton = throw "python3Packages.torch: use _tritonEffective instead of triton to avoid divergence";
 
-  rocmPackages = rocmPackages_5;
-
   setBool = v: if v then "1" else "0";
 
   # https://github.com/pytorch/pytorch/blob/v2.4.0/torch/utils/cpp_extension.py#L1953
@@ -180,7 +184,7 @@ let
       clr
       rccl
       miopen
-      miopengemm
+      aotriton
       rocrand
       rocblas
       rocsparse
@@ -192,8 +196,11 @@ let
       rocfft
       rocsolver
       hipfft
+      hiprand
       hipsolver
+      hipblas-common
       hipblas
+      hipblaslt
       rocminfo
       rocm-thunk
       rocm-comgr
@@ -212,6 +219,7 @@ let
   brokenConditions = attrsets.filterAttrs (_: cond: cond) {
     "CUDA and ROCm are mutually exclusive" = cudaSupport && rocmSupport;
     "CUDA is not targeting Linux" = cudaSupport && !stdenv.hostPlatform.isLinux;
+    "ROCm 6 is currently not compatible with magma" = rocmSupport && effectiveMagma != null;
     "Unsupported CUDA version" =
       cudaSupport
       && !(builtins.elem cudaPackages.cudaMajorVersion [
@@ -225,8 +233,6 @@ let
     # In particular, this triggered warnings from cuda's `aliases.nix`
     "Magma cudaPackages does not match cudaPackages" =
       cudaSupport && (effectiveMagma.cudaPackages.cudaVersion != cudaPackages.cudaVersion);
-    "Rocm support is currently broken because `rocmPackages.hipblaslt` is unpackaged. (2024-06-09)" =
-      rocmSupport;
   };
 
   git-unroll = fetchFromGitea {
@@ -388,6 +394,10 @@ buildPythonPackage rec {
   # We only do an imports check, so do not build tests either.
   BUILD_TEST = setBool false;
 
+  # The ninja hook doesn't automatically turn on Ninja here because PyTorch's
+  # setup.py is responsible for choosing the generator, so set it explicitly.
+  CMAKE_GENERATOR = "Ninja";
+
   # Unlike MKL, oneDNN (née MKLDNN) is FOSS, so we enable support for
   # it by default. PyTorch currently uses its own vendored version
   # of oneDNN through Intel iDeep.
@@ -406,6 +416,7 @@ buildPythonPackage rec {
 
   cmakeFlags =
     [
+      (lib.cmakeFeature "PYTHON_SIX_SOURCE_DIR" "${six.src}")
       # (lib.cmakeBool "CMAKE_FIND_DEBUG_MODE" true)
       (lib.cmakeFeature "CUDAToolkit_VERSION" cudaPackages.cudaVersion)
     ]
@@ -454,6 +465,8 @@ buildPythonPackage rec {
 
   env =
     {
+      # Builds faster without response files, and we don't have enough inputs for command-line length to be an issue
+      NIX_CC_USE_RESPONSE_FILE = 0;
       # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
       # (upstream seems to have fixed this in the wrong place?)
       # https://github.com/intel/mkl-dnn/commit/8134d346cdb7fe1695a2aa55771071d455fae0bc
@@ -511,6 +524,9 @@ buildPythonPackage rec {
     }
     // lib.optionalAttrs vulkanSupport {
       VULKAN_SDK = shaderc.bin;
+    }
+    // lib.optionalAttrs rocmSupport {
+      AOTRITON_INSTALLED_PREFIX = "${rocmPackages.aotriton}";
     };
 
   nativeBuildInputs =
@@ -519,6 +535,7 @@ buildPythonPackage rec {
       which
       ninja
       pybind11
+      pkg-config
       removeReferencesTo
     ]
     ++ lib.optionals cudaSupport (
@@ -564,7 +581,7 @@ buildPythonPackage rec {
       ]
     )
     ++ lib.optionals rocmSupport [ rocmPackages.llvm.openmp ]
-    ++ lib.optionals (cudaSupport || rocmSupport) [ effectiveMagma ]
+    ++ lib.optionals (effectiveMagma != null && (cudaSupport || rocmSupport)) [ effectiveMagma ]
     ++ lib.optionals stdenv.hostPlatform.isLinux [ numactl ]
     ++ lib.optionals stdenv.hostPlatform.isDarwin [
       apple-sdk_13

diff --git a/pkgs/development/rocm-modules/6/aotriton/default.nix b/pkgs/development/rocm-modules/6/aotriton/default.nix
@@ -33,7 +33,13 @@
   buildTests ? false,
   buildBenchmarks ? false,
   buildSamples ? false,
-  gpuTargets ? [ "gfx908" ], # [  ]
+  gpuTargets ? [
+    "gfx908"
+    "gfx90a"
+    "gfx942"
+    "gfx1030"
+    "gfx1100"
+  ], # [  ]
 }:
 
 stdenv.mkDerivation (
@@ -60,7 +66,7 @@ stdenv.mkDerivation (
     #   cd $out
     #   tar xf  ${cudaPackages.cuda_cudart.src} --strip-components=1
     # '';
-    cudaRtIncludes = cudaPackages.cudatoolkit;
+    #cudaRtIncludes = cudaPackages.cuda_cudart;
     triton-llvm' = builtins.trace "aotriton: TODO: confirm using same triton-llvm pinned hash as triton 3.2.x is ok" triton-llvm;
   in
   # triton-llvm' = triton-llvm.overrideAttrs (_old: {
@@ -86,8 +92,6 @@ stdenv.mkDerivation (
     };
     env.CXX = compiler;
     env.ROCM_PATH = "${clr}";
-    env.NIX_CC_USE_RESPONSE_FILE = 0;
-    env.NIX_DISABLE_WRAPPER_INCLUDES = 1;
     requiredSystemFeatures = [ "big-parallel" ];
 
     outputs =
@@ -142,7 +146,7 @@ stdenv.mkDerivation (
         xz
         nlohmann_json
         rocmlir
-        cudaRtIncludes
+        #cudaRtIncludes
 
         # Tensile deps - not optional, building without tensile isn't actually supported
         msgpack # FIXME: not included in cmake!
@@ -164,15 +168,16 @@ stdenv.mkDerivation (
     env.JSON_SYSPATH = nlohmann_json;
     env.MLIR_DIR = "${triton-llvm'}/lib/cmake/mlir";
     # build time dep for header only, only needs source.
-    env.TRITON_CUDACRT_PATH = cudaRtIncludes;
-    env.TRITON_CUDART_PATH = cudaRtIncludes;
-    env.CXXFLAGS = "-I/build/source/third_party/triton/third_party/nvidia/backend/include -I${cudaRtIncludes}/include";
+    # env.TRITON_CUDACRT_PATH = cudaRtIncludes;
+    # env.TRITON_CUDART_PATH = cudaRtIncludes;
+    env.CXXFLAGS = "-I/build/source/third_party/triton/third_party/nvidia/backend/include";
     # env.NOIMAGE_MODE = 1;
 
     # Fix up header issues in triton: https://github.com/triton-lang/triton/pull/3985/files
     preConfigure = ''
       mkdir third_party/triton/third_party/nvidia/backend/include/
-      cp ${cudaRtIncludes}/include/*.h third_party/triton/third_party/nvidia/backend/include/
+      touch third_party/triton/third_party/nvidia/backend/include/cuda.h
+      #cp ''${cudaRtIncludes}/include/*.h third_party/triton/third_party/nvidia/backend/include/
       find third_party/triton -type f -exec sed -i 's|[<]cupti.h[>]|"cupti.h"|g' {} +
       find third_party/triton -type f -exec sed -i 's|[<]cuda.h[>]|"cuda.h"|g' {} +
       grep -ir cuda.h third_party/triton

diff --git a/pkgs/development/rocm-modules/6/composable_kernel/default.nix b/pkgs/development/rocm-modules/6/composable_kernel/default.nix
@@ -59,6 +59,7 @@ stdenv.mkDerivation (finalAttrs: {
     clr
     hipify
     ninja
+    zstd
   ];
 
   buildInputs = [

diff --git a/pkgs/development/rocm-modules/6/default.nix b/pkgs/development/rocm-modules/6/default.nix
@@ -14,13 +14,15 @@
   emptyDirectory,
   cudaPackages,
   triton-llvm,
+  openmpi,
 }:
 
 lib.makeScope newScope (
   self:
   let
     pyPackages = python3Packages;
     libffiorig = libffi;
+    openmpi-orig = openmpi;
   in
   with self;
   {
@@ -556,17 +558,29 @@ lib.makeScope newScope (
       useCPU = true;
     };
 
+    openmpi = openmpi-orig.override (prev: {
+      ucx = prev.ucx.override {
+        enableCuda = false;
+        enableRocm = true;
+      };
+    });
+    mpi = self.openmpi;
+
     triton-llvm =
-      builtins.trace "FIXME: triton-rocm needs ANOTHER different LLVM build" triton-llvm.overrideAttrs
+      builtins.trace "FIXME: triton-rocm needs ANOTHER different LLVM build"
+        (triton-llvm.override {
+          buildTests = false; # FIXME: why are tests failing?
+        }).overrideAttrs
         {
           src = fetchFromGitHub {
             owner = "llvm";
             repo = "llvm-project";
             # make sure this matches triton llvm rel branch hash for now
             # https://github.com/triton-lang/triton/blob/release/3.2.x/cmake/llvm-hash.txt
-            rev = "b5cc222d7429fe6f18c787f633d5262fac2e676f";
-            hash = "sha256-iH5OBwtmJLHao2PhxKT8w+vGlFE0D2R/ry8j9nZs+TQ=";
+            rev = "86b69c31642e98f8357df62c09d118ad1da4e16a";
+            hash = "sha256-W/mQwaLGx6/rIBjdzUTIbWrvGjdh7m4s15f70fQ1/hE=";
           };
+          pname = "triton-llvm-rocm";
           patches = [ ]; # FIXME: https://github.com/llvm/llvm-project//commit/84837e3cc1cf17ed71580e3ea38299ed2bfaa5f6.patch doesn't apply, may need to rebase
         };
 
@@ -579,30 +593,38 @@ lib.makeScope newScope (
         llvm = self.triton-llvm;
       })).overridePythonAttrs
         (old: {
-
+          doCheck = false;
           stdenv = self.llvm.rocmClangStdenv;
           version = "3.2.0";
           src = fetchFromGitHub {
             owner = "triton-lang";
             repo = "triton";
-            rev = "release/3.2.x";
-            hash = "sha256-cC2eARYcmZqLrzwlmMi92xkEqpGMn2d9IndZQBoGE7Q=";
+            rev = "64b80f0916b69e3c4d0682a2368fd126e57891ab"; # "release/3.2.x";
+            hash = "sha256-xQOgMLHruVrI/9FtY3TvZKALitMOfqZ69uOyrYhXhu8=";
           };
           buildInputs = old.buildInputs ++ [
             self.clr
           ];
           dontStrip = true;
           env = old.env // {
-            CXXFLAGS = "-gz -g1 -O3 -I${self.clr}/include -I/build/source/third_party/triton/third_party/nvidia/backend/include -I${cudaPackages.cudatoolkit}/include";
+            CXXFLAGS = "-gz -g1 -O3 -I${self.clr}/include -I/build/source/third_party/triton/third_party/nvidia/backend/include";
+            TRITON_OFFLINE_BUILD = 1;
           };
-          # TRITON_BUILD_PROTON = "OFF"; # disable profiler, instead of --replace-fail 'packages += ["triton/profiler"]' ""\
           patches = [ ];
           postPatch = ''
             # Need an empty cuda.h to happily compile for ROCm
+            mkdir -p third_party/nvidia/include/ third_party/nvidia/include/backend/include/
             echo "" > third_party/nvidia/include/cuda.h
-
-            mkdir third_party/nvidia/backend/include/
-            cp ${cudaPackages.cudatoolkit}/include/*.h third_party/nvidia/backend/include/
+            touch third_party/nvidia/include/backend/include/{cuda,driver_types}.h
+            rm -rf third_party/nvidia
+            substituteInPlace CMakeLists.txt \
+              --replace-fail "add_subdirectory(test)" ""
+            sed -i '/nvidia\|NVGPU\|registerConvertTritonGPUToLLVMPass\|mlir::test::/Id' bin/RegisterTritonDialects.h
+            sed -i '/TritonTestAnalysis/Id' bin/CMakeLists.txt
+            substituteInPlace python/setup.py \
+              --replace-fail 'backends = [*BackendInstaller.copy(["nvidia", "amd"]), *BackendInstaller.copy_externals()]' \
+              'backends = [*BackendInstaller.copy(["amd"]), *BackendInstaller.copy_externals()]'
+            #cp ''${cudaPackages.cuda_cudart}/include/*.h third_party/nvidia/backend/include/
             find . -type f -exec sed -i 's|[<]cupti.h[>]|"cupti.h"|g' {} +
             find . -type f -exec sed -i 's|[<]cuda.h[>]|"cuda.h"|g' {} +
 

diff --git a/pkgs/development/rocm-modules/6/hipblas/default.nix b/pkgs/development/rocm-modules/6/hipblas/default.nix
@@ -24,8 +24,6 @@
 stdenv.mkDerivation (finalAttrs: {
   pname = "hipblas";
   version = "6.3.1";
-  env.NIX_DEBUG = 1;
-  env.NIX_DISABLE_WRAPPER_INCLUDES = 1;
 
   outputs =
     [

diff --git a/pkgs/development/rocm-modules/6/hipblaslt/default.nix b/pkgs/development/rocm-modules/6/hipblaslt/default.nix
@@ -25,7 +25,16 @@
   buildTests ? false,
   buildBenchmarks ? false,
   buildSamples ? false,
-  gpuTargets ? [ "gfx908" ], # [  ]
+  # hipblaslt supports only devices with MFMA or WMMA
+  # WMMA on gfx1100 may be broken
+  # MFMA on MI100 may be broken
+  # MI200/MI300 known to work
+  gpuTargets ? [
+    "gfx908"
+    "gfx90a"
+    "gfx942"
+    "gfx1100"
+  ],
 }:
 
 stdenv.mkDerivation (
@@ -62,14 +71,11 @@ stdenv.mkDerivation (
     env.CXXFLAGS = cFlags;
     env.ROCM_PATH = "${clr}";
     env.TENSILE_ROCM_ASSEMBLER_PATH = "${clang-sysrooted}/bin/clang++";
-    env.NIX_CC_USE_RESPONSE_FILE = 0;
-    env.NIX_DISABLE_WRAPPER_INCLUDES = 1;
     env.TENSILE_GEN_ASSEMBLY_TOOLCHAIN = "${clang-sysrooted}/bin/clang++";
     requiredSystemFeatures = [ "big-parallel" ];
 
     patches = [
       ./ext-op-first.diff
-      # ./alpha_1_init_fix.patch # libcxx bug workaround -
     ];
 
     outputs =

diff --git a/pkgs/development/rocm-modules/6/miopen/default.nix b/pkgs/development/rocm-modules/6/miopen/default.nix
@@ -39,6 +39,8 @@
     "gfx908"
     "gfx90a"
     "gfx942"
+    "gfx1030"
+    "gfx1100"
   ], # clr.gpuTargets
   buildDocs ? false, # Needs internet because of rocm-docs-core
   buildTests ? false,
-Original file line number
+Diff line change
@@ Expand Up / @@ -59,6 +59,7 @@ stdenv.mkDerivation (finalAttrs: { @@
         clr
         hipify
         ninja
+        zstd
       ];
       buildInputs = [
@@ Expand Down @@