From 22f00e136e7b2c8165c312fa662ce03baaed0b11 Mon Sep 17 00:00:00 2001
From: Luna Nova <git@lunnova.dev>
Date: Tue, 24 Dec 2024 08:58:12 -0800
Subject: [PATCH] !! fixup / testing

---
 pkgs/by-name/uc/ucx/package.nix               |  4 +-
 .../python-modules/torch/default.nix          | 39 +++++++++++-----
 .../rocm-modules/6/aotriton/default.nix       | 23 ++++++----
 .../6/composable_kernel/default.nix           |  1 +
 pkgs/development/rocm-modules/6/default.nix   | 44 ++++++++++++++-----
 .../rocm-modules/6/hipblas/default.nix        |  2 -
 .../rocm-modules/6/hipblaslt/default.nix      | 14 ++++--
 .../rocm-modules/6/miopen/default.nix         |  2 +
 .../rocm-modules/6/mscclpp/default.nix        |  6 +--
 .../rocm-modules/6/rccl/default.nix           |  6 +--
 .../rocm-modules/6/rocblas/default.nix        | 27 +++---------
 .../rocm-modules/6/rocfft/default.nix         |  4 +-
 .../rocm-modules/6/rocm-core/default.nix      |  2 +-
 .../6/rocm-device-libs/default.nix            |  5 ---
 .../rocm-modules/6/rocsolver/default.nix      |  1 -
 .../rocm-modules/6/rocsparse/default.nix      |  3 +-
 pkgs/top-level/stage.nix                      |  9 ++++
 17 files changed, 114 insertions(+), 78 deletions(-)

diff --git a/pkgs/by-name/uc/ucx/package.nix b/pkgs/by-name/uc/ucx/package.nix
index 87fb2eaaf824eb..cc9c3f4461d577 100644
--- a/pkgs/by-name/uc/ucx/package.nix
+++ b/pkgs/by-name/uc/ucx/package.nix
@@ -33,8 +33,10 @@ let
     paths = rocmList;
   };
 
+  # The ROCm build fails with the gcc stdenv due to an unrecognised `parallel-jobs` argument
+  stdenv' = if enableRocm then rocmPackages.llvm.rocmClangStdenv else stdenv;
 in
-stdenv.mkDerivation rec {
+stdenv'.mkDerivation rec {
   pname = "ucx";
   version = "1.17.0";
 
diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix
index f85ccc9e85f40a..2703974880bac7 100644
--- a/pkgs/development/python-modules/torch/default.nix
+++ b/pkgs/development/python-modules/torch/default.nix
@@ -16,11 +16,10 @@
     if cudaSupport then
       magma-cuda-static
     else if rocmSupport then
-      magma-hip
+      null
     else
       magma,
   magma,
-  magma-hip,
   magma-cuda-static,
   # Use the system NCCL as long as we're targeting CUDA on a supported platform.
   useSystemNccl ? (cudaSupport && !cudaPackages.nccl.meta.unsupported || rocmSupport),
@@ -36,6 +35,7 @@
   symlinkJoin,
   which,
   pybind11,
+  pkg-config,
   removeReferencesTo,
 
   # Build inputs
@@ -54,6 +54,7 @@
   cffi,
   click,
   typing-extensions,
+  six,
   # ROCm build and `torch.compile` requires `triton`
   tritonSupport ? (!stdenv.hostPlatform.isDarwin),
   triton,
@@ -66,7 +67,13 @@
   #          (dependencies without cuda support).
   #          Instead we should rely on overlays and nixpkgsFun.
   # (@SomeoneSerge)
-  _tritonEffective ? if cudaSupport then triton-cuda else triton,
+  _tritonEffective ?
+    if cudaSupport then
+      triton-cuda
+    else if rocmSupport then
+      rocmPackages.triton
+    else
+      triton,
   triton-cuda,
 
   # Unit tests
@@ -86,14 +93,13 @@
 
   # dependencies for torch.utils.tensorboard
   pillow,
-  six,
   future,
   tensorboard,
   protobuf,
 
   # ROCm dependencies
   rocmSupport ? config.rocmSupport,
-  rocmPackages_5,
+  rocmPackages,
   gpuTargets ? [ ],
 
   vulkanSupport ? false,
@@ -113,8 +119,6 @@ let
 
   triton = throw "python3Packages.torch: use _tritonEffective instead of triton to avoid divergence";
 
-  rocmPackages = rocmPackages_5;
-
   setBool = v: if v then "1" else "0";
 
   # https://github.com/pytorch/pytorch/blob/v2.4.0/torch/utils/cpp_extension.py#L1953
@@ -180,7 +184,7 @@ let
       clr
       rccl
       miopen
-      miopengemm
+      aotriton
       rocrand
       rocblas
       rocsparse
@@ -192,8 +196,11 @@ let
       rocfft
       rocsolver
       hipfft
+      hiprand
       hipsolver
+      hipblas-common
       hipblas
+      hipblaslt
       rocminfo
       rocm-thunk
       rocm-comgr
@@ -212,6 +219,7 @@ let
   brokenConditions = attrsets.filterAttrs (_: cond: cond) {
     "CUDA and ROCm are mutually exclusive" = cudaSupport && rocmSupport;
     "CUDA is not targeting Linux" = cudaSupport && !stdenv.hostPlatform.isLinux;
+    "ROCm 6 is currently not compatible with magma" = rocmSupport && effectiveMagma != null;
     "Unsupported CUDA version" =
       cudaSupport
       && !(builtins.elem cudaPackages.cudaMajorVersion [
@@ -225,8 +233,6 @@ let
     # In particular, this triggered warnings from cuda's `aliases.nix`
     "Magma cudaPackages does not match cudaPackages" =
       cudaSupport && (effectiveMagma.cudaPackages.cudaVersion != cudaPackages.cudaVersion);
-    "Rocm support is currently broken because `rocmPackages.hipblaslt` is unpackaged. (2024-06-09)" =
-      rocmSupport;
   };
 
   git-unroll = fetchFromGitea {
@@ -388,6 +394,10 @@ buildPythonPackage rec {
   # We only do an imports check, so do not build tests either.
   BUILD_TEST = setBool false;
 
+  # The ninja hook doesn't automatically turn on Ninja here,
+  # because PyTorch's setup.py is responsible for this
+  CMAKE_GENERATOR = "Ninja";
+
   # Unlike MKL, oneDNN (née MKLDNN) is FOSS, so we enable support for
   # it by default. PyTorch currently uses its own vendored version
   # of oneDNN through Intel iDeep.
@@ -406,6 +416,7 @@ buildPythonPackage rec {
 
   cmakeFlags =
     [
+      (lib.cmakeFeature "PYTHON_SIX_SOURCE_DIR" "${six.src}")
       # (lib.cmakeBool "CMAKE_FIND_DEBUG_MODE" true)
       (lib.cmakeFeature "CUDAToolkit_VERSION" cudaPackages.cudaVersion)
     ]
@@ -454,6 +465,8 @@ buildPythonPackage rec {
 
   env =
     {
+      # Builds faster without response files, and we don't have enough inputs for command length to be an issue
+      NIX_CC_USE_RESPONSE_FILE = 0;
       # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
       # (upstream seems to have fixed this in the wrong place?)
       # https://github.com/intel/mkl-dnn/commit/8134d346cdb7fe1695a2aa55771071d455fae0bc
@@ -511,6 +524,9 @@ buildPythonPackage rec {
     }
     // lib.optionalAttrs vulkanSupport {
       VULKAN_SDK = shaderc.bin;
+    }
+    // lib.optionalAttrs rocmSupport {
+      AOTRITON_INSTALLED_PREFIX = "${rocmPackages.aotriton}";
     };
 
   nativeBuildInputs =
@@ -519,6 +535,7 @@ buildPythonPackage rec {
       which
       ninja
       pybind11
+      pkg-config
       removeReferencesTo
     ]
     ++ lib.optionals cudaSupport (
@@ -564,7 +581,7 @@ buildPythonPackage rec {
       ]
     )
     ++ lib.optionals rocmSupport [ rocmPackages.llvm.openmp ]
-    ++ lib.optionals (cudaSupport || rocmSupport) [ effectiveMagma ]
+    ++ lib.optionals (effectiveMagma != null && (cudaSupport || rocmSupport)) [ effectiveMagma ]
     ++ lib.optionals stdenv.hostPlatform.isLinux [ numactl ]
     ++ lib.optionals stdenv.hostPlatform.isDarwin [
       apple-sdk_13
diff --git a/pkgs/development/rocm-modules/6/aotriton/default.nix b/pkgs/development/rocm-modules/6/aotriton/default.nix
index 45d51afaf01e08..67935c3396b737 100644
--- a/pkgs/development/rocm-modules/6/aotriton/default.nix
+++ b/pkgs/development/rocm-modules/6/aotriton/default.nix
@@ -33,7 +33,13 @@
   buildTests ? false,
   buildBenchmarks ? false,
   buildSamples ? false,
-  gpuTargets ? [ "gfx908" ], # [  ]
+  gpuTargets ? [
+    "gfx908"
+    "gfx90a"
+    "gfx942"
+    "gfx1030"
+    "gfx1100"
+  ], # [  ]
 }:
 
 stdenv.mkDerivation (
@@ -60,7 +66,7 @@ stdenv.mkDerivation (
     #   cd $out
     #   tar xf  ${cudaPackages.cuda_cudart.src} --strip-components=1
     # '';
-    cudaRtIncludes = cudaPackages.cudatoolkit;
+    #cudaRtIncludes = cudaPackages.cuda_cudart;
     triton-llvm' = builtins.trace "aotriton: TODO: confirm using same triton-llvm pinned hash as triton 3.2.x is ok" triton-llvm;
   in
   # triton-llvm' = triton-llvm.overrideAttrs (_old: {
@@ -86,8 +92,6 @@ stdenv.mkDerivation (
     };
     env.CXX = compiler;
     env.ROCM_PATH = "${clr}";
-    env.NIX_CC_USE_RESPONSE_FILE = 0;
-    env.NIX_DISABLE_WRAPPER_INCLUDES = 1;
     requiredSystemFeatures = [ "big-parallel" ];
 
     outputs =
@@ -142,7 +146,7 @@ stdenv.mkDerivation (
         xz
         nlohmann_json
         rocmlir
-        cudaRtIncludes
+        #cudaRtIncludes
 
         # Tensile deps - not optional, building without tensile isn't actually supported
         msgpack # FIXME: not included in cmake!
@@ -164,15 +168,16 @@ stdenv.mkDerivation (
     env.JSON_SYSPATH = nlohmann_json;
     env.MLIR_DIR = "${triton-llvm'}/lib/cmake/mlir";
     # build time dep for header only, only needs source.
-    env.TRITON_CUDACRT_PATH = cudaRtIncludes;
-    env.TRITON_CUDART_PATH = cudaRtIncludes;
-    env.CXXFLAGS = "-I/build/source/third_party/triton/third_party/nvidia/backend/include -I${cudaRtIncludes}/include";
+    # env.TRITON_CUDACRT_PATH = cudaRtIncludes;
+    # env.TRITON_CUDART_PATH = cudaRtIncludes;
+    env.CXXFLAGS = "-I/build/source/third_party/triton/third_party/nvidia/backend/include";
     # env.NOIMAGE_MODE = 1;
 
     # Fix up header issues in triton: https://github.com/triton-lang/triton/pull/3985/files
     preConfigure = ''
       mkdir third_party/triton/third_party/nvidia/backend/include/
-      cp ${cudaRtIncludes}/include/*.h third_party/triton/third_party/nvidia/backend/include/
+      touch third_party/triton/third_party/nvidia/backend/include/cuda.h
+      #cp ''${cudaRtIncludes}/include/*.h third_party/triton/third_party/nvidia/backend/include/
       find third_party/triton -type f -exec sed -i 's|[<]cupti.h[>]|"cupti.h"|g' {} +
       find third_party/triton -type f -exec sed -i 's|[<]cuda.h[>]|"cuda.h"|g' {} +
       grep -ir cuda.h third_party/triton
diff --git a/pkgs/development/rocm-modules/6/composable_kernel/default.nix b/pkgs/development/rocm-modules/6/composable_kernel/default.nix
index 06940d03ab2776..faf4450ff3a4ba 100644
--- a/pkgs/development/rocm-modules/6/composable_kernel/default.nix
+++ b/pkgs/development/rocm-modules/6/composable_kernel/default.nix
@@ -59,6 +59,7 @@ stdenv.mkDerivation (finalAttrs: {
     clr
     hipify
     ninja
+    zstd
   ];
 
   buildInputs = [
diff --git a/pkgs/development/rocm-modules/6/default.nix b/pkgs/development/rocm-modules/6/default.nix
index 300b3de636d328..35db8428e4692b 100644
--- a/pkgs/development/rocm-modules/6/default.nix
+++ b/pkgs/development/rocm-modules/6/default.nix
@@ -14,6 +14,7 @@
   emptyDirectory,
   cudaPackages,
   triton-llvm,
+  openmpi,
 }:
 
 lib.makeScope newScope (
@@ -21,6 +22,7 @@ lib.makeScope newScope (
   let
     pyPackages = python3Packages;
     libffiorig = libffi;
+    openmpi-orig = openmpi;
   in
   with self;
   {
@@ -556,17 +558,29 @@ lib.makeScope newScope (
       useCPU = true;
     };
 
+    openmpi = openmpi-orig.override (prev: {
+      ucx = prev.ucx.override {
+        enableCuda = false;
+        enableRocm = true;
+      };
+    });
+    mpi = self.openmpi;
+
     triton-llvm =
-      builtins.trace "FIXME: triton-rocm needs ANOTHER different LLVM build" triton-llvm.overrideAttrs
+      builtins.trace "FIXME: triton-rocm needs ANOTHER different LLVM build"
+        (triton-llvm.override {
+          buildTests = false; # FIXME: why are tests failing?
+        }).overrideAttrs
         {
           src = fetchFromGitHub {
             owner = "llvm";
             repo = "llvm-project";
             # make sure this matches triton llvm rel branch hash for now
             # https://github.com/triton-lang/triton/blob/release/3.2.x/cmake/llvm-hash.txt
-            rev = "b5cc222d7429fe6f18c787f633d5262fac2e676f";
-            hash = "sha256-iH5OBwtmJLHao2PhxKT8w+vGlFE0D2R/ry8j9nZs+TQ=";
+            rev = "86b69c31642e98f8357df62c09d118ad1da4e16a";
+            hash = "sha256-W/mQwaLGx6/rIBjdzUTIbWrvGjdh7m4s15f70fQ1/hE=";
           };
+          pname = "triton-llvm-rocm";
           patches = [ ]; # FIXME: https://github.com/llvm/llvm-project//commit/84837e3cc1cf17ed71580e3ea38299ed2bfaa5f6.patch doesn't apply, may need to rebase
         };
 
@@ -579,30 +593,38 @@ lib.makeScope newScope (
         llvm = self.triton-llvm;
       })).overridePythonAttrs
         (old: {
-
+          doCheck = false;
           stdenv = self.llvm.rocmClangStdenv;
           version = "3.2.0";
           src = fetchFromGitHub {
             owner = "triton-lang";
             repo = "triton";
-            rev = "release/3.2.x";
-            hash = "sha256-cC2eARYcmZqLrzwlmMi92xkEqpGMn2d9IndZQBoGE7Q=";
+            rev = "64b80f0916b69e3c4d0682a2368fd126e57891ab"; # "release/3.2.x";
+            hash = "sha256-xQOgMLHruVrI/9FtY3TvZKALitMOfqZ69uOyrYhXhu8=";
           };
           buildInputs = old.buildInputs ++ [
             self.clr
           ];
           dontStrip = true;
           env = old.env // {
-            CXXFLAGS = "-gz -g1 -O3 -I${self.clr}/include -I/build/source/third_party/triton/third_party/nvidia/backend/include -I${cudaPackages.cudatoolkit}/include";
+            CXXFLAGS = "-gz -g1 -O3 -I${self.clr}/include -I/build/source/third_party/triton/third_party/nvidia/backend/include";
+            TRITON_OFFLINE_BUILD = 1;
           };
-          # TRITON_BUILD_PROTON = "OFF"; # disable profiler, instead of --replace-fail 'packages += ["triton/profiler"]' ""\
           patches = [ ];
           postPatch = ''
             # Need an empty cuda.h to happily compile for ROCm
+            mkdir -p third_party/nvidia/include/ third_party/nvidia/include/backend/include/
             echo "" > third_party/nvidia/include/cuda.h
-
-            mkdir third_party/nvidia/backend/include/
-            cp ${cudaPackages.cudatoolkit}/include/*.h third_party/nvidia/backend/include/
+            touch third_party/nvidia/include/backend/include/{cuda,driver_types}.h
+            rm -rf third_party/nvidia
+            substituteInPlace CMakeLists.txt \
+              --replace-fail "add_subdirectory(test)" ""
+            sed -i '/nvidia\|NVGPU\|registerConvertTritonGPUToLLVMPass\|mlir::test::/Id' bin/RegisterTritonDialects.h
+            sed -i '/TritonTestAnalysis/Id' bin/CMakeLists.txt
+            substituteInPlace python/setup.py \
+              --replace-fail 'backends = [*BackendInstaller.copy(["nvidia", "amd"]), *BackendInstaller.copy_externals()]' \
+              'backends = [*BackendInstaller.copy(["amd"]), *BackendInstaller.copy_externals()]'
+            #cp ''${cudaPackages.cuda_cudart}/include/*.h third_party/nvidia/backend/include/
             find . -type f -exec sed -i 's|[<]cupti.h[>]|"cupti.h"|g' {} +
             find . -type f -exec sed -i 's|[<]cuda.h[>]|"cuda.h"|g' {} +
 
diff --git a/pkgs/development/rocm-modules/6/hipblas/default.nix b/pkgs/development/rocm-modules/6/hipblas/default.nix
index 3f986b6d04b91e..b244d4934fdc60 100644
--- a/pkgs/development/rocm-modules/6/hipblas/default.nix
+++ b/pkgs/development/rocm-modules/6/hipblas/default.nix
@@ -24,8 +24,6 @@
 stdenv.mkDerivation (finalAttrs: {
   pname = "hipblas";
   version = "6.3.1";
-  env.NIX_DEBUG = 1;
-  env.NIX_DISABLE_WRAPPER_INCLUDES = 1;
 
   outputs =
     [
diff --git a/pkgs/development/rocm-modules/6/hipblaslt/default.nix b/pkgs/development/rocm-modules/6/hipblaslt/default.nix
index a00ad3bb269ce7..978a8592a8b4dc 100644
--- a/pkgs/development/rocm-modules/6/hipblaslt/default.nix
+++ b/pkgs/development/rocm-modules/6/hipblaslt/default.nix
@@ -25,7 +25,16 @@
   buildTests ? false,
   buildBenchmarks ? false,
   buildSamples ? false,
-  gpuTargets ? [ "gfx908" ], # [  ]
+  # hipblaslt supports only devices with MFMA or WMMA
+  # WMMA on gfx1100 may be broken
+  # MFMA on MI100 may be broken
+  # MI200/MI300 known to work
+  gpuTargets ? [
+    "gfx908"
+    "gfx90a"
+    "gfx942"
+    "gfx1100"
+  ],
 }:
 
 stdenv.mkDerivation (
@@ -62,14 +71,11 @@ stdenv.mkDerivation (
     env.CXXFLAGS = cFlags;
     env.ROCM_PATH = "${clr}";
     env.TENSILE_ROCM_ASSEMBLER_PATH = "${clang-sysrooted}/bin/clang++";
-    env.NIX_CC_USE_RESPONSE_FILE = 0;
-    env.NIX_DISABLE_WRAPPER_INCLUDES = 1;
     env.TENSILE_GEN_ASSEMBLY_TOOLCHAIN = "${clang-sysrooted}/bin/clang++";
     requiredSystemFeatures = [ "big-parallel" ];
 
     patches = [
       ./ext-op-first.diff
-      # ./alpha_1_init_fix.patch # libcxx bug workaround -
     ];
 
     outputs =
diff --git a/pkgs/development/rocm-modules/6/miopen/default.nix b/pkgs/development/rocm-modules/6/miopen/default.nix
index 7cf9455a423b7a..fe0ee8adadefdd 100644
--- a/pkgs/development/rocm-modules/6/miopen/default.nix
+++ b/pkgs/development/rocm-modules/6/miopen/default.nix
@@ -39,6 +39,8 @@
     "gfx908"
     "gfx90a"
     "gfx942"
+    "gfx1030"
+    "gfx1100"
   ], # clr.gpuTargets
   buildDocs ? false, # Needs internet because of rocm-docs-core
   buildTests ? false,
diff --git a/pkgs/development/rocm-modules/6/mscclpp/default.nix b/pkgs/development/rocm-modules/6/mscclpp/default.nix
index f734fda8c4f426..e1ac56b7683923 100644
--- a/pkgs/development/rocm-modules/6/mscclpp/default.nix
+++ b/pkgs/development/rocm-modules/6/mscclpp/default.nix
@@ -15,26 +15,22 @@ stdenv.mkDerivation {
   buildInputs = [
     clr
     numactl
-    #nlohmann_json
-    #python3Packages.nanobind
   ];
   postPatch = ''
     substituteInPlace CMakeLists.txt \
       --replace-fail "gfx90a gfx941 gfx942" "gfx908 gfx90a gfx942 gfx1030 gfx1100"
   '';
   cmakeFlags = [
-    #"--trace"
     "-DMSCCLPP_BYPASS_GPU_CHECK=ON"
     "-DMSCCLPP_USE_ROCM=ON"
     "-DMSCCLPP_BUILD_TESTS=OFF"
+    "-DGPU_TARGETS=gfx908;gfx90a;gfx942;gfx1030;gfx1100"
     "-DAMDGPU_TARGETS=gfx908;gfx90a;gfx942;gfx1030;gfx1100"
     "-DMSCCLPP_BUILD_APPS_NCCL=ON"
     "-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF"
     "-DFETCHCONTENT_QUIET=OFF"
     "-DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=ALWAYS"
-    #"-DFETCHCONTENT_SOURCE_DIR_NANOBIND=${nanobind_src}"
     "-DFETCHCONTENT_SOURCE_DIR_JSON=${nlohmann_json.src}"
-    #"-DFETCHCONTENT_FULLY_DISCONNECTED=ON"
   ];
   env.ROCM_PATH = clr;
   src = fetchFromGitHub {
diff --git a/pkgs/development/rocm-modules/6/rccl/default.nix b/pkgs/development/rocm-modules/6/rccl/default.nix
index f9b3e542aa81a4..f3087377d90249 100644
--- a/pkgs/development/rocm-modules/6/rccl/default.nix
+++ b/pkgs/development/rocm-modules/6/rccl/default.nix
@@ -84,7 +84,7 @@ stdenv.mkDerivation (finalAttrs: {
       "-DROCM_PATH=${clr}"
       "-DHIP_COMPILER=${clr}/bin/amdclang++"
       "-DCMAKE_CXX_COMPILER=${clr}/bin/amdclang++"
-      "-DROCM_PATCH_VERSION=${rocm-core.ROCM_LIBPATCH_VERSION}" # FIXME: get from versin
+      "-DROCM_PATCH_VERSION=${rocm-core.ROCM_LIBPATCH_VERSION}"
       "-DROCM_VERSION=${rocm-core.ROCM_LIBPATCH_VERSION}"
       "-DBUILD_BFD=OFF" # Can't get it to detect bfd.h
       "-DENABLE_MSCCL_KERNEL=ON"
@@ -107,8 +107,8 @@ stdenv.mkDerivation (finalAttrs: {
   makeFlags = [ "-l32" ];
 
   env.CCC_OVERRIDE_OPTIONS = "+-parallel-jobs=6";
-  env.CFLAGS = "-I${clr}/include -O2 -fno-strict-aliasing -gz -g1 ${san}-fno-omit-frame-pointer -momit-leaf-frame-pointer -DROCM_VERSION=60300";
-  env.CXXFLAGS = "-I${clr}/include -O2 -fno-strict-aliasing -gz -g1 ${san}-fno-omit-frame-pointer -momit-leaf-frame-pointer -DROCM_VERSION=60300";
+  env.CFLAGS = "-I${clr}/include -O2 -fno-strict-aliasing -gz -g1 ${san}-fno-omit-frame-pointer -momit-leaf-frame-pointer";
+  env.CXXFLAGS = "-I${clr}/include -O2 -fno-strict-aliasing -gz -g1 ${san}-fno-omit-frame-pointer -momit-leaf-frame-pointer";
   env.LDFLAGS = "${san}";
   postPatch = ''
     patchShebangs src tools
diff --git a/pkgs/development/rocm-modules/6/rocblas/default.nix b/pkgs/development/rocm-modules/6/rocblas/default.nix
index ee5fbc79b686cb..da2ba1c77d2ced 100644
--- a/pkgs/development/rocm-modules/6/rocblas/default.nix
+++ b/pkgs/development/rocm-modules/6/rocblas/default.nix
@@ -25,8 +25,6 @@
   buildTensile ? true,
   buildTests ? true,
   buildBenchmarks ? true,
-  #, tensileLogic ? "asm_full"
-  tensileCOVersion ? "default",
   # https://github.com/ROCm/Tensile/issues/1757
   # Allows gfx101* users to use rocBLAS normally.
   # Turn the below two values to `true` after the fix has been cherry-picked
@@ -42,16 +40,18 @@
   # would force all `gfx101*` GPUs to run as `gfx1010`, so `gfx101*` GPUs will
   # always try to use `gfx1010` code objects, hence building for `gfx1012` is
   # useless: https://github.com/NixOS/nixpkgs/pull/298388#issuecomment-2076327152
-  # , gpuTargets ? [ "gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx942;gfx1010;gfx1030;gfx1100;gfx1101;gfx1102" ]
-  #, gpuTargets ? [ "gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1100;gfx1101;gfx1102" ]
-  #, gpuTargets ? [ "gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1030" ]
   gpuTargets ? [
+    "gfx900"
+    "gfx906"
     "gfx908"
     "gfx90a"
     "gfx942"
+    "gfx1010"
     "gfx1030"
     "gfx1100"
-  ], # "gfx1030" "gfx1100" ]
+    "gfx1101"
+    "gfx1102"
+  ],
 }:
 
 # FIXME: this derivation is ludicrously large, split into arch-specific derivations and symlink together?
@@ -125,7 +125,6 @@ stdenv.mkDerivation (finalAttrs: {
   env.LDFLAGS = lib.optionalString (
     buildTests || buildBenchmarks
   ) "-Wl,--as-needed -L${amd-blis}/lib -lblis-mt -lcblas";
-  env.NIX_DISABLE_WRAPPER_INCLUDES = 1;
   env.TENSILE_ROCM_ASSEMBLER_PATH = "${clang-sysrooted}/bin/clang++";
 
   cmakeFlags =
@@ -156,30 +155,18 @@ stdenv.mkDerivation (finalAttrs: {
       "-DCMAKE_INSTALL_LIBDIR=lib"
     ]
     ++ lib.optionals buildTensile [
-      #"        -DCMAKE_PREFIX_PATH="${DEPS_DIR};${ROCM_PATH}" \
       "-DCPACK_SET_DESTDIR=OFF"
       "-DLINK_BLIS=ON"
       "-DTensile_CODE_OBJECT_VERSION=default"
       "-DTensile_LOGIC=asm_full"
-      # "-DTensile_LOGIC=hip_lite"
-      #"-DTensile_SEPARATE_ARCHITECTURES=ON"
-      #"-DTensile_LAZY_LIBRARY_LOADING=ON"
       "-DTensile_LIBRARY_FORMAT=msgpack"
       (lib.cmakeBool "BUILD_WITH_PIP" false)
-      # "-DTensile_COMPILER=hipcc"
-      # "-DTensile_CODE_OBJECT_VERSION=V4"
-      # "-DTensile_LOGIC=hip_lite"
-      #(lib.cmakeFeature "Tensile_LOGIC" tensileLogic)
-      #(lib.cmakeFeature "Tensile_CODE_OBJECT_VERSION" tensileCOVersion)
       (lib.cmakeBool "Tensile_SEPARATE_ARCHITECTURES" tensileSepArch)
       (lib.cmakeBool "Tensile_LAZY_LIBRARY_LOADING" tensileLazyLib)
-      #(lib.cmakeBool "Tensile_PRINT_DEBUG" true)
-      #"-DTENSILE_GPU_ARCHS=gfx908"
-      #"-DTensile_VERBOSE=2"
     ];
 
   preConfigure = ''
-    makeFlagsArray+=("-l$((NIX_BUILD_CORES / 2))")
+    makeFlagsArray+=("-l$(nproc)")
   '';
 
   passthru.amdgpu_targets = gpuTargets';
diff --git a/pkgs/development/rocm-modules/6/rocfft/default.nix b/pkgs/development/rocm-modules/6/rocfft/default.nix
index bd5eddfd6c4739..9378430176e09d 100644
--- a/pkgs/development/rocm-modules/6/rocfft/default.nix
+++ b/pkgs/development/rocm-modules/6/rocfft/default.nix
@@ -14,9 +14,7 @@
   gtest,
   openmp,
   rocrand,
-  gpuTargets ? [
-    "gfx908;gfx1030;gfx1100"
-  ],
+  gpuTargets ? clr.gpuTargets,
 }:
 
 stdenv.mkDerivation (finalAttrs: {
diff --git a/pkgs/development/rocm-modules/6/rocm-core/default.nix b/pkgs/development/rocm-modules/6/rocm-core/default.nix
index 13edeaa29fc1d9..e71205db3abba3 100644
--- a/pkgs/development/rocm-modules/6/rocm-core/default.nix
+++ b/pkgs/development/rocm-modules/6/rocm-core/default.nix
@@ -20,7 +20,7 @@ stdenv.mkDerivation (finalAttrs: {
 
   nativeBuildInputs = [ cmake ];
   # FIXME: What's the correct way to set this?
-  env.ROCM_LIBPATCH_VERSION = "60300";
+  env.ROCM_LIBPATCH_VERSION = "${lib.versions.major finalAttrs.version}${lib.fixedWidthString 2 "0" (lib.versions.minor finalAttrs.version)}${lib.fixedWidthString 2 "0" (lib.versions.patch finalAttrs.version)}"; # major, then minor and patch each zero-padded to two digits (6.3.1 -> 60301)
   env.BUILD_ID = "nixos-${finalAttrs.env.ROCM_LIBPATCH_VERSION}";
   env.ROCM_BUILD_ID = "release-${finalAttrs.env.BUILD_ID}";
   cmakeFlags = [
diff --git a/pkgs/development/rocm-modules/6/rocm-device-libs/default.nix b/pkgs/development/rocm-modules/6/rocm-device-libs/default.nix
index d0a7c40ea9f1f3..58e3d1c75a82f0 100644
--- a/pkgs/development/rocm-modules/6/rocm-device-libs/default.nix
+++ b/pkgs/development/rocm-modules/6/rocm-device-libs/default.nix
@@ -49,11 +49,6 @@ stdenv.mkDerivation (finalAttrs: {
   dontStrip = true;
   env.CFLAGS = "-g1 -gz";
   env.CXXFLAGS = "-g1 -gz";
-  # env.NIX_DEBUG = 1;
-  # env.CFLAGS = "-g1 -fsanitize=undefined";
-  # env.CXXFLAGS = "-g1 -fsanitize=undefined";
-  # env.NIX_CFLAGS_COMPILE = "-g1";
-  # env.NIX_CXXFLAGS_COMPILE = "-g1";
 
   cmakeFlags = [
     "-DCMAKE_RELEASE_TYPE=Release"
diff --git a/pkgs/development/rocm-modules/6/rocsolver/default.nix b/pkgs/development/rocm-modules/6/rocsolver/default.nix
index 0133d4bb40bdde..53a5d7b71c1101 100644
--- a/pkgs/development/rocm-modules/6/rocsolver/default.nix
+++ b/pkgs/development/rocm-modules/6/rocsolver/default.nix
@@ -15,7 +15,6 @@
   lapack-reference,
   buildTests ? false,
   buildBenchmarks ? false,
-  #, gpuTargets ? ["gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101"]
   gpuTargets ? [ ],
 }:
 
diff --git a/pkgs/development/rocm-modules/6/rocsparse/default.nix b/pkgs/development/rocm-modules/6/rocsparse/default.nix
index cc62bcbac924b4..b27380a305fca0 100644
--- a/pkgs/development/rocm-modules/6/rocsparse/default.nix
+++ b/pkgs/development/rocm-modules/6/rocsparse/default.nix
@@ -15,8 +15,7 @@
   python3Packages,
   buildTests ? false,
   buildBenchmarks ? false, # Seems to depend on tests
-  #, gpuTargets ? ["gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101"]
-  gpuTargets ? [ "gfx908;gfx1030;gfx1100" ],
+  gpuTargets ? clr.gpuTargets,
 }:
 
 stdenv.mkDerivation (finalAttrs: {
diff --git a/pkgs/top-level/stage.nix b/pkgs/top-level/stage.nix
index 1cedd8dd184587..6d8482e4f14058 100644
--- a/pkgs/top-level/stage.nix
+++ b/pkgs/top-level/stage.nix
@@ -321,6 +321,15 @@ let
       };
     });
 
+    # Full package set evaluated with ROCm support enabled and CUDA support disabled.
+    # Mostly useful for asserting pkgs.pkgsRocm.torchWithRocm == pkgs.torchWithRocm and similar
+    pkgsRocm = nixpkgsFun {
+      config = super.config // {
+        rocmSupport = true;
+        cudaSupport = false;
+      };
+    };
+
     pkgsExtraHardening = nixpkgsFun {
       overlays = [
         (self': super': {