AMDGPU vector shuffle lowering does not make use of v_pk_mov_b32 #122243

arsenm · 2025-01-09T10:27:20Z

Vector shuffles (and all other vector operations) are still lowered by decomposing them into copies of 32-bit pieces, even on targets with support for v_pk_mov_b32:

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s

; Not using v_pk_mov_b32
; Mask <2, 3, 6, 7>: result = { a[2], a[3], b[2], b[3] }, i.e. the upper
; two elements of each loaded <4 x float> input.
define <4 x float> @shuffle_v4f32_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <4 x float>, ptr addrspace(1) %arg0
  %b = load <4 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x float> %res
}

; Not using v_pk_mov_b32
; Mask <2, 3, 6, 7>: result = { a[2], a[3], b[2], b[3] }, i.e. the upper
; two elements of each loaded <4 x double> input.
define <4 x double> @shuffle_v4f64_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <4 x double>, ptr addrspace(1) %arg0
  %b = load <4 x double>, ptr addrspace(1) %arg1
  %res = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x double> %res
}

; Mask <2, 3, 4, 5, 8, 9, 10, 11>:
;   result = { a[2], a[3], a[4], a[5], b[0], b[1], b[2], b[3] }
; NOTE(review): the name says v4f16 but the operands are <8 x half>;
; presumably v8f16 was intended -- confirm before renaming.
define <8 x half> @shuffle_v4f16_2345_891011(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <8 x half>, ptr addrspace(1) %arg0
  %b = load <8 x half>, ptr addrspace(1) %arg1
  %res = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11>
  ret <8 x half> %res
}

All of these cases currently codegen to pairs of v_mov_b32, but each pair could be compressed into a single v_pk_mov_b32 on gfx90a+.

The 2-element 32-bit shuffles could be made directly legal:

; Mask <0, 0>: result = { a[0], a[0] }. The second input is loaded but
; not selected by the mask.
define <2 x float> @shuffle_v2f32_00(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 0>
  ret <2 x float> %res
}

; Mask <0, 1>: result = { a[0], a[1] }, i.e. the first input unchanged.
; The second input is loaded but not selected by the mask.
define <2 x float> @shuffle_v2f32_01(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 1>
  ret <2 x float> %res
}

; Mask <0, 2>: result = { a[0], b[0] }, the low element of each input.
define <2 x float> @shuffle_v2f32_02(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
  ret <2 x float> %res
}

; Mask <0, 3>: result = { a[0], b[1] }.
define <2 x float> @shuffle_v2f32_03(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x float> %res
}

; Mask <1, 0>: result = { a[1], a[0] }, i.e. the first input with its
; elements swapped. The second input is loaded but not selected.
define <2 x float> @shuffle_v2f32_10(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 0>
  ret <2 x float> %res
}

; Mask <1, 1>: result = { a[1], a[1] }. The second input is loaded but
; not selected by the mask.
define <2 x float> @shuffle_v2f32_11(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 1>
  ret <2 x float> %res
}

; Mask <1, 2>: result = { a[1], b[0] }.
define <2 x float> @shuffle_v2f32_12(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 2>
  ret <2 x float> %res
}

; Mask <1, 3>: result = { a[1], b[1] }, the high element of each input.
; NOTE(review): sibling functions are named after both mask indices, so
; this one was presumably meant to be shuffle_v2f32_13 -- confirm before
; renaming.
define <2 x float> @shuffle_v2f32_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
  ret <2 x float> %res
}

; Mask <2, 0>: result = { b[0], a[0] }.
define <2 x float> @shuffle_v2f32_20(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 2, i32 0>
  ret <2 x float> %res
}

; Mask <2, 1>: result = { b[0], a[1] }.
define <2 x float> @shuffle_v2f32_21(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 2, i32 1>
  ret <2 x float> %res
}

; Mask <2, 2>: result = { b[0], b[0] }. The first input is loaded but
; not selected by the mask.
define <2 x float> @shuffle_v2f32_22(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 2, i32 2>
  ret <2 x float> %res
}

; Mask <2, 3>: result = { b[0], b[1] }, i.e. the second input unchanged.
; The first input is loaded but not selected by the mask.
define <2 x float> @shuffle_v2f32_23(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 2, i32 3>
  ret <2 x float> %res
}

; Mask <3, 0>: result = { b[1], a[0] }.
define <2 x float> @shuffle_v2f32_30(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 3, i32 0>
  ret <2 x float> %res
}

; Mask <3, 1>: result = { b[1], a[1] }.
define <2 x float> @shuffle_v2f32_31(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 3, i32 1>
  ret <2 x float> %res
}

; Mask <3, 2>: result = { b[1], b[0] }, i.e. the second input with its
; elements swapped. The first input is loaded but not selected.
define <2 x float> @shuffle_v2f32_32(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 3, i32 2>
  ret <2 x float> %res
}

; Mask <3, 3>: result = { b[1], b[1] }. The first input is loaded but
; not selected by the mask.
define <2 x float> @shuffle_v2f32_33(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %a = load <2 x float>, ptr addrspace(1) %arg0
  %b = load <2 x float>, ptr addrspace(1) %arg1
  %res = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 3, i32 3>
  ret <2 x float> %res
}

The text was updated successfully, but these errors were encountered:

llvmbot · 2025-01-09T10:27:34Z

@llvm/issue-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Vector shuffles (and all other vector operations) are still lowered by decomposing them into copies of 32-bit pieces, even on targets with support for v_pk_mov_b32:

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s

; Not using v_pk_mov_b32
define <4 x float> @shuffle_v4f32_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %val0 = load <4 x float>, ptr addrspace(1) %arg0
  %val1 = load <4 x float>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x float> %val0, <4 x float> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x float> %shuffle
}

; Not using v_pk_mov_b32
define <4 x double> @shuffle_v4f64_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %val0 = load <4 x double>, ptr addrspace(1) %arg0
  %val1 = load <4 x double>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x double> %val0, <4 x double> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x double> %shuffle
}

define <8 x half> @shuffle_v4f16_2345_891011(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %val0 = load <8 x half>, ptr addrspace(1) %arg0
  %val1 = load <8 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11>
  ret <8 x half> %shuffle
}

All of these cases currently codegen to pairs of v_mov_b32, but each pair could be compressed into a single v_pk_mov_b32 on gfx90a+.

arsenm added backend:AMDGPU missed-optimization labels Jan 9, 2025

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

AMDGPU vector shuffle lowering does not make use of v_pk_mov_b32 #122243

AMDGPU vector shuffle lowering does not make use of v_pk_mov_b32 #122243

arsenm commented Jan 9, 2025 •

edited

Loading

llvmbot commented Jan 9, 2025

AMDGPU vector shuffle lowering does not make use of v_pk_mov_b32 #122243

AMDGPU vector shuffle lowering does not make use of v_pk_mov_b32 #122243

Comments

arsenm commented Jan 9, 2025 • edited Loading

llvmbot commented Jan 9, 2025

arsenm commented Jan 9, 2025 •

edited

Loading