Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU vector shuffle lowering does not make use of v_pk_mov_b32 #122243

Open
arsenm opened this issue Jan 9, 2025 · 1 comment
Open

AMDGPU vector shuffle lowering does not make use of v_pk_mov_b32 #122243

arsenm opened this issue Jan 9, 2025 · 1 comment

Comments

@arsenm
Copy link
Contributor

arsenm commented Jan 9, 2025

Vector shuffles (and all other vector operations) are still lowered by decomposing them into copies of 32-bit pieces, even on targets that support v_pk_mov_b32:

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s

; Mask <2, 3, 6, 7>: elements 2-3 of the first loaded vector followed by
; elements 2-3 of the second (indices 4-7 address the second operand).
; Reported as not using v_pk_mov_b32.
define <4 x float> @shuffle_v4f32_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <4 x float>, ptr addrspace(1) %arg0
  %rhs = load <4 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <4 x float> %lhs, <4 x float> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x float> %picked
}

; 64-bit element variant of the <2, 3, 6, 7> shuffle: elements 2-3 of the first
; loaded vector followed by elements 2-3 of the second.
; Reported as not using v_pk_mov_b32.
define <4 x double> @shuffle_v4f64_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <4 x double>, ptr addrspace(1) %arg0
  %rhs = load <4 x double>, ptr addrspace(1) %arg1
  %picked = shufflevector <4 x double> %lhs, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x double> %picked
}

; Mask <2, 3, 4, 5, 8, 9, 10, 11>: elements 2-5 of the first loaded vector
; followed by elements 0-3 of the second (indices 8-15 address the second
; operand).
; NOTE(review): the name says v4f16, but the operands and result are <8 x half>;
; the name is kept unchanged.
define <8 x half> @shuffle_v4f16_2345_891011(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <8 x half>, ptr addrspace(1) %arg0
  %rhs = load <8 x half>, ptr addrspace(1) %arg1
  %picked = shufflevector <8 x half> %lhs, <8 x half> %rhs, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11>
  ret <8 x half> %picked
}

All of these cases are currently code-generated as pairs of v_mov_b32, but each pair could be compressed into a single v_pk_mov_b32 on gfx90a+.

The 2-element 32-bit shuffles could be made directly legal:

; Mask <0, 0>: broadcasts element 0 of the first loaded vector into both result
; elements; the second loaded vector is not selected by the mask.
define <2 x float> @shuffle_v2f32_00(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 0, i32 0>
  ret <2 x float> %picked
}

; Mask <0, 1>: identity shuffle of the first loaded vector; the second loaded
; vector is not selected by the mask.
define <2 x float> @shuffle_v2f32_01(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 0, i32 1>
  ret <2 x float> %picked
}

; Mask <0, 2>: element 0 of the first loaded vector, then element 0 of the
; second (index 2 addresses the second operand).
define <2 x float> @shuffle_v2f32_02(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 0, i32 2>
  ret <2 x float> %picked
}

; Mask <0, 3>: element 0 of the first loaded vector, then element 1 of the
; second (index 3 addresses the second operand).
define <2 x float> @shuffle_v2f32_03(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 0, i32 3>
  ret <2 x float> %picked
}

; Mask <1, 0>: swaps the two elements of the first loaded vector; the second
; loaded vector is not selected by the mask.
define <2 x float> @shuffle_v2f32_10(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 1, i32 0>
  ret <2 x float> %picked
}

; Mask <1, 1>: broadcasts element 1 of the first loaded vector into both result
; elements; the second loaded vector is not selected by the mask.
define <2 x float> @shuffle_v2f32_11(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 1, i32 1>
  ret <2 x float> %picked
}

; Mask <1, 2>: element 1 of the first loaded vector, then element 0 of the
; second (index 2 addresses the second operand).
define <2 x float> @shuffle_v2f32_12(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 1, i32 2>
  ret <2 x float> %picked
}

; Mask <1, 3>: element 1 of the first loaded vector, then element 1 of the
; second (index 3 addresses the second operand).
; NOTE(review): the neighbouring tests are named after their mask digits, so
; this one was presumably meant to be shuffle_v2f32_13; the name is kept
; unchanged.
define <2 x float> @shuffle_v2f32_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 1, i32 3>
  ret <2 x float> %picked
}

; Mask <2, 0>: element 0 of the second loaded vector (index 2), then element 0
; of the first.
define <2 x float> @shuffle_v2f32_20(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 2, i32 0>
  ret <2 x float> %picked
}

; Mask <2, 1>: element 0 of the second loaded vector (index 2), then element 1
; of the first.
define <2 x float> @shuffle_v2f32_21(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 2, i32 1>
  ret <2 x float> %picked
}

; Mask <2, 2>: broadcasts element 0 of the second loaded vector (index 2) into
; both result elements; the first loaded vector is not selected by the mask.
define <2 x float> @shuffle_v2f32_22(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 2, i32 2>
  ret <2 x float> %picked
}

; Mask <2, 3>: identity shuffle of the second loaded vector (indices 2-3); the
; first loaded vector is not selected by the mask.
define <2 x float> @shuffle_v2f32_23(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 2, i32 3>
  ret <2 x float> %picked
}

; Mask <3, 0>: element 1 of the second loaded vector (index 3), then element 0
; of the first.
define <2 x float> @shuffle_v2f32_30(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 3, i32 0>
  ret <2 x float> %picked
}

; Mask <3, 1>: element 1 of the second loaded vector (index 3), then element 1
; of the first.
define <2 x float> @shuffle_v2f32_31(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 3, i32 1>
  ret <2 x float> %picked
}

; Mask <3, 2>: swaps the two elements of the second loaded vector (indices 3
; and 2); the first loaded vector is not selected by the mask.
define <2 x float> @shuffle_v2f32_32(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 3, i32 2>
  ret <2 x float> %picked
}

; Mask <3, 3>: broadcasts element 1 of the second loaded vector (index 3) into
; both result elements; the first loaded vector is not selected by the mask.
define <2 x float> @shuffle_v2f32_33(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %lhs = load <2 x float>, ptr addrspace(1) %arg0
  %rhs = load <2 x float>, ptr addrspace(1) %arg1
  %picked = shufflevector <2 x float> %lhs, <2 x float> %rhs, <2 x i32> <i32 3, i32 3>
  ret <2 x float> %picked
}

@llvmbot
Copy link
Member

llvmbot commented Jan 9, 2025

@llvm/issue-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Vector shuffles (and all other vector operations) are still lowered by decomposing them into copies of 32-bit pieces, even on targets that support v_pk_mov_b32:
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s

; Not using v_pk_mov_b32
define <4 x float> @shuffle_v4f32_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %val0 = load <4 x float>, ptr addrspace(1) %arg0
  %val1 = load <4 x float>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x float> %val0, <4 x float> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x float> %shuffle
}

; Not using v_pk_mov_b32
define <4 x double> @shuffle_v4f64_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %val0 = load <4 x double>, ptr addrspace(1) %arg0
  %val1 = load <4 x double>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x double> %val0, <4 x double> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x double> %shuffle
}

define <8 x half> @shuffle_v4f16_2345_891011(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
  %val0 = load <8 x half>, ptr addrspace(1) %arg0
  %val1 = load <8 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11>
  ret <8 x half> %shuffle
}

All of these cases are currently code-generated as pairs of v_mov_b32, but each pair could be compressed into a single v_pk_mov_b32 on gfx90a+.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

2 participants