xref: /llvm-project/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll (revision 3277c7cd28154e33637a168acb26cea7ac1f7fff)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
3
4define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
5; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
6; GFX12:       ; %bb.0: ; %bb
7; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
8; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
9; GFX12-NEXT:    s_endpgm
10bb:
11  %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
12  store <4 x float> %res, ptr addrspace(1) %out
13  ret void
14}
15
16define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
17; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
18; GFX12:       ; %bb.0: ; %bb
19; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
20; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
21; GFX12-NEXT:    v_mov_b32_e32 v7, v6
22; GFX12-NEXT:    v_mov_b32_e32 v8, v6
23; GFX12-NEXT:    v_mov_b32_e32 v9, v6
24; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
25; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
26; GFX12-NEXT:    s_endpgm
27bb:
28  %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
29  store <4 x float> %res, ptr addrspace(1) %out
30  ret void
31}
32
33define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
34; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
35; GFX12:       ; %bb.0: ; %bb
36; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
37; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
38; GFX12-NEXT:    s_endpgm
39bb:
40  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
41  store <4 x float> %res, ptr addrspace(1) %out
42  ret void
43}
44
45define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
46; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
47; GFX12:       ; %bb.0: ; %bb
48; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
49; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
50; GFX12-NEXT:    v_mov_b32_e32 v7, v6
51; GFX12-NEXT:    v_mov_b32_e32 v8, v6
52; GFX12-NEXT:    v_mov_b32_e32 v9, v6
53; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
54; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
55; GFX12-NEXT:    s_endpgm
56bb:
57  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
58  store <4 x float> %res, ptr addrspace(1) %out
59  ret void
60}
61
62define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
63; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
64; GFX12:       ; %bb.0: ; %bb
65; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
66; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
67; GFX12-NEXT:    s_endpgm
68bb:
69  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
70  store <4 x half> %res, ptr addrspace(1) %out
71  ret void
72}
73
74define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
75; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
76; GFX12:       ; %bb.0: ; %bb
77; GFX12-NEXT:    v_mov_b32_e32 v6, 0x42004200
78; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
79; GFX12-NEXT:    v_mov_b32_e32 v7, v6
80; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
81; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
82; GFX12-NEXT:    s_endpgm
83bb:
84  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
85  store <4 x half> %res, ptr addrspace(1) %out
86  ret void
87}
88
89define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
90; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
91; GFX12:       ; %bb.0: ; %bb
92; GFX12-NEXT:    v_mov_b32_e32 v6, 0x3f803f80
93; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
94; GFX12-NEXT:    v_mov_b32_e32 v7, v6
95; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
96; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
97; GFX12-NEXT:    s_endpgm
98bb:
99  %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
100  store <4 x i16> %res, ptr addrspace(1) %out
101  ret void
102}
103
104define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
105; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
106; GFX12:       ; %bb.0: ; %bb
107; GFX12-NEXT:    v_mov_b32_e32 v6, 0x3fc03fc0
108; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
109; GFX12-NEXT:    v_mov_b32_e32 v7, v6
110; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
111; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
112; GFX12-NEXT:    s_endpgm
113bb:
114  %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
115  store <4 x i16> %res, ptr addrspace(1) %out
116  ret void
117}
118
119define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
120; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
121; GFX12:       ; %bb.0: ; %bb
122; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
123; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
124; GFX12-NEXT:    s_endpgm
125bb:
126  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
127  store <4 x i32> %res, ptr addrspace(1) %out
128  ret void
129}
130
131define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
132; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
133; GFX12:       ; %bb.0: ; %bb
134; GFX12-NEXT:    v_mov_b32_e32 v4, 0x80
135; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
136; GFX12-NEXT:    v_mov_b32_e32 v5, v4
137; GFX12-NEXT:    v_mov_b32_e32 v6, v4
138; GFX12-NEXT:    v_mov_b32_e32 v7, v4
139; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
140; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
141; GFX12-NEXT:    s_endpgm
142bb:
143  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
144  store <4 x i32> %res, ptr addrspace(1) %out
145  ret void
146}
147
148define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
149; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
150; GFX12:       ; %bb.0: ; %bb
151; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
152; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
153; GFX12-NEXT:    s_endpgm
154bb:
155  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
156  store <4 x i32> %res, ptr addrspace(1) %out
157  ret void
158}
159
160define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
161; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
162; GFX12:       ; %bb.0: ; %bb
163; GFX12-NEXT:    v_mov_b32_e32 v4, 0x80
164; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
165; GFX12-NEXT:    v_mov_b32_e32 v5, v4
166; GFX12-NEXT:    v_mov_b32_e32 v6, v4
167; GFX12-NEXT:    v_mov_b32_e32 v7, v4
168; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
169; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
170; GFX12-NEXT:    s_endpgm
171bb:
172  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
173  store <4 x i32> %res, ptr addrspace(1) %out
174  ret void
175}
176
177define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
178; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
179; GFX12:       ; %bb.0: ; %bb
180; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
181; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
182; GFX12-NEXT:    s_endpgm
183bb:
184  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
185  store <4 x float> %res, ptr addrspace(1) %out
186  ret void
187}
188
189define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
190; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
191; GFX12:       ; %bb.0: ; %bb
192; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
193; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
194; GFX12-NEXT:    v_mov_b32_e32 v5, v4
195; GFX12-NEXT:    v_mov_b32_e32 v6, v4
196; GFX12-NEXT:    v_mov_b32_e32 v7, v4
197; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
198; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
199; GFX12-NEXT:    s_endpgm
200bb:
201  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
202  store <4 x float> %res, ptr addrspace(1) %out
203  ret void
204}
205
206define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
207; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
208; GFX12:       ; %bb.0: ; %bb
209; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
210; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
211; GFX12-NEXT:    s_endpgm
212bb:
213  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
214  store <4 x float> %res, ptr addrspace(1) %out
215  ret void
216}
217
218define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
219; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
220; GFX12:       ; %bb.0: ; %bb
221; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
222; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
223; GFX12-NEXT:    v_mov_b32_e32 v5, v4
224; GFX12-NEXT:    v_mov_b32_e32 v6, v4
225; GFX12-NEXT:    v_mov_b32_e32 v7, v4
226; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
227; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
228; GFX12-NEXT:    s_endpgm
229bb:
230  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
231  store <4 x float> %res, ptr addrspace(1) %out
232  ret void
233}
234
235define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
236; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
237; GFX12:       ; %bb.0: ; %bb
238; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
239; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
240; GFX12-NEXT:    s_endpgm
241bb:
242  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
243  store <4 x float> %res, ptr addrspace(1) %out
244  ret void
245}
246
247define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
248; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
249; GFX12:       ; %bb.0: ; %bb
250; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
251; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
252; GFX12-NEXT:    v_mov_b32_e32 v5, v4
253; GFX12-NEXT:    v_mov_b32_e32 v6, v4
254; GFX12-NEXT:    v_mov_b32_e32 v7, v4
255; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
256; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
257; GFX12-NEXT:    s_endpgm
258bb:
259  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
260  store <4 x float> %res, ptr addrspace(1) %out
261  ret void
262}
263
264define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
265; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
266; GFX12:       ; %bb.0: ; %bb
267; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
268; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
269; GFX12-NEXT:    s_endpgm
270bb:
271  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
272  store <4 x float> %res, ptr addrspace(1) %out
273  ret void
274}
275
276define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
277; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
278; GFX12:       ; %bb.0: ; %bb
279; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
280; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
281; GFX12-NEXT:    v_mov_b32_e32 v5, v4
282; GFX12-NEXT:    v_mov_b32_e32 v6, v4
283; GFX12-NEXT:    v_mov_b32_e32 v7, v4
284; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
285; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
286; GFX12-NEXT:    s_endpgm
287bb:
288  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
289  store <4 x float> %res, ptr addrspace(1) %out
290  ret void
291}
292
293define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
294; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
295; GFX12:       ; %bb.0: ; %bb
296; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
297; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
298; GFX12-NEXT:    s_endpgm
299bb:
300  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
301  store <4 x i32> %res, ptr addrspace(1) %out
302  ret void
303}
304
305define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
306; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
307; GFX12:       ; %bb.0: ; %bb
308; GFX12-NEXT:    v_mov_b32_e32 v4, 0x80
309; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
310; GFX12-NEXT:    v_mov_b32_e32 v5, v4
311; GFX12-NEXT:    v_mov_b32_e32 v6, v4
312; GFX12-NEXT:    v_mov_b32_e32 v7, v4
313; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
314; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
315; GFX12-NEXT:    s_endpgm
316bb:
317  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
318  store <4 x i32> %res, ptr addrspace(1) %out
319  ret void
320}
321
322declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half>, <4 x half>, <4 x float>)
323declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16>, <4 x i16>, <4 x float>)
324declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
325declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
326declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
327declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
328declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32, i32, <4 x float>)
329declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32, i32, <4 x float>)
330declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32, i32, <4 x float>)
331declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32, i32, <4 x float>)
332declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
333