; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll (revision 3277c7cd28154e33637a168acb26cea7ac1f7fff)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12

; Splat accumulator 1.0 is an inline constant, so it folds directly into the
; WMMA instruction as an immediate operand (no register materialization).
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; 3.0 (0x40400000) is not an inline constant, so the splat accumulator is
; materialized in SGPRs and copied to VGPRs before the WMMA consumes it.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
; GFX12-NEXT:    v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; f32 accumulator 1.0 is an inline constant and folds into the bf16 WMMA.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; 3.0 is not inlineable, so the accumulator splat is built in SGPRs and moved
; into the VGPR tuple that the WMMA reads and overwrites.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
; GFX12-NEXT:    v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; Packed f16 splat 1.0 folds into the WMMA as an inline immediate.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; Packed f16 splat 3.0 (0x4200 per lane -> 0x42004200 per dword) is not an
; inline constant, so the 4-dword accumulator is materialized in registers.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x42004200
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; bf16 1.0 (0x3F80 per lane, packed 0x3f803f80) is not an inline constant for
; the packed operand, so even the "imm" case materializes the accumulator.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x3f803f80
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
  store <8 x i16> %res, ptr addrspace(1) %out
  ret void
}

; bf16 1.5 (0x3FC0 per lane, packed 0x3fc03fc0) is not inlineable; the
; accumulator splat is built in SGPRs then copied to the VGPR tuple.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x3fc03fc0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
  store <8 x i16> %res, ptr addrspace(1) %out
  ret void
}

; Integer splat accumulator 1 is an inline constant and folds into the WMMA.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 128 (0x80) is outside the inline-constant range, so the splat accumulator is
; materialized in registers before the WMMA.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_movk_i32 s0, 0x80
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

; Integer splat accumulator 1 folds into the iu4 WMMA as an inline immediate.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 128 is not an inline constant; the accumulator splat is materialized in
; registers before the iu4 WMMA.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_movk_i32 s0, 0x80
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
; GFX12-NEXT:    v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
; GFX12-NEXT:    v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
; GFX12-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

; f32 accumulator 1.0 folds into the fp8/fp8 WMMA as an inline immediate.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; 3.0 is not inlineable; the accumulator splat is materialized in registers.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; f32 accumulator 1.0 folds into the bf8/fp8 WMMA as an inline immediate.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; 3.0 is not inlineable; the accumulator splat is materialized in registers.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; f32 accumulator 1.0 folds into the fp8/bf8 WMMA as an inline immediate.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; 3.0 is not inlineable; the accumulator splat is materialized in registers.
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; f32 accumulator 1.0 folds into the bf8/bf8 WMMA as an inline immediate.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; 3.0 is not inlineable; the accumulator splat is materialized in registers.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; Integer splat accumulator 1 folds into the 16x16x32 iu4 WMMA as an inline
; immediate.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 128 is not an inline constant; the accumulator splat is materialized in
; registers before the 16x16x32 iu4 WMMA.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_movk_i32 s0, 0x80
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

; Declarations of the AMDGPU WMMA/SWMMAC intrinsics exercised above.
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
