xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll (revision 3277c7cd28154e33637a168acb26cea7ac1f7fff)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12

; 16x16x16 iu8 WMMA: the i1 immediate ahead of %A is 1, the one ahead of %B and
; the trailing i1 are 0. The checked instruction carries neg_lo:[1,0,0], and the
; <4 x i32> result is stored to %out.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 16x16x16 iu8 WMMA: the i1 immediate ahead of %B is 1, the one ahead of %A and
; the trailing i1 are 0. The checked instruction carries neg_lo:[0,1,0], and the
; <4 x i32> result is stored to %out.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 16x16x16 iu8 WMMA: both i1 immediates ahead of %A and %B are 0 and the
; trailing i1 is 1. The checked instruction carries the clamp modifier, and the
; <4 x i32> result is stored to %out.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}



; 16x16x16 iu4 WMMA: the i1 immediate ahead of %A is 1, the one ahead of %B and
; the trailing i1 are 0. The checked instruction carries neg_lo:[1,0,0], and the
; <4 x i32> result is stored to %out.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 16x16x16 iu4 WMMA: the i1 immediate ahead of %B is 1, the one ahead of %A and
; the trailing i1 are 0. The checked instruction carries neg_lo:[0,1,0], and the
; <4 x i32> result is stored to %out.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 16x16x16 iu4 WMMA: both i1 immediates ahead of %A and %B are 0 and the
; trailing i1 is 1. The checked instruction carries the clamp modifier, and the
; <4 x i32> result is stored to %out.
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}



; 16x16x32 iu4 WMMA: the i1 immediate ahead of %A is 1, the one ahead of %B and
; the trailing i1 are 0. The checked instruction carries neg_lo:[1,0,0], and the
; <4 x i32> result is stored to %out.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 16x16x32 iu4 WMMA: the i1 immediate ahead of %B is 1, the one ahead of %A and
; the trailing i1 are 0. The checked instruction carries neg_lo:[0,1,0], and the
; <4 x i32> result is stored to %out.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 16x16x32 iu4 WMMA: both i1 immediates ahead of %A and %B are 0 and the
; trailing i1 is 1. The checked instruction carries the clamp modifier, and the
; <4 x i32> result is stored to %out.
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}






; 16x16x32 iu8 SWMMAC with an i8 %Index operand: the i1 immediate ahead of %A
; is 1, the one ahead of %B and the trailing i1 are 0. The checked instruction
; carries neg_lo:[1,0,0], and the <4 x i32> result is stored to %out.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 16x16x32 iu8 SWMMAC with an i8 %Index operand: the i1 immediate ahead of %B
; is 1, the one ahead of %A and the trailing i1 are 0. The checked instruction
; carries neg_lo:[0,1,0], and the <4 x i32> result is stored to %out.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 16x16x32 iu8 SWMMAC with an i8 %Index operand: both i1 immediates ahead of
; %A and %B are 0 and the trailing i1 is 1. The checked instruction carries the
; clamp modifier, and the <4 x i32> result is stored to %out.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}



; 16x16x32 iu4 SWMMAC with an i16 %Index operand: the i1 immediate ahead of %A
; is 1, the one ahead of %B and the trailing i1 are 0. The checked instruction
; carries neg_lo:[1,0,0], and the <4 x i32> result is stored to %out.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 16x16x32 iu4 SWMMAC with an i16 %Index operand: the i1 immediate ahead of %B
; is 1, the one ahead of %A and the trailing i1 are 0. The checked instruction
; carries neg_lo:[0,1,0], and the <4 x i32> result is stored to %out.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 16x16x32 iu4 SWMMAC with an i16 %Index operand: both i1 immediates ahead of
; %A and %B are 0 and the trailing i1 is 1. The checked instruction carries the
; clamp modifier, and the <4 x i32> result is stored to %out.
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}



; 16x16x64 iu4 SWMMAC with an i16 %Index operand: the i1 immediate ahead of %A
; is 1, the one ahead of %B and the trailing i1 are 0. The checked instruction
; carries neg_lo:[1,0,0], and the <4 x i32> result is stored to %out.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 16x16x64 iu4 SWMMAC with an i16 %Index operand: the i1 immediate ahead of %B
; is 1, the one ahead of %A and the trailing i1 are 0. The checked instruction
; carries neg_lo:[0,1,0], and the <4 x i32> result is stored to %out.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

; 16x16x64 iu4 SWMMAC with an i16 %Index operand: both i1 immediates ahead of
; %A and %B are 0 and the trailing i1 is 1. The checked instruction carries the
; clamp modifier, and the <4 x i32> result is stored to %out.
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out
  ret void
}

; Declarations of the WMMA and SWMMAC intrinsics called by the tests in this
; file. Every i1 operand is marked immarg; the %Index operand of the SWMMAC
; variants is an ordinary (non-immarg) i8 or i16 value.
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
