; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64

declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16>, <16 x i16>, <4 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
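
; Note: with wavefrontsize64 the WMMA C/D operands are half the size of their
; wave32 counterparts (which use <8 x float>, <8 x i32>, <16 x half> and
; <16 x i16>), so each accumulator/result here fits in four VGPRs.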

; @llvm.amdgcn.wmma.f32.16x16x16.f16

define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_f32_16x16x16_f16:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out, align 16
  ret void
}

; @llvm.amdgcn.wmma.f32.16x16x16.bf16

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_f32_16x16x16_bf16:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out, align 16
  ret void
}

; @llvm.amdgcn.wmma.f16.16x16x16.f16
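;
; The trailing i1 immarg selects whether the 16-bit results are stored in the
; low (0) or high (1) half of each 32-bit destination VGPR; the high case is
; checked below as the op_sel:[0,0,1] modifier.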

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
  store <8 x half> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_untied:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[32:35]
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
; W64-NEXT:    s_endpgm
bb:
  %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
  %res.1 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
  ret void
}

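; The .tied variant ties the accumulator input to the destination registers,
; so when %C feeds both calls it must first be copied (the v_mov_b32 sequence
; checked below) rather than being overwritten by the first result.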
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_tied:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_mov_b32_e32 v43, v35
; W64-NEXT:    v_mov_b32_e32 v42, v34
; W64-NEXT:    v_mov_b32_e32 v41, v33
; W64-NEXT:    v_mov_b32_e32 v40, v32
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43]
; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
; W64-NEXT:    s_endpgm
bb:
  %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
  %res.1 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
  ret void
}

; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
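;
; Same low/high selection as the f16 variant above; bf16 operands are passed
; as <16 x i16> bit patterns.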

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
  store <8 x i16> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
  store <8 x i16> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[32:35]
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
; W64-NEXT:    s_endpgm
bb:
  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
  ret void
}

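; As with the f16 tied test, reusing %C through the tied intrinsic forces the
; accumulator to be copied before the first WMMA.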
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_mov_b32_e32 v43, v35
; W64-NEXT:    v_mov_b32_e32 v42, v34
; W64-NEXT:    v_mov_b32_e32 v41, v33
; W64-NEXT:    v_mov_b32_e32 v40, v32
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43]
; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
; W64-NEXT:    s_endpgm
bb:
  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
  ret void
}

; @llvm.amdgcn.wmma.i32.16x16x16.iu8
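;
; The i1 immargs preceding %A and %B select signed (1) or unsigned (0)
; operands and are encoded in the instruction's neg_lo bits; the final i1
; requests clamping (saturation) of the i32 results.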

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11]
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}


define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

; @llvm.amdgcn.wmma.i32.16x16x16.iu4
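;
; Same signedness and clamp flags as the iu8 variant; each <2 x i32> operand
; packs sixteen 4-bit values.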

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7]
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  ret void
}