; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64

declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16>, <16 x i16>, <4 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
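
; Operand summary (as exercised by the tests below): for the f16/bf16 variants
; the trailing i1 immarg selects whether C/D occupy the high (1) or low (0)
; halves of the destination registers (lowered to op_sel:[0,0,1] when set);
; for the iu8/iu4 variants the first and third i1 immargs mark A and B,
; respectively, as signed (lowered to neg_lo modifiers), and the trailing i1
; requests clamping of the result (lowered to the clamp modifier).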

; These tests demonstrate that the following WMMA register constraints are satisfied.
;
; v_wmma D, A, B, C
; A and B must not overlap with D. C must not partially overlap with D, but it
; is OK for them to be identical (which is the typical accumulate-in-place case).
;
; In each test:
;   - first wmma instruction: the destination register D is different from all the sources
;   - second wmma instruction: the destination register D and src2 (C) are the same
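;
; For illustration (hypothetical register assignments, not among the checked
; patterns below):
;   v_wmma_f32_16x16x16_f16 v[0:3], v[0:7], v[8:15], v[0:3]     ; illegal: D overlaps A
;   v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; legal: D == C exactly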


; @llvm.amdgcn.wmma.f32.16x16x16.f16

define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_f32_16x16x16_f16:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %B, <16 x half> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out, align 16
  store <4 x float> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.f32.16x16x16.bf16

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_f32_16x16x16_bf16:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %B, <16 x i16> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out, align 16
  store <4 x float> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.f16.16x16x16.f16

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
  %res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out, align 16
  store <8 x half> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
  %res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 1)
  store <8 x half> %res, ptr addrspace(1) %out, align 16
  store <8 x half> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
  %res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 0)
  store <8 x i16> %res, ptr addrspace(1) %out, align 16
  store <8 x i16> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
  %res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 1)
  store <8 x i16> %res, ptr addrspace(1) %out, align 16
  store <8 x i16> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.i32.16x16x16.iu8
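;
; The eight tests below enumerate the four signed/unsigned combinations of A
; and B (signedness is lowered to the neg_lo modifier), each with and without
; result clamping.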

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11]
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11]
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0]
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0]
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.i32.16x16x16.iu4
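;
; Same signedness/clamp matrix as the iu8 tests above, with the 4-bit inputs
; packed into <2 x i32>.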

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7]
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7]
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0]
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0]
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0]
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, ptr addrspace(1) %out, align 16
  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
  ret void
}