xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll (revision 9e9907f1cfa424366fba58d9520f9305b537cec9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
3; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX908 %s
4; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
5; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6
; Baseline case: a, b and c are plain (non-inreg) arguments and the clamp
; immediate is false. The checks for every target expect one v_dot2_u32_u16
; reading v0, v1, v2 with no modifiers, followed by the return.
7define i32 @v_udot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
8; GFX906-LABEL: v_udot2:
9; GFX906:       ; %bb.0:
10; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
12; GFX906-NEXT:    s_setpc_b64 s[30:31]
13;
14; GFX908-LABEL: v_udot2:
15; GFX908:       ; %bb.0:
16; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
18; GFX908-NEXT:    s_setpc_b64 s[30:31]
19;
20; GFX10-LABEL: v_udot2:
21; GFX10:       ; %bb.0:
22; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
24; GFX10-NEXT:    s_setpc_b64 s[30:31]
25  %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
26  ret i32 %r
27}
28
; Same operands as v_udot2, but the clamp immediate is true. The only expected
; difference in the checked output is the trailing `clamp` modifier on
; v_dot2_u32_u16.
29define i32 @v_udot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) {
30; GFX906-LABEL: v_udot2_clamp:
31; GFX906:       ; %bb.0:
32; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 clamp
34; GFX906-NEXT:    s_setpc_b64 s[30:31]
35;
36; GFX908-LABEL: v_udot2_clamp:
37; GFX908:       ; %bb.0:
38; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 clamp
40; GFX908-NEXT:    s_setpc_b64 s[30:31]
41;
42; GFX10-LABEL: v_udot2_clamp:
43; GFX10:       ; %bb.0:
44; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
45; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 clamp
46; GFX10-NEXT:    s_setpc_b64 s[30:31]
47  %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 true)
48  ret i32 %r
49}
50
; amdgpu_ps entry point with all three operands marked inreg, so the checks see
; them in s0, s1 and s2. The i32 result is bitcast to float for the return.
; Expected output differs by target: the gfx906 and gfx908 checks copy s1 and
; s2 into VGPRs and read only s0 directly, while the gfx10 checks copy only s2
; and read both s0 and s1 directly.
; (review) Presumably this reflects a per-target limit on how many SGPR
; operands one VOP3P instruction may read - TODO confirm against the ISA docs.
51define amdgpu_ps float @v_udot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inreg %b, i32 inreg %c) {
52; GFX906-LABEL: v_udot2_sgpr_sgpr_sgpr:
53; GFX906:       ; %bb.0:
54; GFX906-NEXT:    v_mov_b32_e32 v0, s1
55; GFX906-NEXT:    v_mov_b32_e32 v1, s2
56; GFX906-NEXT:    v_dot2_u32_u16 v0, s0, v0, v1
57; GFX906-NEXT:    ; return to shader part epilog
58;
59; GFX908-LABEL: v_udot2_sgpr_sgpr_sgpr:
60; GFX908:       ; %bb.0:
61; GFX908-NEXT:    v_mov_b32_e32 v0, s1
62; GFX908-NEXT:    v_mov_b32_e32 v1, s2
63; GFX908-NEXT:    v_dot2_u32_u16 v0, s0, v0, v1
64; GFX908-NEXT:    ; return to shader part epilog
65;
66; GFX10-LABEL: v_udot2_sgpr_sgpr_sgpr:
67; GFX10:       ; %bb.0:
68; GFX10-NEXT:    v_mov_b32_e32 v0, s2
69; GFX10-NEXT:    v_dot2_u32_u16 v0, s0, s1, v0
70; GFX10-NEXT:    ; return to shader part epilog
71  %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
72  %cast = bitcast i32 %r to float
73  ret float %cast
74}
75
; Operand a is the constant splat <4, 4>; b and c are arguments. The checks
; expect the constant to appear as the immediate 4 in the first source slot,
; with op_sel_hi:[0,1,1] (the op_sel_hi entry for that slot is cleared).
76define i32 @v_udot2_inline_literal_a(<2 x i16> %b, i32 %c) {
77; GFX906-LABEL: v_udot2_inline_literal_a:
78; GFX906:       ; %bb.0:
79; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80; GFX906-NEXT:    v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
81; GFX906-NEXT:    s_setpc_b64 s[30:31]
82;
83; GFX908-LABEL: v_udot2_inline_literal_a:
84; GFX908:       ; %bb.0:
85; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
86; GFX908-NEXT:    v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
87; GFX908-NEXT:    s_setpc_b64 s[30:31]
88;
89; GFX10-LABEL: v_udot2_inline_literal_a:
90; GFX10:       ; %bb.0:
91; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92; GFX10-NEXT:    v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
93; GFX10-NEXT:    s_setpc_b64 s[30:31]
94  %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
95  ret i32 %r
96}
97
; Operand b is the constant splat <4, 4>; a and c are arguments. The checks
; expect the immediate 4 in the second source slot, with op_sel_hi:[1,0,1]
; (the op_sel_hi entry for that slot is cleared).
98define i32 @v_udot2_inline_literal_b(<2 x i16> %a, i32 %c) {
99; GFX906-LABEL: v_udot2_inline_literal_b:
100; GFX906:       ; %bb.0:
101; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
103; GFX906-NEXT:    s_setpc_b64 s[30:31]
104;
105; GFX908-LABEL: v_udot2_inline_literal_b:
106; GFX908:       ; %bb.0:
107; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
109; GFX908-NEXT:    s_setpc_b64 s[30:31]
110;
111; GFX10-LABEL: v_udot2_inline_literal_b:
112; GFX10:       ; %bb.0:
113; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
115; GFX10-NEXT:    s_setpc_b64 s[30:31]
116  %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
117  ret i32 %r
118}
119
; Both vector operands are constant splats (<8, 8> and <4, 4>); only c is read
; from an argument. The %a parameter is declared but not referenced by the
; body; it still occupies the first argument slot, which is why the checks
; read c from v1. Expected output uses the immediates 8 and 4 with
; op_sel_hi:[0,0,1].
120define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
121; GFX906-LABEL: v_udot2_inline_literal_a_b:
122; GFX906:       ; %bb.0:
123; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
124; GFX906-NEXT:    v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
125; GFX906-NEXT:    s_setpc_b64 s[30:31]
126;
127; GFX908-LABEL: v_udot2_inline_literal_a_b:
128; GFX908:       ; %bb.0:
129; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
130; GFX908-NEXT:    v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
131; GFX908-NEXT:    s_setpc_b64 s[30:31]
132;
133; GFX10-LABEL: v_udot2_inline_literal_a_b:
134; GFX10:       ; %bb.0:
135; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
136; GFX10-NEXT:    v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
137; GFX10-NEXT:    s_setpc_b64 s[30:31]
138  %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
139  ret i32 %r
140}
141
; All three operands are constants: a = <8, 8>, b = <4, 4>, c = 8. The checks
; expect a single v_dot2_u32_u16 whose three sources are the immediates
; 8, 4, 8 with op_sel_hi:[0,0,1]; no register operands are read.
142define i32 @v_udot2_inline_literal_a_b_c() {
143; GFX906-LABEL: v_udot2_inline_literal_a_b_c:
144; GFX906:       ; %bb.0:
145; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
146; GFX906-NEXT:    v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
147; GFX906-NEXT:    s_setpc_b64 s[30:31]
148;
149; GFX908-LABEL: v_udot2_inline_literal_a_b_c:
150; GFX908:       ; %bb.0:
151; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
152; GFX908-NEXT:    v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
153; GFX908-NEXT:    s_setpc_b64 s[30:31]
154;
155; GFX10-LABEL: v_udot2_inline_literal_a_b_c:
156; GFX10:       ; %bb.0:
157; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
158; GFX10-NEXT:    v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
159; GFX10-NEXT:    s_setpc_b64 s[30:31]
160  %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
161  ret i32 %r
162}
163
; Only the i32 accumulator operand c is a constant (7); a and b are arguments.
; The checks expect the immediate 7 in the third source slot and, unlike the
; vector-literal cases in this file, no op_sel_hi modifier is printed.
164define i32 @v_udot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
165; GFX906-LABEL: v_udot2_inline_literal_c:
166; GFX906:       ; %bb.0:
167; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, 7
169; GFX906-NEXT:    s_setpc_b64 s[30:31]
170;
171; GFX908-LABEL: v_udot2_inline_literal_c:
172; GFX908:       ; %bb.0:
173; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
174; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, 7
175; GFX908-NEXT:    s_setpc_b64 s[30:31]
176;
177; GFX10-LABEL: v_udot2_inline_literal_c:
178; GFX10:       ; %bb.0:
179; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
180; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, 7
181; GFX10-NEXT:    s_setpc_b64 s[30:31]
182  %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 7, i1 false)
183  ret i32 %r
184}
185
; Operand a arrives as <2 x half>, is negated with fneg, then bitcast to
; <2 x i16> before the intrinsic call. The checks expect no separate negate
; instruction: the fneg shows up as neg_lo:[1,0,0] neg_hi:[1,0,0] on the
; first source of v_dot2_u32_u16.
; (review) These checks record the compiler output as generated; whether the
; neg_lo/neg_hi modifiers are honoured by an unsigned integer dot is not
; established by this file - TODO confirm against the ISA documentation.
186define i32 @v_udot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
187; GFX906-LABEL: v_udot2_fneg_a:
188; GFX906:       ; %bb.0:
189; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
191; GFX906-NEXT:    s_setpc_b64 s[30:31]
192;
193; GFX908-LABEL: v_udot2_fneg_a:
194; GFX908:       ; %bb.0:
195; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
196; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
197; GFX908-NEXT:    s_setpc_b64 s[30:31]
198;
199; GFX10-LABEL: v_udot2_fneg_a:
200; GFX10:       ; %bb.0:
201; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
202; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
203; GFX10-NEXT:    s_setpc_b64 s[30:31]
204  %neg.a = fneg <2 x half> %a
205  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
206  %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %cast.neg.a, <2 x i16> %b, i32 %c, i1 false)
207  ret i32 %r
208}
209
; Mirror of v_udot2_fneg_a for the second operand: b arrives as <2 x half>, is
; negated with fneg, then bitcast to <2 x i16>. The checks expect the negation
; to appear as neg_lo:[0,1,0] neg_hi:[0,1,0] on v_dot2_u32_u16 rather than as
; a separate instruction.
; (review) As with the operand-a variant, whether these modifiers are
; honoured by an unsigned integer dot is not established by this file -
; TODO confirm against the ISA documentation.
210define i32 @v_udot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
211; GFX906-LABEL: v_udot2_fneg_b:
212; GFX906:       ; %bb.0:
213; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
214; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
215; GFX906-NEXT:    s_setpc_b64 s[30:31]
216;
217; GFX908-LABEL: v_udot2_fneg_b:
218; GFX908:       ; %bb.0:
219; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
221; GFX908-NEXT:    s_setpc_b64 s[30:31]
222;
223; GFX10-LABEL: v_udot2_fneg_b:
224; GFX10:       ; %bb.0:
225; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
226; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
227; GFX10-NEXT:    s_setpc_b64 s[30:31]
228  %neg.b = fneg <2 x half> %b
229  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
230  %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %cast.neg.b, i32 %c, i1 false)
231  ret i32 %r
232}
233
; The accumulator c arrives as float, is negated with fneg, then bitcast to
; i32. Unlike the fneg tests on a and b, the checks expect the negation to
; stay a separate instruction: v_xor_b32 with 0x80000000 on v2, followed by a
; v_dot2_u32_u16 with no modifiers.
234define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
235; GFX906-LABEL: v_udot2_fnegf32_c:
236; GFX906:       ; %bb.0:
237; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
238; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
239; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
240; GFX906-NEXT:    s_setpc_b64 s[30:31]
241;
242; GFX908-LABEL: v_udot2_fnegf32_c:
243; GFX908:       ; %bb.0:
244; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
245; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
246; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
247; GFX908-NEXT:    s_setpc_b64 s[30:31]
248;
249; GFX10-LABEL: v_udot2_fnegf32_c:
250; GFX10:       ; %bb.0:
251; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
252; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
253; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
254; GFX10-NEXT:    s_setpc_b64 s[30:31]
255  %neg.c = fneg float %c
256  %cast.neg.c = bitcast float %neg.c to i32
257  %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
258  ret i32 %r
259}
260
; The accumulator c arrives as <2 x half>, is negated with fneg, then bitcast
; to i32. The checks expect the negation to stay a separate v_xor_b32 with
; 0x80008000 on v2 (the top bit of each 16-bit half), followed by a
; v_dot2_u32_u16 with no modifiers.
261define i32 @v_udot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
262; GFX906-LABEL: v_udot2_fnegv2f16_c:
263; GFX906:       ; %bb.0:
264; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
266; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
267; GFX906-NEXT:    s_setpc_b64 s[30:31]
268;
269; GFX908-LABEL: v_udot2_fnegv2f16_c:
270; GFX908:       ; %bb.0:
271; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
272; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
273; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
274; GFX908-NEXT:    s_setpc_b64 s[30:31]
275;
276; GFX10-LABEL: v_udot2_fnegv2f16_c:
277; GFX10:       ; %bb.0:
278; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
279; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
280; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
281; GFX10-NEXT:    s_setpc_b64 s[30:31]
282  %neg.c = fneg <2 x half> %c
283  %cast.neg.c = bitcast <2 x half> %neg.c to i32
284  %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
285  ret i32 %r
286}
287
; Operand a has its two i16 elements swapped by a shufflevector with mask
; <1, 0> before the intrinsic call. The checks expect the swap to be a
; separate v_alignbit_b32 v0, v0, v0, 16 followed by an unmodified
; v_dot2_u32_u16; it is not expressed through op_sel on the dot instruction.
288define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
289; GFX906-LABEL: v_udot2_shuffle10_a:
290; GFX906:       ; %bb.0:
291; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292; GFX906-NEXT:    v_alignbit_b32 v0, v0, v0, 16
293; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
294; GFX906-NEXT:    s_setpc_b64 s[30:31]
295;
296; GFX908-LABEL: v_udot2_shuffle10_a:
297; GFX908:       ; %bb.0:
298; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
299; GFX908-NEXT:    v_alignbit_b32 v0, v0, v0, 16
300; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
301; GFX908-NEXT:    s_setpc_b64 s[30:31]
302;
303; GFX10-LABEL: v_udot2_shuffle10_a:
304; GFX10:       ; %bb.0:
305; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
306; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
307; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
308; GFX10-NEXT:    s_setpc_b64 s[30:31]
309  %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
310  %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
311  ret i32 %r
312}
313
; Mirror of v_udot2_shuffle10_a for the second operand: b has its two i16
; elements swapped by a shufflevector with mask <1, 0>. The checks expect a
; separate v_alignbit_b32 v1, v1, v1, 16 followed by an unmodified
; v_dot2_u32_u16.
314define i32 @v_udot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
315; GFX906-LABEL: v_udot2_shuffle10_b:
316; GFX906:       ; %bb.0:
317; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318; GFX906-NEXT:    v_alignbit_b32 v1, v1, v1, 16
319; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
320; GFX906-NEXT:    s_setpc_b64 s[30:31]
321;
322; GFX908-LABEL: v_udot2_shuffle10_b:
323; GFX908:       ; %bb.0:
324; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325; GFX908-NEXT:    v_alignbit_b32 v1, v1, v1, 16
326; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
327; GFX908-NEXT:    s_setpc_b64 s[30:31]
328;
329; GFX10-LABEL: v_udot2_shuffle10_b:
330; GFX10:       ; %bb.0:
331; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
332; GFX10-NEXT:    v_alignbit_b32 v1, v1, v1, 16
333; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
334; GFX10-NEXT:    s_setpc_b64 s[30:31]
335  %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
336  %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
337  ret i32 %r
338}
339
; Intrinsic under test. Operands are two <2 x i16> vectors, an i32, and an i1
; that is marked immarg and therefore must be a constant at every call site.
; In this file, passing true for that i1 corresponds to the `clamp` modifier
; in the checked output. Attribute group #0 is declared just below.
340declare i32 @llvm.amdgcn.udot2(<2 x i16>, <2 x i16>, i32, i1 immarg) #0
341
342attributes #0 = { nounwind readnone speculatable }
343