; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll (revision 8f6a1a07cb85980013c70d5af6d28f5fcf75e732)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s

; Scalar src0 & ~src1 selects s_andn2_b32 (s_and_not1_b32 on GFX11).
define amdgpu_ps i32 @s_andn2_i32(i32 inreg %src0, i32 inreg %src1) {
; GCN-LABEL: s_andn2_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b32 s0, s2, s3
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b32 s0, s2, s3
; GFX11-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %and = and i32 %src0, %not.src1
  ret i32 %and
}

; Commuted operands (~src1 & src0) still select the andn2 pattern.
define amdgpu_ps i32 @s_andn2_i32_commute(i32 inreg %src0, i32 inreg %src1) {
; GCN-LABEL: s_andn2_i32_commute:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i32_commute:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b32 s0, s2, s3
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i32_commute:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b32 s0, s2, s3
; GFX11-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %and = and i32 %not.src1, %src0
  ret i32 %and
}

; ~src1 has a second (non-foldable) use: andn2 is still formed and the
; inverted value is materialized separately with s_not_b32.
define amdgpu_ps { i32, i32 } @s_andn2_i32_multi_use(i32 inreg %src0, i32 inreg %src1) {
; GCN-LABEL: s_andn2_i32_multi_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_not_b32 s1, s3
; GCN-NEXT:    s_andn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i32_multi_use:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b32 s0, s2, s3
; GFX10-NEXT:    s_not_b32 s1, s3
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i32_multi_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b32 s0, s2, s3
; GFX11-NEXT:    s_not_b32 s1, s3
; GFX11-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %and = and i32 %src0, %not.src1
  %insert.0 = insertvalue { i32, i32 } undef, i32 %and, 0
  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %not.src1, 1
  ret { i32, i32 } %insert.1
}

; Both uses of ~src2 fold: two andn2 instructions, no standalone s_not.
define amdgpu_ps { i32, i32 } @s_andn2_i32_multi_foldable_use(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) {
; GCN-LABEL: s_andn2_i32_multi_foldable_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b32 s0, s2, s4
; GCN-NEXT:    s_andn2_b32 s1, s3, s4
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i32_multi_foldable_use:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b32 s0, s2, s4
; GFX10-NEXT:    s_andn2_b32 s1, s3, s4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i32_multi_foldable_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b32 s0, s2, s4
; GFX11-NEXT:    s_and_not1_b32 s1, s3, s4
; GFX11-NEXT:    ; return to shader part epilog
  %not.src2 = xor i32 %src2, -1
  %and0 = and i32 %src0, %not.src2
  %and1 = and i32 %src1, %not.src2
  %insert.0 = insertvalue { i32, i32 } undef, i32 %and0, 0
  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %and1, 1
  ret { i32, i32 } %insert.1
}

; VALU case: no v_andn2, so the selection is v_not + v_and.
define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
; GCN-LABEL: v_andn2_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_not_b32_e32 v1, v1
; GCN-NEXT:    v_and_b32_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_andn2_i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_not_b32_e32 v1, v1
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %not.src1 = xor i32 %src1, -1
  %and = and i32 %src0, %not.src1
  ret i32 %and
}

; Mixed SGPR/VGPR: the VGPR src1 is inverted, then anded with the scalar.
define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) {
; GCN-LABEL: v_andn2_i32_sv:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    v_and_b32_e32 v0, s2, v0
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i32_sv:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    v_not_b32_e32 v0, v0
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, s2, v0
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %and = and i32 %src0, %not.src1
  %cast = bitcast i32 %and to float
  ret float %cast
}

; Mixed VGPR/SGPR: the scalar src1 is inverted on the SALU, then anded.
define amdgpu_ps float @v_andn2_i32_vs(i32 %src0, i32 inreg %src1) {
; GCN-LABEL: v_andn2_i32_vs:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_not_b32 s0, s2
; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i32_vs:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_not_b32 s0, s2
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, s0, v0
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %and = and i32 %src0, %not.src1
  %cast = bitcast i32 %and to float
  ret float %cast
}

; 64-bit scalar src0 & ~src1 selects s_andn2_b64 (s_and_not1_b64 on GFX11).
define amdgpu_ps i64 @s_andn2_i64(i64 inreg %src0, i64 inreg %src1) {
; GCN-LABEL: s_andn2_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b64 s[0:1], s[2:3], s[4:5]
; GFX11-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %and = and i64 %src0, %not.src1
  ret i64 %and
}

; Commuted 64-bit operands still select the andn2 pattern.
define amdgpu_ps i64 @s_andn2_i64_commute(i64 inreg %src0, i64 inreg %src1) {
; GCN-LABEL: s_andn2_i64_commute:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i64_commute:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i64_commute:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b64 s[0:1], s[2:3], s[4:5]
; GFX11-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %and = and i64 %not.src1, %src0
  ret i64 %and
}

; Both 64-bit uses of ~src2 fold into two andn2 instructions.
define amdgpu_ps { i64, i64 } @s_andn2_i64_multi_foldable_use(i64 inreg %src0, i64 inreg %src1, i64 inreg %src2) {
; GCN-LABEL: s_andn2_i64_multi_foldable_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[6:7]
; GCN-NEXT:    s_andn2_b64 s[2:3], s[4:5], s[6:7]
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i64_multi_foldable_use:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[6:7]
; GFX10-NEXT:    s_andn2_b64 s[2:3], s[4:5], s[6:7]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i64_multi_foldable_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b64 s[0:1], s[2:3], s[6:7]
; GFX11-NEXT:    s_and_not1_b64 s[2:3], s[4:5], s[6:7]
; GFX11-NEXT:    ; return to shader part epilog
  %not.src2 = xor i64 %src2, -1
  %and0 = and i64 %src0, %not.src2
  %and1 = and i64 %src1, %not.src2
  %insert.0 = insertvalue { i64, i64 } undef, i64 %and0, 0
  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %and1, 1
  ret { i64, i64 } %insert.1
}

; 64-bit ~src1 has a standalone use: andn2 plus a separate s_not_b64.
define amdgpu_ps { i64, i64 } @s_andn2_i64_multi_use(i64 inreg %src0, i64 inreg %src1) {
; GCN-LABEL: s_andn2_i64_multi_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_not_b64 s[6:7], s[4:5]
; GCN-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    s_mov_b32 s2, s6
; GCN-NEXT:    s_mov_b32 s3, s7
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i64_multi_use:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GFX10-NEXT:    s_not_b64 s[2:3], s[4:5]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i64_multi_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b64 s[0:1], s[2:3], s[4:5]
; GFX11-NEXT:    s_not_b64 s[2:3], s[4:5]
; GFX11-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %and = and i64 %src0, %not.src1
  %insert.0 = insertvalue { i64, i64 } undef, i64 %and, 0
  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %not.src1, 1
  ret { i64, i64 } %insert.1
}

; VALU 64-bit case: split into per-32-bit v_not + v_and pairs.
define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
; GCN-LABEL: v_andn2_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_not_b32_e32 v2, v2
; GCN-NEXT:    v_not_b32_e32 v3, v3
; GCN-NEXT:    v_and_b32_e32 v0, v0, v2
; GCN-NEXT:    v_and_b32_e32 v1, v1, v3
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_andn2_i64:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_not_b32_e32 v2, v2
; GFX10PLUS-NEXT:    v_not_b32_e32 v3, v3
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v2
; GFX10PLUS-NEXT:    v_and_b32_e32 v1, v1, v3
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %not.src1 = xor i64 %src1, -1
  %and = and i64 %src0, %not.src1
  ret i64 %and
}

; Mixed scalar/vector 64-bit: invert both VGPR halves, then and with SGPRs.
define amdgpu_ps <2 x float> @v_andn2_i64_sv(i64 inreg %src0, i64 %src1) {
; GCN-LABEL: v_andn2_i64_sv:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    v_not_b32_e32 v1, v1
; GCN-NEXT:    v_and_b32_e32 v0, s2, v0
; GCN-NEXT:    v_and_b32_e32 v1, s3, v1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i64_sv:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    v_not_b32_e32 v0, v0
; GFX10PLUS-NEXT:    v_not_b32_e32 v1, v1
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, s2, v0
; GFX10PLUS-NEXT:    v_and_b32_e32 v1, s3, v1
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %and = and i64 %src0, %not.src1
  %cast = bitcast i64 %and to <2 x float>
  ret <2 x float> %cast
}

; Mixed vector/scalar 64-bit: scalar s_not_b64 feeds two v_and halves.
define amdgpu_ps <2 x float> @v_andn2_i64_vs(i64 %src0, i64 inreg %src1) {
; GCN-LABEL: v_andn2_i64_vs:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_not_b64 s[0:1], s[2:3]
; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
; GCN-NEXT:    v_and_b32_e32 v1, s1, v1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i64_vs:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_not_b64 s[0:1], s[2:3]
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, s0, v0
; GFX10PLUS-NEXT:    v_and_b32_e32 v1, s1, v1
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %and = and i64 %src0, %not.src1
  %cast = bitcast i64 %and to <2 x float>
  ret <2 x float> %cast
}

; <2 x i32> in SGPRs is treated as a 64-bit value: one s_andn2_b64.
define amdgpu_ps <2 x i32> @s_andn2_v2i32(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GCN-LABEL: s_andn2_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_v2i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b64 s[0:1], s[2:3], s[4:5]
; GFX11-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i32> %src1, <i32 -1, i32 -1>
  %and = and <2 x i32> %src0, %not.src1
  ret <2 x i32> %and
}

; Commuted <2 x i32> operands still produce a single s_andn2_b64.
define amdgpu_ps <2 x i32> @s_andn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GCN-LABEL: s_andn2_v2i32_commute:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_v2i32_commute:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[4:5]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_v2i32_commute:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b64 s[0:1], s[2:3], s[4:5]
; GFX11-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i32> %src1, <i32 -1, i32 -1>
  %and = and <2 x i32> %not.src1, %src0
  ret <2 x i32> %and
}

; i16 scalar andn2 is done with the 32-bit s_andn2_b32.
define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
; GCN-LABEL: s_andn2_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b32 s0, s2, s3
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b32 s0, s2, s3
; GFX11-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %src0, %not.src1
  ret i16 %and
}

; Commuted i16 operands still select s_andn2_b32.
define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
; GCN-LABEL: s_andn2_i16_commute:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i16_commute:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b32 s0, s2, s3
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i16_commute:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b32 s0, s2, s3
; GFX11-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %not.src1, %src0
  ret i16 %and
}

; i16 ~src1 with a second use: andn2 plus an s_xor with -1 for the not.
define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
; GCN-LABEL: s_andn2_i16_multi_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_xor_b32 s1, s3, -1
; GCN-NEXT:    s_andn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i16_multi_use:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b32 s0, s2, s3
; GFX10-NEXT:    s_xor_b32 s1, s3, -1
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i16_multi_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b32 s0, s2, s3
; GFX11-NEXT:    s_xor_b32 s1, s3, -1
; GFX11-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %src0, %not.src1
  %insert.0 = insertvalue { i16, i16 } undef, i16 %and, 0
  %insert.1 = insertvalue { i16, i16 } %insert.0, i16 %not.src1, 1
  ret { i16, i16 } %insert.1
}

; Both i16 uses of ~src2 fold into two andn2 instructions.
define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
; GCN-LABEL: s_andn2_i16_multi_foldable_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b32 s0, s2, s4
; GCN-NEXT:    s_andn2_b32 s1, s3, s4
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i16_multi_foldable_use:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b32 s0, s2, s4
; GFX10-NEXT:    s_andn2_b32 s1, s3, s4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i16_multi_foldable_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b32 s0, s2, s4
; GFX11-NEXT:    s_and_not1_b32 s1, s3, s4
; GFX11-NEXT:    ; return to shader part epilog
  %not.src2 = xor i16 %src2, -1
  %and0 = and i16 %src0, %not.src2
  %and1 = and i16 %src1, %not.src2
  %insert.0 = insertvalue { i16, i16 } undef, i16 %and0, 0
  %insert.1 = insertvalue { i16, i16 } %insert.0, i16 %and1, 1
  ret { i16, i16 } %insert.1
}

; VALU i16 case: v_xor with -1 then v_and.
define i16 @v_andn2_i16(i16 %src0, i16 %src1) {
; GCN-LABEL: v_andn2_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v1, -1, v1
; GCN-NEXT:    v_and_b32_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_andn2_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_xor_b32_e32 v1, -1, v1
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %src0, %not.src1
  ret i16 %and
}

; Mixed scalar/vector i16; the zext return adds a final 0xffff mask.
define amdgpu_ps float @v_andn2_i16_sv(i16 inreg %src0, i16 %src1) {
; GCN-LABEL: v_andn2_i16_sv:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_xor_b32_e32 v0, -1, v0
; GCN-NEXT:    v_and_b32_e32 v0, s2, v0
; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i16_sv:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    v_xor_b32_e32 v0, -1, v0
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, s2, v0
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %src0, %not.src1
  %zext = zext i16 %and to i32
  %cast.zext = bitcast i32 %zext to float
  ret float %cast.zext
}

; Mixed vector/scalar i16; scalar xor -1 then v_and, masked for the zext.
define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
; GCN-LABEL: v_andn2_i16_vs:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_xor_b32 s0, s2, -1
; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i16_vs:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_xor_b32 s0, s2, -1
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, s0, v0
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %src0, %not.src1
  %zext = zext i16 %and to i32
  %cast.zext = bitcast i32 %zext to float
  ret float %cast.zext
}

; <2 x i16> scalar andn2: GFX9+ use one s_andn2_b32; GFX6 must repack the
; halves manually before the xor/and.
define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    s_xor_b32 s1, s1, -1
; GFX6-NEXT:    s_and_b32 s0, s0, s1
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_andn2_b32 s0, s2, s3
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b32 s0, s2, s3
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b32 s0, s2, s3
; GFX11-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
  %and = and <2 x i16> %src0, %not.src1
  %cast = bitcast <2 x i16> %and to i32
  ret i32 %cast
}

; Commuted <2 x i16> operands: same selection as the non-commuted case.
define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16_commute:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    s_xor_b32 s1, s1, -1
; GFX6-NEXT:    s_and_b32 s0, s1, s0
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_commute:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_andn2_b32 s0, s2, s3
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_v2i16_commute:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b32 s0, s2, s3
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_v2i16_commute:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b32 s0, s2, s3
; GFX11-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
  %and = and <2 x i16> %not.src1, %src0
  %cast = bitcast <2 x i16> %and to i32
  ret i32 %cast
}

; <2 x i16> ~src1 with a second use: GFX9+ keep andn2 plus an s_xor -1.
define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16_multi_use:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    s_xor_b32 s1, s1, -1
; GFX6-NEXT:    s_and_b32 s0, s0, s1
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_multi_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_xor_b32 s1, s3, -1
; GFX9-NEXT:    s_andn2_b32 s0, s2, s3
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_v2i16_multi_use:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b32 s0, s2, s3
; GFX10-NEXT:    s_xor_b32 s1, s3, -1
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_v2i16_multi_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b32 s0, s2, s3
; GFX11-NEXT:    s_xor_b32 s1, s3, -1
; GFX11-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
  %and = and <2 x i16> %src0, %not.src1

  %cast.0 = bitcast <2 x i16> %and to i32
  %cast.1 = bitcast <2 x i16> %not.src1 to i32
  %insert.0 = insertvalue { i32, i32 } undef, i32 %cast.0, 0
  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %cast.1, 1
  ret { i32, i32 } %insert.1
}

; Both <2 x i16> uses of ~src2 fold: two andn2 on GFX9+, shared xor on GFX6.
define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
; GFX6-NEXT:    s_or_b32 s2, s2, s3
; GFX6-NEXT:    s_xor_b32 s2, s2, -1
; GFX6-NEXT:    s_and_b32 s0, s0, s2
; GFX6-NEXT:    s_and_b32 s1, s1, s2
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_andn2_b32 s0, s2, s4
; GFX9-NEXT:    s_andn2_b32 s1, s3, s4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_v2i16_multi_foldable_use:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_andn2_b32 s0, s2, s4
; GFX10-NEXT:    s_andn2_b32 s1, s3, s4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_v2i16_multi_foldable_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_and_not1_b32 s0, s2, s4
; GFX11-NEXT:    s_and_not1_b32 s1, s3, s4
; GFX11-NEXT:    ; return to shader part epilog
  %not.src2 = xor <2 x i16> %src2, <i16 -1, i16 -1>
  %and0 = and <2 x i16> %src0, %not.src2
  %and1 = and <2 x i16> %src1, %not.src2

  %cast.0 = bitcast <2 x i16> %and0 to i32
  %cast.1 = bitcast <2 x i16> %and1 to i32
  %insert.0 = insertvalue { i32, i32 } undef, i32 %cast.0, 0
  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %cast.1, 1
  ret { i32, i32 } %insert.1
}

; VALU <2 x i16>: GFX9+ do a single 32-bit xor/and; GFX6 repacks the halves.
define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
; GFX6-LABEL: v_andn2_v2i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
; GFX6-NEXT:    v_and_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_andn2_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
; GFX9-NEXT:    v_and_b32_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_andn2_v2i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_xor_b32_e32 v1, -1, v1
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
  %and = and <2 x i16> %src0, %not.src1
  ret <2 x i16> %and
}


; <3 x i16> is legalized to a 64-bit pair: xor/and via s_xor_b64/s_and_b64,
; then repacked for the i48 return.
define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v3i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
; GFX6-NEXT:    s_mov_b32 s0, -1
; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX6-NEXT:    s_mov_b32 s1, 0xffff
; GFX6-NEXT:    s_or_b32 s6, s5, s6
; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
; GFX6-NEXT:    s_xor_b64 s[0:1], s[6:7], s[0:1]
; GFX6-NEXT:    s_or_b32 s2, s2, s3
; GFX6-NEXT:    s_and_b32 s3, s4, 0xffff
; GFX6-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
; GFX6-NEXT:    s_or_b32 s0, s0, s2
; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v3i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b64 s[0:1], -1
; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
; GFX9-NEXT:    s_or_b32 s0, s0, s2
; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_andn2_v3i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mov_b64 s[0:1], -1
; GFX10PLUS-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; GFX10PLUS-NEXT:    s_lshr_b32 s2, s0, 16
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT:    s_lshl_b32 s2, s2, 16
; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX10PLUS-NEXT:    s_or_b32 s0, s0, s2
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
  %and = and <3 x i16> %src0, %not.src1
  %cast = bitcast <3 x i16> %and to i48
  ret i48 %cast
}

; Commuted <3 x i16> operands: same legalization, and operands swapped.
define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v3i16_commute:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
; GFX6-NEXT:    s_mov_b32 s0, -1
; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX6-NEXT:    s_mov_b32 s1, 0xffff
; GFX6-NEXT:    s_or_b32 s6, s5, s6
; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
; GFX6-NEXT:    s_xor_b64 s[0:1], s[6:7], s[0:1]
; GFX6-NEXT:    s_or_b32 s2, s2, s3
; GFX6-NEXT:    s_and_b32 s3, s4, 0xffff
; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
; GFX6-NEXT:    s_or_b32 s0, s0, s2
; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v3i16_commute:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b64 s[0:1], -1
; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
; GFX9-NEXT:    s_or_b32 s0, s0, s2
; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_andn2_v3i16_commute:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mov_b64 s[0:1], -1
; GFX10PLUS-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX10PLUS-NEXT:    s_lshr_b32 s2, s0, 16
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT:    s_lshl_b32 s2, s2, 16
; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX10PLUS-NEXT:    s_or_b32 s0, s0, s2
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
  %and = and <3 x i16> %not.src1, %src0
  %cast = bitcast <3 x i16> %and to i48
  ret i48 %cast
}

; <3 x i16> ~src1 has a standalone use; both results are repacked to i48.
define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v3i16_multi_use:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
; GFX6-NEXT:    s_mov_b32 s0, -1
; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
; GFX6-NEXT:    s_mov_b32 s1, 0xffff
; GFX6-NEXT:    s_or_b32 s6, s5, s6
; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
; GFX6-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
; GFX6-NEXT:    s_and_b32 s1, s3, 0xffff
; GFX6-NEXT:    s_and_b32 s0, s2, 0xffff
; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_and_b32 s1, s4, 0xffff
; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
; GFX6-NEXT:    s_lshr_b32 s5, s6, 16
; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
; GFX6-NEXT:    s_or_b32 s0, s0, s2
; GFX6-NEXT:    s_and_b32 s2, s6, 0xffff
; GFX6-NEXT:    s_lshl_b32 s3, s5, 16
; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX6-NEXT:    s_or_b32 s2, s2, s3
; GFX6-NEXT:    s_and_b32 s3, s7, 0xffff
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v3i16_multi_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b64 s[0:1], -1
; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], s[4:5]
; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
; GFX9-NEXT:    s_or_b32 s0, s0, s2
; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
; GFX9-NEXT:    s_lshl_b32 s3, s6, 16
; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX9-NEXT:    s_or_b32 s2, s2, s3
; GFX9-NEXT:    s_and_b32 s3, s5, 0xffff
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_andn2_v3i16_multi_use:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mov_b64 s[0:1], -1
; GFX10PLUS-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[2:3], s[4:5]
; GFX10PLUS-NEXT:    s_lshr_b32 s3, s4, 16
; GFX10PLUS-NEXT:    s_lshr_b32 s2, s0, 16
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT:    s_lshl_b32 s2, s2, 16
; GFX10PLUS-NEXT:    s_lshl_b32 s3, s3, 16
; GFX10PLUS-NEXT:    s_or_b32 s0, s0, s2
; GFX10PLUS-NEXT:    s_and_b32 s2, s4, 0xffff
; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX10PLUS-NEXT:    s_or_b32 s2, s2, s3
; GFX10PLUS-NEXT:    s_and_b32 s3, s5, 0xffff
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
  %and = and <3 x i16> %src0, %not.src1
  %cast.0 = bitcast <3 x i16> %and to i48
  %cast.1 = bitcast <3 x i16> %not.src1 to i48
  %insert.0 = insertvalue { i48, i48 } undef, i48 %cast.0, 0
  %insert.1 = insertvalue { i48, i48 } %insert.0, i48 %cast.1, 1
  ret { i48, i48 } %insert.1
}

; VALU and-not of <3 x i16>: src0 & (src1 ^ <-1, -1, -11>). The NOT constant is
; deliberately non-splat; the third element's mask shows up in the checks as the
; 0xfff5 xor operand on GFX6 (unpacked 16-bit lanes) and as -11 on GFX9+
; (packed, element 2 in the low half of v3).
865define <3 x i16> @v_andn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
866; GFX6-LABEL: v_andn2_v3i16:
867; GFX6:       ; %bb.0:
868; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
869; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
870; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
871; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
872; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
873; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
874; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
875; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
876; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v5
877; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v3
878; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
879; GFX6-NEXT:    v_xor_b32_e32 v4, 0xfff5, v4
880; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
881; GFX6-NEXT:    v_and_b32_e32 v0, v0, v3
882; GFX6-NEXT:    v_and_b32_e32 v2, v1, v4
883; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
884; GFX6-NEXT:    s_setpc_b64 s[30:31]
885;
886; GFX9-LABEL: v_andn2_v3i16:
887; GFX9:       ; %bb.0:
888; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
889; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
890; GFX9-NEXT:    v_xor_b32_e32 v3, -11, v3
891; GFX9-NEXT:    v_and_b32_e32 v0, v0, v2
892; GFX9-NEXT:    v_and_b32_e32 v1, v1, v3
893; GFX9-NEXT:    s_setpc_b64 s[30:31]
894;
895; GFX10PLUS-LABEL: v_andn2_v3i16:
896; GFX10PLUS:       ; %bb.0:
897; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
898; GFX10PLUS-NEXT:    v_xor_b32_e32 v2, -1, v2
899; GFX10PLUS-NEXT:    v_xor_b32_e32 v3, -11, v3
900; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v2
901; GFX10PLUS-NEXT:    v_and_b32_e32 v1, v1, v3
902; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
903  %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -11>
904  %and = and <3 x i16> %src0, %not.src1
905  ret <3 x i16> %and
906}
907
; Scalar and-not of <4 x i16> returned as i64: src0 & ~src1. The packed vector
; is handled as a 64-bit SGPR pair: s_xor_b64 against all-ones, then s_and_b64
; (no s_andn2 is formed in the checks below); GFX6 first repacks the four
; unpacked 16-bit inputs into two 32-bit registers.
908define amdgpu_ps i64 @s_andn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
909; GFX6-LABEL: s_andn2_v4i16:
910; GFX6:       ; %bb.0:
911; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
912; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
913; GFX6-NEXT:    s_or_b32 s0, s0, s1
914; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
915; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
916; GFX6-NEXT:    s_or_b32 s1, s1, s2
917; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
918; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
919; GFX6-NEXT:    s_or_b32 s2, s2, s3
920; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
921; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
922; GFX6-NEXT:    s_or_b32 s3, s3, s4
923; GFX6-NEXT:    s_mov_b32 s4, -1
924; GFX6-NEXT:    s_mov_b32 s5, s4
925; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
926; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
927; GFX6-NEXT:    ; return to shader part epilog
928;
929; GFX9-LABEL: s_andn2_v4i16:
930; GFX9:       ; %bb.0:
931; GFX9-NEXT:    s_mov_b32 s0, -1
932; GFX9-NEXT:    s_mov_b32 s1, s0
933; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
934; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
935; GFX9-NEXT:    ; return to shader part epilog
936;
937; GFX10PLUS-LABEL: s_andn2_v4i16:
938; GFX10PLUS:       ; %bb.0:
939; GFX10PLUS-NEXT:    s_mov_b32 s0, -1
940; GFX10PLUS-NEXT:    s_mov_b32 s1, s0
941; GFX10PLUS-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
942; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
943; GFX10PLUS-NEXT:    ; return to shader part epilog
944  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
945  %and = and <4 x i16> %src0, %not.src1
946  %cast = bitcast <4 x i16> %and to i64
947  ret i64 %cast
948}
949
; Commuted variant of s_andn2_v4i16: the IR places %not.src1 as the FIRST
; operand of the and. The only codegen difference in the checks below is the
; operand order of the final s_and_b64 (inverted pair first).
950define amdgpu_ps i64 @s_andn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
951; GFX6-LABEL: s_andn2_v4i16_commute:
952; GFX6:       ; %bb.0:
953; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
954; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
955; GFX6-NEXT:    s_or_b32 s0, s0, s1
956; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
957; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
958; GFX6-NEXT:    s_or_b32 s1, s1, s2
959; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
960; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
961; GFX6-NEXT:    s_or_b32 s2, s2, s3
962; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
963; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
964; GFX6-NEXT:    s_or_b32 s3, s3, s4
965; GFX6-NEXT:    s_mov_b32 s4, -1
966; GFX6-NEXT:    s_mov_b32 s5, s4
967; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
968; GFX6-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
969; GFX6-NEXT:    ; return to shader part epilog
970;
971; GFX9-LABEL: s_andn2_v4i16_commute:
972; GFX9:       ; %bb.0:
973; GFX9-NEXT:    s_mov_b32 s0, -1
974; GFX9-NEXT:    s_mov_b32 s1, s0
975; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
976; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
977; GFX9-NEXT:    ; return to shader part epilog
978;
979; GFX10PLUS-LABEL: s_andn2_v4i16_commute:
980; GFX10PLUS:       ; %bb.0:
981; GFX10PLUS-NEXT:    s_mov_b32 s0, -1
982; GFX10PLUS-NEXT:    s_mov_b32 s1, s0
983; GFX10PLUS-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
984; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
985; GFX10PLUS-NEXT:    ; return to shader part epilog
986  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
987  %and = and <4 x i16> %not.src1, %src0
988  %cast = bitcast <4 x i16> %and to i64
989  %cast = bitcast <4 x i16> %and to i64
990}
991
; Multi-use case for <4 x i16>: both (src0 & ~src1) and ~src1 are returned, so
; the inversion must be materialized explicitly (s_xor_b64 then s_and_b64 in
; the checks below); GFX9+/GFX10+ copy the surviving inverted pair into the
; second result registers s[2:3].
992define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
993; GFX6-LABEL: s_andn2_v4i16_multi_use:
994; GFX6:       ; %bb.0:
995; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
996; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
997; GFX6-NEXT:    s_or_b32 s0, s0, s1
998; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
999; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
1000; GFX6-NEXT:    s_or_b32 s1, s1, s2
1001; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
1002; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
1003; GFX6-NEXT:    s_or_b32 s2, s2, s3
1004; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
1005; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
1006; GFX6-NEXT:    s_or_b32 s3, s3, s4
1007; GFX6-NEXT:    s_mov_b32 s4, -1
1008; GFX6-NEXT:    s_mov_b32 s5, s4
1009; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
1010; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
1011; GFX6-NEXT:    ; return to shader part epilog
1012;
1013; GFX9-LABEL: s_andn2_v4i16_multi_use:
1014; GFX9:       ; %bb.0:
1015; GFX9-NEXT:    s_mov_b32 s0, -1
1016; GFX9-NEXT:    s_mov_b32 s1, s0
1017; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
1018; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], s[4:5]
1019; GFX9-NEXT:    s_mov_b32 s2, s4
1020; GFX9-NEXT:    s_mov_b32 s3, s5
1021; GFX9-NEXT:    ; return to shader part epilog
1022;
1023; GFX10PLUS-LABEL: s_andn2_v4i16_multi_use:
1024; GFX10PLUS:       ; %bb.0:
1025; GFX10PLUS-NEXT:    s_mov_b32 s0, -1
1026; GFX10PLUS-NEXT:    s_mov_b32 s1, s0
1027; GFX10PLUS-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
1028; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[2:3], s[4:5]
1029; GFX10PLUS-NEXT:    s_mov_b32 s2, s4
1030; GFX10PLUS-NEXT:    s_mov_b32 s3, s5
1031; GFX10PLUS-NEXT:    ; return to shader part epilog
1032  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
1033  %and = and <4 x i16> %src0, %not.src1
1034
1035  %cast.0 = bitcast <4 x i16> %and to i64
1036  %cast.1 = bitcast <4 x i16> %not.src1 to i64
1037  %insert.0 = insertvalue { i64, i64 } undef, i64 %cast.0, 0
1038  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %cast.1, 1
1039  ret { i64, i64 } %insert.1
1040}
1041
; Multi-foldable-use case: one inverted operand (~src2) feeds two separate
; ands. The checks below show the s_xor_b64 computed once and reused by both
; s_and_b64 instructions.
1042define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) {
1043; GFX6-LABEL: s_andn2_v4i16_multi_foldable_use:
1044; GFX6:       ; %bb.0:
1045; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
1046; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
1047; GFX6-NEXT:    s_or_b32 s0, s0, s1
1048; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
1049; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
1050; GFX6-NEXT:    s_or_b32 s1, s1, s2
1051; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
1052; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
1053; GFX6-NEXT:    s_or_b32 s2, s2, s3
1054; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
1055; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
1056; GFX6-NEXT:    s_or_b32 s3, s3, s4
1057; GFX6-NEXT:    s_lshl_b32 s4, s11, 16
1058; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
1059; GFX6-NEXT:    s_or_b32 s4, s4, s5
1060; GFX6-NEXT:    s_lshl_b32 s5, s13, 16
1061; GFX6-NEXT:    s_and_b32 s6, s12, 0xffff
1062; GFX6-NEXT:    s_or_b32 s5, s5, s6
1063; GFX6-NEXT:    s_mov_b32 s6, -1
1064; GFX6-NEXT:    s_mov_b32 s7, s6
1065; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
1066; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
1067; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
1068; GFX6-NEXT:    ; return to shader part epilog
1069;
1070; GFX9-LABEL: s_andn2_v4i16_multi_foldable_use:
1071; GFX9:       ; %bb.0:
1072; GFX9-NEXT:    s_mov_b32 s0, -1
1073; GFX9-NEXT:    s_mov_b32 s1, s0
1074; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
1075; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], s[6:7]
1076; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[6:7]
1077; GFX9-NEXT:    ; return to shader part epilog
1078;
1079; GFX10PLUS-LABEL: s_andn2_v4i16_multi_foldable_use:
1080; GFX10PLUS:       ; %bb.0:
1081; GFX10PLUS-NEXT:    s_mov_b32 s0, -1
1082; GFX10PLUS-NEXT:    s_mov_b32 s1, s0
1083; GFX10PLUS-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
1084; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[2:3], s[6:7]
1085; GFX10PLUS-NEXT:    s_and_b64 s[2:3], s[4:5], s[6:7]
1086; GFX10PLUS-NEXT:    ; return to shader part epilog
1087  %not.src2 = xor <4 x i16> %src2, <i16 -1, i16 -1, i16 -1, i16 -1>
1088  %and0 = and <4 x i16> %src0, %not.src2
1089  %and1 = and <4 x i16> %src1, %not.src2
1090
1091  %cast.0 = bitcast <4 x i16> %and0 to i64
1092  %cast.1 = bitcast <4 x i16> %and1 to i64
1093  %insert.0 = insertvalue { i64, i64 } undef, i64 %cast.0, 0
1094  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %cast.1, 1
1095  ret { i64, i64 } %insert.1
1096}
1097
; VALU and-not of <4 x i16>: src0 & ~src1 with a splat all-ones NOT mask. The
; checks show GFX9+ operating on the two packed 32-bit halves with v_xor/v_and
; pairs, while GFX6 packs the eight unpacked 16-bit inputs first and unpacks
; the result afterwards.
1098define <4 x i16> @v_andn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) {
1099; GFX6-LABEL: v_andn2_v4i16:
1100; GFX6:       ; %bb.0:
1101; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1102; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1103; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1104; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
1105; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
1106; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1107; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
1108; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
1109; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v4
1110; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
1111; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
1112; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v6
1113; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
1114; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
1115; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v3
1116; GFX6-NEXT:    v_and_b32_e32 v0, v0, v2
1117; GFX6-NEXT:    v_and_b32_e32 v2, v1, v3
1118; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1119; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
1120; GFX6-NEXT:    s_setpc_b64 s[30:31]
1121;
1122; GFX9-LABEL: v_andn2_v4i16:
1123; GFX9:       ; %bb.0:
1124; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1125; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
1126; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
1127; GFX9-NEXT:    v_and_b32_e32 v0, v0, v2
1128; GFX9-NEXT:    v_and_b32_e32 v1, v1, v3
1129; GFX9-NEXT:    s_setpc_b64 s[30:31]
1130;
1131; GFX10PLUS-LABEL: v_andn2_v4i16:
1132; GFX10PLUS:       ; %bb.0:
1133; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1134; GFX10PLUS-NEXT:    v_xor_b32_e32 v2, -1, v2
1135; GFX10PLUS-NEXT:    v_xor_b32_e32 v3, -1, v3
1136; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v2
1137; GFX10PLUS-NEXT:    v_and_b32_e32 v1, v1, v3
1138; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1139  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
1140  %and = and <4 x i16> %src0, %not.src1
1141  ret <4 x i16> %and
1142}
1143