xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll (revision 8f6a1a07cb85980013c70d5af6d28f5fcf75e732)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
4; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
5; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
6
; or(src0, not(src1)) with both i32 operands marked inreg. The assertions
; expect the xor/or pair to be selected as one s_orn2_b32 (printed as
; s_or_not1_b32 for gfx1100).
7define amdgpu_ps i32 @s_orn2_i32(i32 inreg %src0, i32 inreg %src1) {
8; GCN-LABEL: s_orn2_i32:
9; GCN:       ; %bb.0:
10; GCN-NEXT:    s_orn2_b32 s0, s2, s3
11; GCN-NEXT:    ; return to shader part epilog
12;
13; GFX10-LABEL: s_orn2_i32:
14; GFX10:       ; %bb.0:
15; GFX10-NEXT:    s_orn2_b32 s0, s2, s3
16; GFX10-NEXT:    ; return to shader part epilog
17;
18; GFX11-LABEL: s_orn2_i32:
19; GFX11:       ; %bb.0:
20; GFX11-NEXT:    s_or_not1_b32 s0, s2, s3
21; GFX11-NEXT:    ; return to shader part epilog
22  %not.src1 = xor i32 %src1, -1
23  %or = or i32 %src0, %not.src1
24  ret i32 %or
25}
26
; Same pattern as the plain inreg i32 case, but the inverted value is the
; left-hand operand of the or. The assertions expect the same single
; instruction with the same register operand order.
27define amdgpu_ps i32 @s_orn2_i32_commute(i32 inreg %src0, i32 inreg %src1) {
28; GCN-LABEL: s_orn2_i32_commute:
29; GCN:       ; %bb.0:
30; GCN-NEXT:    s_orn2_b32 s0, s2, s3
31; GCN-NEXT:    ; return to shader part epilog
32;
33; GFX10-LABEL: s_orn2_i32_commute:
34; GFX10:       ; %bb.0:
35; GFX10-NEXT:    s_orn2_b32 s0, s2, s3
36; GFX10-NEXT:    ; return to shader part epilog
37;
38; GFX11-LABEL: s_orn2_i32_commute:
39; GFX11:       ; %bb.0:
40; GFX11-NEXT:    s_or_not1_b32 s0, s2, s3
41; GFX11-NEXT:    ; return to shader part epilog
42  %not.src1 = xor i32 %src1, -1
43  %or = or i32 %not.src1, %src0
44  ret i32 %or
45}
46
; The inverted value has two users: the or and the second field of the
; returned struct. The assertions expect the or to still use the combined
; or-not instruction while a separate s_not_b32 produces the returned value.
; The result aggregate is seeded with poison; both fields are overwritten
; before the return.
47define amdgpu_ps { i32, i32 } @s_orn2_i32_multi_use(i32 inreg %src0, i32 inreg %src1) {
48; GCN-LABEL: s_orn2_i32_multi_use:
49; GCN:       ; %bb.0:
50; GCN-NEXT:    s_not_b32 s1, s3
51; GCN-NEXT:    s_orn2_b32 s0, s2, s3
52; GCN-NEXT:    ; return to shader part epilog
53;
54; GFX10-LABEL: s_orn2_i32_multi_use:
55; GFX10:       ; %bb.0:
56; GFX10-NEXT:    s_orn2_b32 s0, s2, s3
57; GFX10-NEXT:    s_not_b32 s1, s3
58; GFX10-NEXT:    ; return to shader part epilog
59;
60; GFX11-LABEL: s_orn2_i32_multi_use:
61; GFX11:       ; %bb.0:
62; GFX11-NEXT:    s_or_not1_b32 s0, s2, s3
63; GFX11-NEXT:    s_not_b32 s1, s3
64; GFX11-NEXT:    ; return to shader part epilog
65  %not.src1 = xor i32 %src1, -1
66  %or = or i32 %src0, %not.src1
67  %insert.0 = insertvalue { i32, i32 } poison, i32 %or, 0
68  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %not.src1, 1
69  ret { i32, i32 } %insert.1
70}
71
; One inverted value feeds two ors. The assertions expect each or to be
; selected as its own combined or-not instruction, with no standalone not.
; The result aggregate is seeded with poison; both fields are overwritten
; before the return.
72define amdgpu_ps { i32, i32 } @s_orn2_i32_multi_foldable_use(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) {
73; GCN-LABEL: s_orn2_i32_multi_foldable_use:
74; GCN:       ; %bb.0:
75; GCN-NEXT:    s_orn2_b32 s0, s2, s4
76; GCN-NEXT:    s_orn2_b32 s1, s3, s4
77; GCN-NEXT:    ; return to shader part epilog
78;
79; GFX10-LABEL: s_orn2_i32_multi_foldable_use:
80; GFX10:       ; %bb.0:
81; GFX10-NEXT:    s_orn2_b32 s0, s2, s4
82; GFX10-NEXT:    s_orn2_b32 s1, s3, s4
83; GFX10-NEXT:    ; return to shader part epilog
84;
85; GFX11-LABEL: s_orn2_i32_multi_foldable_use:
86; GFX11:       ; %bb.0:
87; GFX11-NEXT:    s_or_not1_b32 s0, s2, s4
88; GFX11-NEXT:    s_or_not1_b32 s1, s3, s4
89; GFX11-NEXT:    ; return to shader part epilog
90  %not.src2 = xor i32 %src2, -1
91  %or0 = or i32 %src0, %not.src2
92  %or1 = or i32 %src1, %not.src2
93  %insert.0 = insertvalue { i32, i32 } poison, i32 %or0, 0
94  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %or1, 1
95  ret { i32, i32 } %insert.1
96}
97
; or(src0, not(src1)) on i32 operands without inreg, default calling
; convention. The assertions expect a separate v_not_b32 followed by a
; v_or_b32 on every tested target.
98define i32 @v_orn2_i32(i32 %src0, i32 %src1) {
99; GCN-LABEL: v_orn2_i32:
100; GCN:       ; %bb.0:
101; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102; GCN-NEXT:    v_not_b32_e32 v1, v1
103; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
104; GCN-NEXT:    s_setpc_b64 s[30:31]
105;
106; GFX10PLUS-LABEL: v_orn2_i32:
107; GFX10PLUS:       ; %bb.0:
108; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109; GFX10PLUS-NEXT:    v_not_b32_e32 v1, v1
110; GFX10PLUS-NEXT:    v_or_b32_e32 v0, v0, v1
111; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
112  %not.src1 = xor i32 %src1, -1
113  %or = or i32 %src0, %not.src1
114  ret i32 %or
115}
116
; Mixed operands: src0 is inreg, src1 is not. The assertions expect the not
; to be a v_not_b32 on src1, followed by a v_or_b32 that reads src0 directly
; from s2. The i32 result is bitcast to float for the return.
117define amdgpu_ps float @v_orn2_i32_sv(i32 inreg %src0, i32 %src1) {
118; GCN-LABEL: v_orn2_i32_sv:
119; GCN:       ; %bb.0:
120; GCN-NEXT:    v_not_b32_e32 v0, v0
121; GCN-NEXT:    v_or_b32_e32 v0, s2, v0
122; GCN-NEXT:    ; return to shader part epilog
123;
124; GFX10PLUS-LABEL: v_orn2_i32_sv:
125; GFX10PLUS:       ; %bb.0:
126; GFX10PLUS-NEXT:    v_not_b32_e32 v0, v0
127; GFX10PLUS-NEXT:    v_or_b32_e32 v0, s2, v0
128; GFX10PLUS-NEXT:    ; return to shader part epilog
129  %not.src1 = xor i32 %src1, -1
130  %or = or i32 %src0, %not.src1
131  %cast = bitcast i32 %or to float
132  ret float %cast
133}
134
; Mixed operands: src0 is not inreg, src1 is. The assertions expect the not
; to be an s_not_b32 on src1, with the or done by a v_or_b32 taking that
; scalar result as its first source. The i32 result is bitcast to float for
; the return.
135define amdgpu_ps float @v_orn2_i32_vs(i32 %src0, i32 inreg %src1) {
136; GCN-LABEL: v_orn2_i32_vs:
137; GCN:       ; %bb.0:
138; GCN-NEXT:    s_not_b32 s0, s2
139; GCN-NEXT:    v_or_b32_e32 v0, s0, v0
140; GCN-NEXT:    ; return to shader part epilog
141;
142; GFX10PLUS-LABEL: v_orn2_i32_vs:
143; GFX10PLUS:       ; %bb.0:
144; GFX10PLUS-NEXT:    s_not_b32 s0, s2
145; GFX10PLUS-NEXT:    v_or_b32_e32 v0, s0, v0
146; GFX10PLUS-NEXT:    ; return to shader part epilog
147  %not.src1 = xor i32 %src1, -1
148  %or = or i32 %src0, %not.src1
149  %cast = bitcast i32 %or to float
150  ret float %cast
151}
152
; 64-bit variant of the inreg pattern. The assertions expect one s_orn2_b64
; on register pairs (printed as s_or_not1_b64 for gfx1100).
153define amdgpu_ps i64 @s_orn2_i64(i64 inreg %src0, i64 inreg %src1) {
154; GCN-LABEL: s_orn2_i64:
155; GCN:       ; %bb.0:
156; GCN-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
157; GCN-NEXT:    ; return to shader part epilog
158;
159; GFX10-LABEL: s_orn2_i64:
160; GFX10:       ; %bb.0:
161; GFX10-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
162; GFX10-NEXT:    ; return to shader part epilog
163;
164; GFX11-LABEL: s_orn2_i64:
165; GFX11:       ; %bb.0:
166; GFX11-NEXT:    s_or_not1_b64 s[0:1], s[2:3], s[4:5]
167; GFX11-NEXT:    ; return to shader part epilog
168  %not.src1 = xor i64 %src1, -1
169  %or = or i64 %src0, %not.src1
170  ret i64 %or
171}
172
; 64-bit inreg pattern with the inverted value as the left-hand operand of
; the or. The assertions expect the same single 64-bit or-not instruction
; with the same register operand order as the non-commuted case.
173define amdgpu_ps i64 @s_orn2_i64_commute(i64 inreg %src0, i64 inreg %src1) {
174; GCN-LABEL: s_orn2_i64_commute:
175; GCN:       ; %bb.0:
176; GCN-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
177; GCN-NEXT:    ; return to shader part epilog
178;
179; GFX10-LABEL: s_orn2_i64_commute:
180; GFX10:       ; %bb.0:
181; GFX10-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
182; GFX10-NEXT:    ; return to shader part epilog
183;
184; GFX11-LABEL: s_orn2_i64_commute:
185; GFX11:       ; %bb.0:
186; GFX11-NEXT:    s_or_not1_b64 s[0:1], s[2:3], s[4:5]
187; GFX11-NEXT:    ; return to shader part epilog
188  %not.src1 = xor i64 %src1, -1
189  %or = or i64 %not.src1, %src0
190  ret i64 %or
191}
192
; One inverted i64 value feeds two ors. The assertions expect two 64-bit
; or-not instructions and no standalone not.
; The result aggregate is seeded with poison; both fields are overwritten
; before the return.
193define amdgpu_ps { i64, i64 } @s_orn2_i64_multi_foldable_use(i64 inreg %src0, i64 inreg %src1, i64 inreg %src2) {
194; GCN-LABEL: s_orn2_i64_multi_foldable_use:
195; GCN:       ; %bb.0:
196; GCN-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[6:7]
197; GCN-NEXT:    s_orn2_b64 s[2:3], s[4:5], s[6:7]
198; GCN-NEXT:    ; return to shader part epilog
199;
200; GFX10-LABEL: s_orn2_i64_multi_foldable_use:
201; GFX10:       ; %bb.0:
202; GFX10-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[6:7]
203; GFX10-NEXT:    s_orn2_b64 s[2:3], s[4:5], s[6:7]
204; GFX10-NEXT:    ; return to shader part epilog
205;
206; GFX11-LABEL: s_orn2_i64_multi_foldable_use:
207; GFX11:       ; %bb.0:
208; GFX11-NEXT:    s_or_not1_b64 s[0:1], s[2:3], s[6:7]
209; GFX11-NEXT:    s_or_not1_b64 s[2:3], s[4:5], s[6:7]
210; GFX11-NEXT:    ; return to shader part epilog
211  %not.src2 = xor i64 %src2, -1
212  %or0 = or i64 %src0, %not.src2
213  %or1 = or i64 %src1, %not.src2
214  %insert.0 = insertvalue { i64, i64 } poison, i64 %or0, 0
215  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %or1, 1
216  ret { i64, i64 } %insert.1
217}
218
; The inverted i64 value is used by the or and also returned. The assertions
; expect a 64-bit or-not for the or plus a separate s_not_b64 for the
; returned value. On the tahiti and gfx900 runs the not result is computed
; into s[6:7] and then copied into s2/s3 with two s_mov_b32.
; The result aggregate is seeded with poison; both fields are overwritten
; before the return.
219define amdgpu_ps { i64, i64 } @s_orn2_i64_multi_use(i64 inreg %src0, i64 inreg %src1) {
220; GCN-LABEL: s_orn2_i64_multi_use:
221; GCN:       ; %bb.0:
222; GCN-NEXT:    s_not_b64 s[6:7], s[4:5]
223; GCN-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
224; GCN-NEXT:    s_mov_b32 s2, s6
225; GCN-NEXT:    s_mov_b32 s3, s7
226; GCN-NEXT:    ; return to shader part epilog
227;
228; GFX10-LABEL: s_orn2_i64_multi_use:
229; GFX10:       ; %bb.0:
230; GFX10-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
231; GFX10-NEXT:    s_not_b64 s[2:3], s[4:5]
232; GFX10-NEXT:    ; return to shader part epilog
233;
234; GFX11-LABEL: s_orn2_i64_multi_use:
235; GFX11:       ; %bb.0:
236; GFX11-NEXT:    s_or_not1_b64 s[0:1], s[2:3], s[4:5]
237; GFX11-NEXT:    s_not_b64 s[2:3], s[4:5]
238; GFX11-NEXT:    ; return to shader part epilog
239  %not.src1 = xor i64 %src1, -1
240  %or = or i64 %src0, %not.src1
241  %insert.0 = insertvalue { i64, i64 } poison, i64 %or, 0
242  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %not.src1, 1
243  ret { i64, i64 } %insert.1
244}
245
; 64-bit or(src0, not(src1)) without inreg, default calling convention. The
; assertions expect the operation to be done per 32-bit half: two v_not_b32
; followed by two v_or_b32.
246define i64 @v_orn2_i64(i64 %src0, i64 %src1) {
247; GCN-LABEL: v_orn2_i64:
248; GCN:       ; %bb.0:
249; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
250; GCN-NEXT:    v_not_b32_e32 v2, v2
251; GCN-NEXT:    v_not_b32_e32 v3, v3
252; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
253; GCN-NEXT:    v_or_b32_e32 v1, v1, v3
254; GCN-NEXT:    s_setpc_b64 s[30:31]
255;
256; GFX10PLUS-LABEL: v_orn2_i64:
257; GFX10PLUS:       ; %bb.0:
258; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259; GFX10PLUS-NEXT:    v_not_b32_e32 v2, v2
260; GFX10PLUS-NEXT:    v_not_b32_e32 v3, v3
261; GFX10PLUS-NEXT:    v_or_b32_e32 v0, v0, v2
262; GFX10PLUS-NEXT:    v_or_b32_e32 v1, v1, v3
263; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
264  %not.src1 = xor i64 %src1, -1
265  %or = or i64 %src0, %not.src1
266  ret i64 %or
267}
268
; 64-bit mixed operands: src0 is inreg, src1 is not. The assertions expect
; two v_not_b32 on the halves of src1 and two v_or_b32 that read the halves
; of src0 from s2 and s3. The i64 result is bitcast to <2 x float> for the
; return.
269define amdgpu_ps <2 x float> @v_orn2_i64_sv(i64 inreg %src0, i64 %src1) {
270; GCN-LABEL: v_orn2_i64_sv:
271; GCN:       ; %bb.0:
272; GCN-NEXT:    v_not_b32_e32 v0, v0
273; GCN-NEXT:    v_not_b32_e32 v1, v1
274; GCN-NEXT:    v_or_b32_e32 v0, s2, v0
275; GCN-NEXT:    v_or_b32_e32 v1, s3, v1
276; GCN-NEXT:    ; return to shader part epilog
277;
278; GFX10PLUS-LABEL: v_orn2_i64_sv:
279; GFX10PLUS:       ; %bb.0:
280; GFX10PLUS-NEXT:    v_not_b32_e32 v0, v0
281; GFX10PLUS-NEXT:    v_not_b32_e32 v1, v1
282; GFX10PLUS-NEXT:    v_or_b32_e32 v0, s2, v0
283; GFX10PLUS-NEXT:    v_or_b32_e32 v1, s3, v1
284; GFX10PLUS-NEXT:    ; return to shader part epilog
285  %not.src1 = xor i64 %src1, -1
286  %or = or i64 %src0, %not.src1
287  %cast = bitcast i64 %or to <2 x float>
288  ret <2 x float> %cast
289}
290
; 64-bit mixed operands: src0 is not inreg, src1 is. The assertions expect a
; single s_not_b64 on src1 followed by two v_or_b32, one per 32-bit half.
; The i64 result is bitcast to <2 x float> for the return.
291define amdgpu_ps <2 x float> @v_orn2_i64_vs(i64 %src0, i64 inreg %src1) {
292; GCN-LABEL: v_orn2_i64_vs:
293; GCN:       ; %bb.0:
294; GCN-NEXT:    s_not_b64 s[0:1], s[2:3]
295; GCN-NEXT:    v_or_b32_e32 v0, s0, v0
296; GCN-NEXT:    v_or_b32_e32 v1, s1, v1
297; GCN-NEXT:    ; return to shader part epilog
298;
299; GFX10PLUS-LABEL: v_orn2_i64_vs:
300; GFX10PLUS:       ; %bb.0:
301; GFX10PLUS-NEXT:    s_not_b64 s[0:1], s[2:3]
302; GFX10PLUS-NEXT:    v_or_b32_e32 v0, s0, v0
303; GFX10PLUS-NEXT:    v_or_b32_e32 v1, s1, v1
304; GFX10PLUS-NEXT:    ; return to shader part epilog
305  %not.src1 = xor i64 %src1, -1
306  %or = or i64 %src0, %not.src1
307  %cast = bitcast i64 %or to <2 x float>
308  ret <2 x float> %cast
309}
310
; <2 x i32> inreg operands, inverted with an all-ones vector. The assertions
; expect the whole vector operation to be selected as one 64-bit or-not
; instruction, the same as the i64 case.
311define amdgpu_ps <2 x i32> @s_orn2_v2i32(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
312; GCN-LABEL: s_orn2_v2i32:
313; GCN:       ; %bb.0:
314; GCN-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
315; GCN-NEXT:    ; return to shader part epilog
316;
317; GFX10-LABEL: s_orn2_v2i32:
318; GFX10:       ; %bb.0:
319; GFX10-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
320; GFX10-NEXT:    ; return to shader part epilog
321;
322; GFX11-LABEL: s_orn2_v2i32:
323; GFX11:       ; %bb.0:
324; GFX11-NEXT:    s_or_not1_b64 s[0:1], s[2:3], s[4:5]
325; GFX11-NEXT:    ; return to shader part epilog
326  %not.src1 = xor <2 x i32> %src1, <i32 -1, i32 -1>
327  %or = or <2 x i32> %src0, %not.src1
328  ret <2 x i32> %or
329}
330
; <2 x i32> inreg pattern with the inverted vector as the left-hand operand
; of the or. The assertions expect the same single 64-bit or-not instruction
; as the non-commuted case.
331define amdgpu_ps <2 x i32> @s_orn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
332; GCN-LABEL: s_orn2_v2i32_commute:
333; GCN:       ; %bb.0:
334; GCN-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
335; GCN-NEXT:    ; return to shader part epilog
336;
337; GFX10-LABEL: s_orn2_v2i32_commute:
338; GFX10:       ; %bb.0:
339; GFX10-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
340; GFX10-NEXT:    ; return to shader part epilog
341;
342; GFX11-LABEL: s_orn2_v2i32_commute:
343; GFX11:       ; %bb.0:
344; GFX11-NEXT:    s_or_not1_b64 s[0:1], s[2:3], s[4:5]
345; GFX11-NEXT:    ; return to shader part epilog
346  %not.src1 = xor <2 x i32> %src1, <i32 -1, i32 -1>
347  %or = or <2 x i32> %not.src1, %src0
348  ret <2 x i32> %or
349}
350
; i16 inreg operands. The assertions expect the 16-bit xor/or pair to be
; selected as one 32-bit or-not instruction (s_orn2_b32, or s_or_not1_b32
; for gfx1100).
351define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
352; GCN-LABEL: s_orn2_i16:
353; GCN:       ; %bb.0:
354; GCN-NEXT:    s_orn2_b32 s0, s2, s3
355; GCN-NEXT:    ; return to shader part epilog
356;
357; GFX10-LABEL: s_orn2_i16:
358; GFX10:       ; %bb.0:
359; GFX10-NEXT:    s_orn2_b32 s0, s2, s3
360; GFX10-NEXT:    ; return to shader part epilog
361;
362; GFX11-LABEL: s_orn2_i16:
363; GFX11:       ; %bb.0:
364; GFX11-NEXT:    s_or_not1_b32 s0, s2, s3
365; GFX11-NEXT:    ; return to shader part epilog
366  %not.src1 = xor i16 %src1, -1
367  %or = or i16 %src0, %not.src1
368  ret i16 %or
369}
370
; i16 inreg pattern with the inverted value as the left-hand operand of the
; or. The assertions expect the same single 32-bit or-not instruction as the
; non-commuted case.
371define amdgpu_ps i16 @s_orn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
372; GCN-LABEL: s_orn2_i16_commute:
373; GCN:       ; %bb.0:
374; GCN-NEXT:    s_orn2_b32 s0, s2, s3
375; GCN-NEXT:    ; return to shader part epilog
376;
377; GFX10-LABEL: s_orn2_i16_commute:
378; GFX10:       ; %bb.0:
379; GFX10-NEXT:    s_orn2_b32 s0, s2, s3
380; GFX10-NEXT:    ; return to shader part epilog
381;
382; GFX11-LABEL: s_orn2_i16_commute:
383; GFX11:       ; %bb.0:
384; GFX11-NEXT:    s_or_not1_b32 s0, s2, s3
385; GFX11-NEXT:    ; return to shader part epilog
386  %not.src1 = xor i16 %src1, -1
387  %or = or i16 %not.src1, %src0
388  ret i16 %or
389}
390
; The inverted i16 value is used by the or and also returned. The assertions
; expect a 32-bit or-not for the or, and the returned inverted value to be
; produced by s_xor_b32 with -1 (not by an s_not_b32 as in the i32 case).
; The result aggregate is seeded with poison; both fields are overwritten
; before the return.
391define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
392; GCN-LABEL: s_orn2_i16_multi_use:
393; GCN:       ; %bb.0:
394; GCN-NEXT:    s_xor_b32 s1, s3, -1
395; GCN-NEXT:    s_orn2_b32 s0, s2, s3
396; GCN-NEXT:    ; return to shader part epilog
397;
398; GFX10-LABEL: s_orn2_i16_multi_use:
399; GFX10:       ; %bb.0:
400; GFX10-NEXT:    s_orn2_b32 s0, s2, s3
401; GFX10-NEXT:    s_xor_b32 s1, s3, -1
402; GFX10-NEXT:    ; return to shader part epilog
403;
404; GFX11-LABEL: s_orn2_i16_multi_use:
405; GFX11:       ; %bb.0:
406; GFX11-NEXT:    s_or_not1_b32 s0, s2, s3
407; GFX11-NEXT:    s_xor_b32 s1, s3, -1
408; GFX11-NEXT:    ; return to shader part epilog
409  %not.src1 = xor i16 %src1, -1
410  %or = or i16 %src0, %not.src1
411  %insert.0 = insertvalue { i16, i16 } poison, i16 %or, 0
412  %insert.1 = insertvalue { i16, i16 } %insert.0, i16 %not.src1, 1
413  ret { i16, i16 } %insert.1
414}
415
; One inverted i16 value feeds two ors. The assertions expect two 32-bit
; or-not instructions and no standalone xor.
; The result aggregate is seeded with poison; both fields are overwritten
; before the return.
416define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
417; GCN-LABEL: s_orn2_i16_multi_foldable_use:
418; GCN:       ; %bb.0:
419; GCN-NEXT:    s_orn2_b32 s0, s2, s4
420; GCN-NEXT:    s_orn2_b32 s1, s3, s4
421; GCN-NEXT:    ; return to shader part epilog
422;
423; GFX10-LABEL: s_orn2_i16_multi_foldable_use:
424; GFX10:       ; %bb.0:
425; GFX10-NEXT:    s_orn2_b32 s0, s2, s4
426; GFX10-NEXT:    s_orn2_b32 s1, s3, s4
427; GFX10-NEXT:    ; return to shader part epilog
428;
429; GFX11-LABEL: s_orn2_i16_multi_foldable_use:
430; GFX11:       ; %bb.0:
431; GFX11-NEXT:    s_or_not1_b32 s0, s2, s4
432; GFX11-NEXT:    s_or_not1_b32 s1, s3, s4
433; GFX11-NEXT:    ; return to shader part epilog
434  %not.src2 = xor i16 %src2, -1
435  %or0 = or i16 %src0, %not.src2
436  %or1 = or i16 %src1, %not.src2
437  %insert.0 = insertvalue { i16, i16 } poison, i16 %or0, 0
438  %insert.1 = insertvalue { i16, i16 } %insert.0, i16 %or1, 1
439  ret { i16, i16 } %insert.1
440}
441
; i16 or(src0, not(src1)) without inreg, default calling convention. The
; assertions expect a 32-bit v_xor_b32 with -1 followed by a v_or_b32.
442define i16 @v_orn2_i16(i16 %src0, i16 %src1) {
443; GCN-LABEL: v_orn2_i16:
444; GCN:       ; %bb.0:
445; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
446; GCN-NEXT:    v_xor_b32_e32 v1, -1, v1
447; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
448; GCN-NEXT:    s_setpc_b64 s[30:31]
449;
450; GFX10PLUS-LABEL: v_orn2_i16:
451; GFX10PLUS:       ; %bb.0:
452; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
453; GFX10PLUS-NEXT:    v_xor_b32_e32 v1, -1, v1
454; GFX10PLUS-NEXT:    v_or_b32_e32 v0, v0, v1
455; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
456  %not.src1 = xor i16 %src1, -1
457  %or = or i16 %src0, %not.src1
458  ret i16 %or
459}
460
; i16 mixed operands: src0 is inreg, src1 is not. The i16 result is
; zero-extended to i32 and bitcast to float. The assertions expect v_xor_b32
; with -1, a v_or_b32 reading src0 from s2, then a v_and_b32 with 0xffff for
; the zero-extension.
461define amdgpu_ps float @v_orn2_i16_sv(i16 inreg %src0, i16 %src1) {
462; GCN-LABEL: v_orn2_i16_sv:
463; GCN:       ; %bb.0:
464; GCN-NEXT:    v_xor_b32_e32 v0, -1, v0
465; GCN-NEXT:    v_or_b32_e32 v0, s2, v0
466; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
467; GCN-NEXT:    ; return to shader part epilog
468;
469; GFX10PLUS-LABEL: v_orn2_i16_sv:
470; GFX10PLUS:       ; %bb.0:
471; GFX10PLUS-NEXT:    v_xor_b32_e32 v0, -1, v0
472; GFX10PLUS-NEXT:    v_or_b32_e32 v0, s2, v0
473; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
474; GFX10PLUS-NEXT:    ; return to shader part epilog
475  %not.src1 = xor i16 %src1, -1
476  %or = or i16 %src0, %not.src1
477  %zext = zext i16 %or to i32
478  %cast.zext = bitcast i32 %zext to float
479  ret float %cast.zext
480}
481
; i16 mixed operands: src0 is not inreg, src1 is. The i16 result is
; zero-extended to i32 and bitcast to float. The assertions expect the
; inversion as s_xor_b32 with -1, then a v_or_b32, then a v_and_b32 with
; 0xffff for the zero-extension.
482define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) {
483; GCN-LABEL: v_orn2_i16_vs:
484; GCN:       ; %bb.0:
485; GCN-NEXT:    s_xor_b32 s0, s2, -1
486; GCN-NEXT:    v_or_b32_e32 v0, s0, v0
487; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
488; GCN-NEXT:    ; return to shader part epilog
489;
490; GFX10PLUS-LABEL: v_orn2_i16_vs:
491; GFX10PLUS:       ; %bb.0:
492; GFX10PLUS-NEXT:    s_xor_b32 s0, s2, -1
493; GFX10PLUS-NEXT:    v_or_b32_e32 v0, s0, v0
494; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
495; GFX10PLUS-NEXT:    ; return to shader part epilog
496  %not.src1 = xor i16 %src1, -1
497  %or = or i16 %src0, %not.src1
498  %zext = zext i16 %or to i32
499  %cast.zext = bitcast i32 %zext to float
500  ret float %cast.zext
501}
502
; <2 x i16> inreg operands, result bitcast to i32. For the tahiti run the
; assertions expect each vector to be packed from two registers with
; shift/mask/or, then a separate s_xor_b32 with -1 and an s_or_b32 (no
; combined or-not). For gfx900 and newer they expect a single 32-bit or-not
; instruction.
503define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
504; GFX6-LABEL: s_orn2_v2i16:
505; GFX6:       ; %bb.0:
506; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
507; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
508; GFX6-NEXT:    s_or_b32 s0, s0, s1
509; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
510; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
511; GFX6-NEXT:    s_or_b32 s1, s1, s2
512; GFX6-NEXT:    s_xor_b32 s1, s1, -1
513; GFX6-NEXT:    s_or_b32 s0, s0, s1
514; GFX6-NEXT:    ; return to shader part epilog
515;
516; GFX9-LABEL: s_orn2_v2i16:
517; GFX9:       ; %bb.0:
518; GFX9-NEXT:    s_orn2_b32 s0, s2, s3
519; GFX9-NEXT:    ; return to shader part epilog
520;
521; GFX10-LABEL: s_orn2_v2i16:
522; GFX10:       ; %bb.0:
523; GFX10-NEXT:    s_orn2_b32 s0, s2, s3
524; GFX10-NEXT:    ; return to shader part epilog
525;
526; GFX11-LABEL: s_orn2_v2i16:
527; GFX11:       ; %bb.0:
528; GFX11-NEXT:    s_or_not1_b32 s0, s2, s3
529; GFX11-NEXT:    ; return to shader part epilog
530  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
531  %or = or <2 x i16> %src0, %not.src1
532  %cast = bitcast <2 x i16> %or to i32
533  ret i32 %cast
534}
535
; <2 x i16> inreg pattern with the inverted vector as the left-hand operand
; of the or. For the tahiti run the assertions expect the same pack/xor/or
; sequence as the non-commuted case except that the final s_or_b32 has its
; sources swapped. For gfx900 and newer they expect the same single 32-bit
; or-not instruction.
536define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
537; GFX6-LABEL: s_orn2_v2i16_commute:
538; GFX6:       ; %bb.0:
539; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
540; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
541; GFX6-NEXT:    s_or_b32 s0, s0, s1
542; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
543; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
544; GFX6-NEXT:    s_or_b32 s1, s1, s2
545; GFX6-NEXT:    s_xor_b32 s1, s1, -1
546; GFX6-NEXT:    s_or_b32 s0, s1, s0
547; GFX6-NEXT:    ; return to shader part epilog
548;
549; GFX9-LABEL: s_orn2_v2i16_commute:
550; GFX9:       ; %bb.0:
551; GFX9-NEXT:    s_orn2_b32 s0, s2, s3
552; GFX9-NEXT:    ; return to shader part epilog
553;
554; GFX10-LABEL: s_orn2_v2i16_commute:
555; GFX10:       ; %bb.0:
556; GFX10-NEXT:    s_orn2_b32 s0, s2, s3
557; GFX10-NEXT:    ; return to shader part epilog
558;
559; GFX11-LABEL: s_orn2_v2i16_commute:
560; GFX11:       ; %bb.0:
561; GFX11-NEXT:    s_or_not1_b32 s0, s2, s3
562; GFX11-NEXT:    ; return to shader part epilog
563  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
564  %or = or <2 x i16> %not.src1, %src0
565  %cast = bitcast <2 x i16> %or to i32
566  ret i32 %cast
567}
568
; The inverted <2 x i16> value is used by the or and also returned (both
; bitcast to i32). For the tahiti run the assertions expect the packed
; xor result to stay in s1 as the second return value and a plain s_or_b32
; for the first. For gfx900 and newer they expect a 32-bit or-not for the or
; plus an s_xor_b32 with -1 for the returned inverted value.
; The result aggregate is seeded with poison; both fields are overwritten
; before the return.
569define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
570; GFX6-LABEL: s_orn2_v2i16_multi_use:
571; GFX6:       ; %bb.0:
572; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
573; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
574; GFX6-NEXT:    s_or_b32 s0, s0, s1
575; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
576; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
577; GFX6-NEXT:    s_or_b32 s1, s1, s2
578; GFX6-NEXT:    s_xor_b32 s1, s1, -1
579; GFX6-NEXT:    s_or_b32 s0, s0, s1
580; GFX6-NEXT:    ; return to shader part epilog
581;
582; GFX9-LABEL: s_orn2_v2i16_multi_use:
583; GFX9:       ; %bb.0:
584; GFX9-NEXT:    s_xor_b32 s1, s3, -1
585; GFX9-NEXT:    s_orn2_b32 s0, s2, s3
586; GFX9-NEXT:    ; return to shader part epilog
587;
588; GFX10-LABEL: s_orn2_v2i16_multi_use:
589; GFX10:       ; %bb.0:
590; GFX10-NEXT:    s_orn2_b32 s0, s2, s3
591; GFX10-NEXT:    s_xor_b32 s1, s3, -1
592; GFX10-NEXT:    ; return to shader part epilog
593;
594; GFX11-LABEL: s_orn2_v2i16_multi_use:
595; GFX11:       ; %bb.0:
596; GFX11-NEXT:    s_or_not1_b32 s0, s2, s3
597; GFX11-NEXT:    s_xor_b32 s1, s3, -1
598; GFX11-NEXT:    ; return to shader part epilog
599  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
600  %or = or <2 x i16> %src0, %not.src1
601
602  %cast.0 = bitcast <2 x i16> %or to i32
603  %cast.1 = bitcast <2 x i16> %not.src1 to i32
604  %insert.0 = insertvalue { i32, i32 } poison, i32 %cast.0, 0
605  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %cast.1, 1
606  ret { i32, i32 } %insert.1
607}
608
; One inverted <2 x i16> value feeds two ors, both results bitcast to i32.
; For the tahiti run the assertions expect all three vectors to be packed
; with shift/mask/or, one s_xor_b32 with -1, and two s_or_b32 sharing that
; result. For gfx900 and newer they expect two 32-bit or-not instructions
; and no standalone xor.
; The result aggregate is seeded with poison; both fields are overwritten
; before the return.
609define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
610; GFX6-LABEL: s_orn2_v2i16_multi_foldable_use:
611; GFX6:       ; %bb.0:
612; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
613; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
614; GFX6-NEXT:    s_or_b32 s0, s0, s1
615; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
616; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
617; GFX6-NEXT:    s_or_b32 s1, s1, s2
618; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
619; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
620; GFX6-NEXT:    s_or_b32 s2, s2, s3
621; GFX6-NEXT:    s_xor_b32 s2, s2, -1
622; GFX6-NEXT:    s_or_b32 s0, s0, s2
623; GFX6-NEXT:    s_or_b32 s1, s1, s2
624; GFX6-NEXT:    ; return to shader part epilog
625;
626; GFX9-LABEL: s_orn2_v2i16_multi_foldable_use:
627; GFX9:       ; %bb.0:
628; GFX9-NEXT:    s_orn2_b32 s0, s2, s4
629; GFX9-NEXT:    s_orn2_b32 s1, s3, s4
630; GFX9-NEXT:    ; return to shader part epilog
631;
632; GFX10-LABEL: s_orn2_v2i16_multi_foldable_use:
633; GFX10:       ; %bb.0:
634; GFX10-NEXT:    s_orn2_b32 s0, s2, s4
635; GFX10-NEXT:    s_orn2_b32 s1, s3, s4
636; GFX10-NEXT:    ; return to shader part epilog
637;
638; GFX11-LABEL: s_orn2_v2i16_multi_foldable_use:
639; GFX11:       ; %bb.0:
640; GFX11-NEXT:    s_or_not1_b32 s0, s2, s4
641; GFX11-NEXT:    s_or_not1_b32 s1, s3, s4
642; GFX11-NEXT:    ; return to shader part epilog
643  %not.src2 = xor <2 x i16> %src2, <i16 -1, i16 -1>
644  %or0 = or <2 x i16> %src0, %not.src2
645  %or1 = or <2 x i16> %src1, %not.src2
646
647  %cast.0 = bitcast <2 x i16> %or0 to i32
648  %cast.1 = bitcast <2 x i16> %or1 to i32
649  %insert.0 = insertvalue { i32, i32 } poison, i32 %cast.0, 0
650  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %cast.1, 1
651  ret { i32, i32 } %insert.1
652}
653
; <2 x i16> or(src0, not(src1)) without inreg, default calling convention.
; For the tahiti run the assertions expect both vectors to be packed with
; shift/mask/or, then v_xor_b32 with -1 and v_or_b32, and finally a
; v_lshrrev_b32 by 16 to split the high element into v1. For gfx900 and
; newer they expect only v_xor_b32 with -1 followed by v_or_b32.
654define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
655; GFX6-LABEL: v_orn2_v2i16:
656; GFX6:       ; %bb.0:
657; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
658; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
659; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
660; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
661; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
662; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
663; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
664; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
665; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
666; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
667; GFX6-NEXT:    s_setpc_b64 s[30:31]
668;
669; GFX9-LABEL: v_orn2_v2i16:
670; GFX9:       ; %bb.0:
671; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
672; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
673; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
674; GFX9-NEXT:    s_setpc_b64 s[30:31]
675;
676; GFX10PLUS-LABEL: v_orn2_v2i16:
677; GFX10PLUS:       ; %bb.0:
678; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
679; GFX10PLUS-NEXT:    v_xor_b32_e32 v1, -1, v1
680; GFX10PLUS-NEXT:    v_or_b32_e32 v0, v0, v1
681; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
682  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
683  %or = or <2 x i16> %src0, %not.src1
684  ret <2 x i16> %or
685}
686
; <3 x i16> inreg operands, result bitcast to i48. No combined or-not is
; expected on any tested target: the assertions expect a 64-bit s_xor_b64
; against a materialised mask followed by s_or_b64, then shift/mask/or
; repacking of the result. For the tahiti run the operands are additionally
; packed from individual registers first and the mask is built as -1 in the
; low half and 0xffff in the high half.
687define amdgpu_ps i48 @s_orn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
688; GFX6-LABEL: s_orn2_v3i16:
689; GFX6:       ; %bb.0:
690; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
691; GFX6-NEXT:    s_mov_b32 s0, -1
692; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
693; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
694; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
695; GFX6-NEXT:    s_mov_b32 s1, 0xffff
696; GFX6-NEXT:    s_or_b32 s6, s5, s6
697; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
698; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
699; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
700; GFX6-NEXT:    s_xor_b64 s[0:1], s[6:7], s[0:1]
701; GFX6-NEXT:    s_or_b32 s2, s2, s3
702; GFX6-NEXT:    s_and_b32 s3, s4, 0xffff
703; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
704; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
705; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
706; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
707; GFX6-NEXT:    s_or_b32 s0, s0, s2
708; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
709; GFX6-NEXT:    ; return to shader part epilog
710;
711; GFX9-LABEL: s_orn2_v3i16:
712; GFX9:       ; %bb.0:
713; GFX9-NEXT:    s_mov_b64 s[0:1], -1
714; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
715; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
716; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
717; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
718; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
719; GFX9-NEXT:    s_or_b32 s0, s0, s2
720; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
721; GFX9-NEXT:    ; return to shader part epilog
722;
723; GFX10PLUS-LABEL: s_orn2_v3i16:
724; GFX10PLUS:       ; %bb.0:
725; GFX10PLUS-NEXT:    s_mov_b64 s[0:1], -1
726; GFX10PLUS-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
727; GFX10PLUS-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
728; GFX10PLUS-NEXT:    s_lshr_b32 s2, s0, 16
729; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
730; GFX10PLUS-NEXT:    s_lshl_b32 s2, s2, 16
731; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
732; GFX10PLUS-NEXT:    s_or_b32 s0, s0, s2
733; GFX10PLUS-NEXT:    ; return to shader part epilog
734  %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
735  %or = or <3 x i16> %src0, %not.src1
736  %cast = bitcast <3 x i16> %or to i48
737  ret i48 %cast
738}
739
; <3 x i16> inreg pattern with the inverted vector as the left-hand operand
; of the or, result bitcast to i48. The expected code matches the
; non-commuted <3 x i16> case except that the s_or_b64 has its two source
; pairs swapped; no combined or-not is expected on any tested target.
740define amdgpu_ps i48 @s_orn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
741; GFX6-LABEL: s_orn2_v3i16_commute:
742; GFX6:       ; %bb.0:
743; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
744; GFX6-NEXT:    s_mov_b32 s0, -1
745; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
746; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
747; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
748; GFX6-NEXT:    s_mov_b32 s1, 0xffff
749; GFX6-NEXT:    s_or_b32 s6, s5, s6
750; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
751; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
752; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
753; GFX6-NEXT:    s_xor_b64 s[0:1], s[6:7], s[0:1]
754; GFX6-NEXT:    s_or_b32 s2, s2, s3
755; GFX6-NEXT:    s_and_b32 s3, s4, 0xffff
756; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
757; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
758; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
759; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
760; GFX6-NEXT:    s_or_b32 s0, s0, s2
761; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
762; GFX6-NEXT:    ; return to shader part epilog
763;
764; GFX9-LABEL: s_orn2_v3i16_commute:
765; GFX9:       ; %bb.0:
766; GFX9-NEXT:    s_mov_b64 s[0:1], -1
767; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
768; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
769; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
770; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
771; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
772; GFX9-NEXT:    s_or_b32 s0, s0, s2
773; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
774; GFX9-NEXT:    ; return to shader part epilog
775;
776; GFX10PLUS-LABEL: s_orn2_v3i16_commute:
777; GFX10PLUS:       ; %bb.0:
778; GFX10PLUS-NEXT:    s_mov_b64 s[0:1], -1
779; GFX10PLUS-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
780; GFX10PLUS-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
781; GFX10PLUS-NEXT:    s_lshr_b32 s2, s0, 16
782; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
783; GFX10PLUS-NEXT:    s_lshl_b32 s2, s2, 16
784; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
785; GFX10PLUS-NEXT:    s_or_b32 s0, s0, s2
786; GFX10PLUS-NEXT:    ; return to shader part epilog
787  %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
788  %or = or <3 x i16> %not.src1, %src0
789  %cast = bitcast <3 x i16> %or to i48
790  ret i48 %cast
791}
792
; The inverted <3 x i16> value is used by the or and also returned (both
; bitcast to i48). The assertions expect one s_xor_b64 whose result feeds
; the s_or_b64 and is also repacked with shift/mask/or into the second
; return value; no combined or-not is expected on any tested target.
; The result aggregate is seeded with poison; both fields are overwritten
; before the return.
793define amdgpu_ps { i48, i48 } @s_orn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
794; GFX6-LABEL: s_orn2_v3i16_multi_use:
795; GFX6:       ; %bb.0:
796; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
797; GFX6-NEXT:    s_mov_b32 s0, -1
798; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
799; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
800; GFX6-NEXT:    s_mov_b32 s1, 0xffff
801; GFX6-NEXT:    s_or_b32 s6, s5, s6
802; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
803; GFX6-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
804; GFX6-NEXT:    s_and_b32 s1, s3, 0xffff
805; GFX6-NEXT:    s_and_b32 s0, s2, 0xffff
806; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
807; GFX6-NEXT:    s_or_b32 s0, s0, s1
808; GFX6-NEXT:    s_and_b32 s1, s4, 0xffff
809; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
810; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
811; GFX6-NEXT:    s_lshr_b32 s5, s6, 16
812; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
813; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
814; GFX6-NEXT:    s_or_b32 s0, s0, s2
815; GFX6-NEXT:    s_and_b32 s2, s6, 0xffff
816; GFX6-NEXT:    s_lshl_b32 s3, s5, 16
817; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
818; GFX6-NEXT:    s_or_b32 s2, s2, s3
819; GFX6-NEXT:    s_and_b32 s3, s7, 0xffff
820; GFX6-NEXT:    ; return to shader part epilog
821;
822; GFX9-LABEL: s_orn2_v3i16_multi_use:
823; GFX9:       ; %bb.0:
824; GFX9-NEXT:    s_mov_b64 s[0:1], -1
825; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
826; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
827; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
828; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
829; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
830; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
831; GFX9-NEXT:    s_or_b32 s0, s0, s2
832; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
833; GFX9-NEXT:    s_lshl_b32 s3, s6, 16
834; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
835; GFX9-NEXT:    s_or_b32 s2, s2, s3
836; GFX9-NEXT:    s_and_b32 s3, s5, 0xffff
837; GFX9-NEXT:    ; return to shader part epilog
838;
839; GFX10PLUS-LABEL: s_orn2_v3i16_multi_use:
840; GFX10PLUS:       ; %bb.0:
841; GFX10PLUS-NEXT:    s_mov_b64 s[0:1], -1
842; GFX10PLUS-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
843; GFX10PLUS-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
844; GFX10PLUS-NEXT:    s_lshr_b32 s3, s4, 16
845; GFX10PLUS-NEXT:    s_lshr_b32 s2, s0, 16
846; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
847; GFX10PLUS-NEXT:    s_lshl_b32 s2, s2, 16
848; GFX10PLUS-NEXT:    s_lshl_b32 s3, s3, 16
849; GFX10PLUS-NEXT:    s_or_b32 s0, s0, s2
850; GFX10PLUS-NEXT:    s_and_b32 s2, s4, 0xffff
851; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
852; GFX10PLUS-NEXT:    s_or_b32 s2, s2, s3
853; GFX10PLUS-NEXT:    s_and_b32 s3, s5, 0xffff
854; GFX10PLUS-NEXT:    ; return to shader part epilog
855  %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
856  %or = or <3 x i16> %src0, %not.src1
857  %cast.0 = bitcast <3 x i16> %or to i48
858  %cast.1 = bitcast <3 x i16> %not.src1 to i48
859  %insert.0 = insertvalue { i48, i48 } poison, i48 %cast.0, 0
860  %insert.1 = insertvalue { i48, i48 } %insert.0, i48 %cast.1, 1
861  ret { i48, i48 } %insert.1
862}
863
; Vector-register <3 x i16> case: %src1 is xor'ed with a constant vector and
; the result is OR'ed into %src0. The expected output is per-dword v_xor / v_or
; pairs; the GFX6 checks first pack the 16-bit pieces into dwords and finish
; with a v_lshrrev to split the low result dword again.
; NOTE(review): the xor mask is <-1, -1, -11>, so the third lane is not a
; bitwise NOT (the checks reflect this as 0xfff5 and -11). This looks like a
; typo for -1 - confirm intent; if it is corrected, the assertions must be
; regenerated with utils/update_llc_test_checks.py.
864define <3 x i16> @v_orn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
865; GFX6-LABEL: v_orn2_v3i16:
866; GFX6:       ; %bb.0:
867; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
868; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
869; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
870; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
871; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
872; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
873; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
874; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
875; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v5
876; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v3
877; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
878; GFX6-NEXT:    v_xor_b32_e32 v4, 0xfff5, v4
879; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
880; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
881; GFX6-NEXT:    v_or_b32_e32 v2, v1, v4
882; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
883; GFX6-NEXT:    s_setpc_b64 s[30:31]
884;
885; GFX9-LABEL: v_orn2_v3i16:
886; GFX9:       ; %bb.0:
887; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
888; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
889; GFX9-NEXT:    v_xor_b32_e32 v3, -11, v3
890; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
891; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
892; GFX9-NEXT:    s_setpc_b64 s[30:31]
893;
894; GFX10PLUS-LABEL: v_orn2_v3i16:
895; GFX10PLUS:       ; %bb.0:
896; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
897; GFX10PLUS-NEXT:    v_xor_b32_e32 v2, -1, v2
898; GFX10PLUS-NEXT:    v_xor_b32_e32 v3, -11, v3
899; GFX10PLUS-NEXT:    v_or_b32_e32 v0, v0, v2
900; GFX10PLUS-NEXT:    v_or_b32_e32 v1, v1, v3
901; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
902  %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -11>
903  %or = or <3 x i16> %src0, %not.src1
904  ret <3 x i16> %or
905}
906
; Scalar <4 x i16> case: returns (%src0 | ~%src1) bitcast to i64, where the NOT
; is written as an xor with an all-ones vector. The assertions below expect a
; materialized -1 pair, an s_xor_b64 and a separate s_or_b64 on every run line
; (no fused 64-bit or-not instruction). The GFX6 checks additionally rebuild
; each packed dword from 16-bit pieces with shift/and/or before the xor.
907define amdgpu_ps i64 @s_orn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
908; GFX6-LABEL: s_orn2_v4i16:
909; GFX6:       ; %bb.0:
910; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
911; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
912; GFX6-NEXT:    s_or_b32 s0, s0, s1
913; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
914; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
915; GFX6-NEXT:    s_or_b32 s1, s1, s2
916; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
917; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
918; GFX6-NEXT:    s_or_b32 s2, s2, s3
919; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
920; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
921; GFX6-NEXT:    s_or_b32 s3, s3, s4
922; GFX6-NEXT:    s_mov_b32 s4, -1
923; GFX6-NEXT:    s_mov_b32 s5, s4
924; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
925; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
926; GFX6-NEXT:    ; return to shader part epilog
927;
928; GFX9-LABEL: s_orn2_v4i16:
929; GFX9:       ; %bb.0:
930; GFX9-NEXT:    s_mov_b32 s0, -1
931; GFX9-NEXT:    s_mov_b32 s1, s0
932; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
933; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
934; GFX9-NEXT:    ; return to shader part epilog
935;
936; GFX10PLUS-LABEL: s_orn2_v4i16:
937; GFX10PLUS:       ; %bb.0:
938; GFX10PLUS-NEXT:    s_mov_b32 s0, -1
939; GFX10PLUS-NEXT:    s_mov_b32 s1, s0
940; GFX10PLUS-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
941; GFX10PLUS-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
942; GFX10PLUS-NEXT:    ; return to shader part epilog
943  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
944  %or = or <4 x i16> %src0, %not.src1
945  %cast = bitcast <4 x i16> %or to i64
946  ret i64 %cast
947}
948
; Commuted variant of the scalar <4 x i16> case: the inverted value is the
; first operand of the IR `or` (%not.src1 | %src0). The expected s_or_b64 in
; the assertions likewise takes the xor result as its first source operand;
; otherwise the expected sequence is the same xor-then-or shape.
949define amdgpu_ps i64 @s_orn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
950; GFX6-LABEL: s_orn2_v4i16_commute:
951; GFX6:       ; %bb.0:
952; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
953; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
954; GFX6-NEXT:    s_or_b32 s0, s0, s1
955; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
956; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
957; GFX6-NEXT:    s_or_b32 s1, s1, s2
958; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
959; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
960; GFX6-NEXT:    s_or_b32 s2, s2, s3
961; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
962; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
963; GFX6-NEXT:    s_or_b32 s3, s3, s4
964; GFX6-NEXT:    s_mov_b32 s4, -1
965; GFX6-NEXT:    s_mov_b32 s5, s4
966; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
967; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
968; GFX6-NEXT:    ; return to shader part epilog
969;
970; GFX9-LABEL: s_orn2_v4i16_commute:
971; GFX9:       ; %bb.0:
972; GFX9-NEXT:    s_mov_b32 s0, -1
973; GFX9-NEXT:    s_mov_b32 s1, s0
974; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
975; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
976; GFX9-NEXT:    ; return to shader part epilog
977;
978; GFX10PLUS-LABEL: s_orn2_v4i16_commute:
979; GFX10PLUS:       ; %bb.0:
980; GFX10PLUS-NEXT:    s_mov_b32 s0, -1
981; GFX10PLUS-NEXT:    s_mov_b32 s1, s0
982; GFX10PLUS-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
983; GFX10PLUS-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
984; GFX10PLUS-NEXT:    ; return to shader part epilog
985  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
986  %or = or <4 x i16> %not.src1, %src0
987  %cast = bitcast <4 x i16> %or to i64
988  ret i64 %cast
989}
990
; Scalar <4 x i16> case where the inverted operand has a second user: the
; function returns both (%src0 | ~%src1) and ~%src1 itself, each bitcast to
; i64. The assertions expect one s_xor_b64 whose result feeds the s_or_b64 and
; is also the second return value (copied into s2/s3 on GFX9 and GFX10PLUS,
; computed in place in s[2:3] on GFX6).
991define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
992; GFX6-LABEL: s_orn2_v4i16_multi_use:
993; GFX6:       ; %bb.0:
994; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
995; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
996; GFX6-NEXT:    s_or_b32 s0, s0, s1
997; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
998; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
999; GFX6-NEXT:    s_or_b32 s1, s1, s2
1000; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
1001; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
1002; GFX6-NEXT:    s_or_b32 s2, s2, s3
1003; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
1004; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
1005; GFX6-NEXT:    s_or_b32 s3, s3, s4
1006; GFX6-NEXT:    s_mov_b32 s4, -1
1007; GFX6-NEXT:    s_mov_b32 s5, s4
1008; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
1009; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1010; GFX6-NEXT:    ; return to shader part epilog
1011;
1012; GFX9-LABEL: s_orn2_v4i16_multi_use:
1013; GFX9:       ; %bb.0:
1014; GFX9-NEXT:    s_mov_b32 s0, -1
1015; GFX9-NEXT:    s_mov_b32 s1, s0
1016; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
1017; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
1018; GFX9-NEXT:    s_mov_b32 s2, s4
1019; GFX9-NEXT:    s_mov_b32 s3, s5
1020; GFX9-NEXT:    ; return to shader part epilog
1021;
1022; GFX10PLUS-LABEL: s_orn2_v4i16_multi_use:
1023; GFX10PLUS:       ; %bb.0:
1024; GFX10PLUS-NEXT:    s_mov_b32 s0, -1
1025; GFX10PLUS-NEXT:    s_mov_b32 s1, s0
1026; GFX10PLUS-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
1027; GFX10PLUS-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
1028; GFX10PLUS-NEXT:    s_mov_b32 s2, s4
1029; GFX10PLUS-NEXT:    s_mov_b32 s3, s5
1030; GFX10PLUS-NEXT:    ; return to shader part epilog
1031  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
1032  %or = or <4 x i16> %src0, %not.src1
1033
1034  %cast.0 = bitcast <4 x i16> %or to i64
1035  %cast.1 = bitcast <4 x i16> %not.src1 to i64
; Both struct fields are overwritten below, so the starting aggregate is only a
; placeholder; poison is used instead of the deprecated undef.
1036  %insert.0 = insertvalue { i64, i64 } poison, i64 %cast.0, 0
1037  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %cast.1, 1
1038  ret { i64, i64 } %insert.1
1039}
1040
; Scalar <4 x i16> case where one inverted value (~%src2) feeds two ORs, one
; with %src0 and one with %src1; both OR results are returned as i64. The
; assertions expect a single s_xor_b64 shared by two s_or_b64 instructions on
; every run line.
1041define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) {
1042; GFX6-LABEL: s_orn2_v4i16_multi_foldable_use:
1043; GFX6:       ; %bb.0:
1044; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
1045; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
1046; GFX6-NEXT:    s_or_b32 s0, s0, s1
1047; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
1048; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
1049; GFX6-NEXT:    s_or_b32 s1, s1, s2
1050; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
1051; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
1052; GFX6-NEXT:    s_or_b32 s2, s2, s3
1053; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
1054; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
1055; GFX6-NEXT:    s_or_b32 s3, s3, s4
1056; GFX6-NEXT:    s_lshl_b32 s4, s11, 16
1057; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
1058; GFX6-NEXT:    s_or_b32 s4, s4, s5
1059; GFX6-NEXT:    s_lshl_b32 s5, s13, 16
1060; GFX6-NEXT:    s_and_b32 s6, s12, 0xffff
1061; GFX6-NEXT:    s_or_b32 s5, s5, s6
1062; GFX6-NEXT:    s_mov_b32 s6, -1
1063; GFX6-NEXT:    s_mov_b32 s7, s6
1064; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
1065; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
1066; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
1067; GFX6-NEXT:    ; return to shader part epilog
1068;
1069; GFX9-LABEL: s_orn2_v4i16_multi_foldable_use:
1070; GFX9:       ; %bb.0:
1071; GFX9-NEXT:    s_mov_b32 s0, -1
1072; GFX9-NEXT:    s_mov_b32 s1, s0
1073; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
1074; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[6:7]
1075; GFX9-NEXT:    s_or_b64 s[2:3], s[4:5], s[6:7]
1076; GFX9-NEXT:    ; return to shader part epilog
1077;
1078; GFX10PLUS-LABEL: s_orn2_v4i16_multi_foldable_use:
1079; GFX10PLUS:       ; %bb.0:
1080; GFX10PLUS-NEXT:    s_mov_b32 s0, -1
1081; GFX10PLUS-NEXT:    s_mov_b32 s1, s0
1082; GFX10PLUS-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
1083; GFX10PLUS-NEXT:    s_or_b64 s[0:1], s[2:3], s[6:7]
1084; GFX10PLUS-NEXT:    s_or_b64 s[2:3], s[4:5], s[6:7]
1085; GFX10PLUS-NEXT:    ; return to shader part epilog
1086  %not.src2 = xor <4 x i16> %src2, <i16 -1, i16 -1, i16 -1, i16 -1>
1087  %or0 = or <4 x i16> %src0, %not.src2
1088  %or1 = or <4 x i16> %src1, %not.src2
1089
1090  %cast.0 = bitcast <4 x i16> %or0 to i64
1091  %cast.1 = bitcast <4 x i16> %or1 to i64
; Both struct fields are overwritten below, so the starting aggregate is only a
; placeholder; poison is used instead of the deprecated undef.
1092  %insert.0 = insertvalue { i64, i64 } poison, i64 %cast.0, 0
1093  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %cast.1, 1
1094  ret { i64, i64 } %insert.1
1095}
1096
; Vector-register <4 x i16> case: returns %src0 | ~%src1 with the NOT written
; as an xor against an all-ones vector. The GFX9 and GFX10PLUS assertions
; expect two v_xor (with -1) and two v_or instructions, one pair per packed
; dword. The GFX6 assertions first pack the 16-bit inputs into dwords with
; shift/and/or and end with v_lshrrev instructions that split the results.
1097define <4 x i16> @v_orn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) {
1098; GFX6-LABEL: v_orn2_v4i16:
1099; GFX6:       ; %bb.0:
1100; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1101; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1102; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1103; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
1104; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
1105; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1106; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
1107; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
1108; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v4
1109; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
1110; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
1111; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v6
1112; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
1113; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
1114; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v3
1115; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
1116; GFX6-NEXT:    v_or_b32_e32 v2, v1, v3
1117; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1118; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
1119; GFX6-NEXT:    s_setpc_b64 s[30:31]
1120;
1121; GFX9-LABEL: v_orn2_v4i16:
1122; GFX9:       ; %bb.0:
1123; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1124; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
1125; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
1126; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
1127; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
1128; GFX9-NEXT:    s_setpc_b64 s[30:31]
1129;
1130; GFX10PLUS-LABEL: v_orn2_v4i16:
1131; GFX10PLUS:       ; %bb.0:
1132; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1133; GFX10PLUS-NEXT:    v_xor_b32_e32 v2, -1, v2
1134; GFX10PLUS-NEXT:    v_xor_b32_e32 v3, -1, v3
1135; GFX10PLUS-NEXT:    v_or_b32_e32 v0, v0, v2
1136; GFX10PLUS-NEXT:    v_or_b32_e32 v1, v1, v3
1137; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1138  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
1139  %or = or <4 x i16> %src0, %not.src1
1140  ret <4 x i16> %or
1141}
1142