xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll (revision 17f3e00911b860d535f41185e605c47babcc2039)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
6
; Baseline vector case: subtracts two <2 x i16> arguments, which the expected
; code reads from v0 and v1. For the gfx900, gfx1010 and gfx1100 runs the
; checks expect a single packed v_pk_sub_i16. For the fiji run they expect the
; low halves subtracted with v_sub_u16, the high halves subtracted with an
; SDWA v_sub_u16 that writes WORD_1, and the two results merged by v_or_b32.
7define <2 x i16> @v_sub_v2i16(<2 x i16> %a, <2 x i16> %b) {
8; GFX9-LABEL: v_sub_v2i16:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
12; GFX9-NEXT:    s_setpc_b64 s[30:31]
13;
14; GFX8-LABEL: v_sub_v2i16:
15; GFX8:       ; %bb.0:
16; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
18; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
19; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
20; GFX8-NEXT:    s_setpc_b64 s[30:31]
21;
22; GFX10-LABEL: v_sub_v2i16:
23; GFX10:       ; %bb.0:
24; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1
26; GFX10-NEXT:    s_setpc_b64 s[30:31]
27;
28; GFX11-LABEL: v_sub_v2i16:
29; GFX11:       ; %bb.0:
30; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1
32; GFX11-NEXT:    s_setpc_b64 s[30:31]
33  %sub = sub <2 x i16> %a, %b
34  ret <2 x i16> %sub
35}
36
; Vector subtraction whose left operand is a <2 x half> that is fneg'd and then
; bitcast to <2 x i16>. The fiji expectations flip both sign bits of v0 with
; v_xor_b32 0x80008000 and then use the split v_sub_u16 / SDWA / v_or_b32
; sequence. The gfx900, gfx1010 and gfx1100 expectations instead carry
; neg_lo:[1,0] neg_hi:[1,0] on v_pk_sub_i16.
; NOTE(review): those expectations encode the fneg as neg_lo/neg_hi modifiers
; on an integer packed instruction; confirm the modifiers are honoured for
; integer packed ops and really are equivalent to the sign-bit xor.
37define <2 x i16> @v_sub_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
38; GFX9-LABEL: v_sub_v2i16_fneg_lhs:
39; GFX9:       ; %bb.0:
40; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
42; GFX9-NEXT:    s_setpc_b64 s[30:31]
43;
44; GFX8-LABEL: v_sub_v2i16_fneg_lhs:
45; GFX8:       ; %bb.0:
46; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
48; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
49; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
50; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
51; GFX8-NEXT:    s_setpc_b64 s[30:31]
52;
53; GFX10-LABEL: v_sub_v2i16_fneg_lhs:
54; GFX10:       ; %bb.0:
55; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
57; GFX10-NEXT:    s_setpc_b64 s[30:31]
58;
59; GFX11-LABEL: v_sub_v2i16_fneg_lhs:
60; GFX11:       ; %bb.0:
61; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
63; GFX11-NEXT:    s_setpc_b64 s[30:31]
64  %neg.a = fneg <2 x half> %a
65  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
66  %sub = sub <2 x i16> %cast.neg.a, %b
67  ret <2 x i16> %sub
68}
69
; Vector subtraction whose right operand is a <2 x half> that is fneg'd and
; then bitcast to <2 x i16>. The fiji expectations flip both sign bits of v1
; with v_xor_b32 0x80008000 before the split v_sub_u16 / SDWA / v_or_b32
; sequence. The gfx900, gfx1010 and gfx1100 expectations instead carry
; neg_lo:[0,1] neg_hi:[0,1] on v_pk_sub_i16.
; NOTE(review): those expectations encode the fneg as neg_lo/neg_hi modifiers
; on an integer packed instruction; confirm the modifiers are honoured for
; integer packed ops and really are equivalent to the sign-bit xor.
70define <2 x i16> @v_sub_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
71; GFX9-LABEL: v_sub_v2i16_fneg_rhs:
72; GFX9:       ; %bb.0:
73; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
75; GFX9-NEXT:    s_setpc_b64 s[30:31]
76;
77; GFX8-LABEL: v_sub_v2i16_fneg_rhs:
78; GFX8:       ; %bb.0:
79; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
81; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
82; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
83; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
84; GFX8-NEXT:    s_setpc_b64 s[30:31]
85;
86; GFX10-LABEL: v_sub_v2i16_fneg_rhs:
87; GFX10:       ; %bb.0:
88; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
89; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
90; GFX10-NEXT:    s_setpc_b64 s[30:31]
91;
92; GFX11-LABEL: v_sub_v2i16_fneg_rhs:
93; GFX11:       ; %bb.0:
94; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
95; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
96; GFX11-NEXT:    s_setpc_b64 s[30:31]
97  %neg.b = fneg <2 x half> %b
98  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
99  %sub = sub <2 x i16> %a, %cast.neg.b
100  ret <2 x i16> %sub
101}
102
; Vector subtraction where both operands are <2 x half> values that are fneg'd
; and then bitcast to <2 x i16>. The fiji expectations xor both v0 and v1 with
; 0x80008000 before the split v_sub_u16 / SDWA / v_or_b32 sequence. The
; gfx900, gfx1010 and gfx1100 expectations instead carry neg_lo:[1,1]
; neg_hi:[1,1] on v_pk_sub_i16.
; NOTE(review): those expectations encode the fneg as neg_lo/neg_hi modifiers
; on an integer packed instruction; confirm the modifiers are honoured for
; integer packed ops and really are equivalent to the sign-bit xor.
103define <2 x i16> @v_sub_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
104; GFX9-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
105; GFX9:       ; %bb.0:
106; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
107; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
108; GFX9-NEXT:    s_setpc_b64 s[30:31]
109;
110; GFX8-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
111; GFX8:       ; %bb.0:
112; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
114; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
115; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
116; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
117; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
118; GFX8-NEXT:    s_setpc_b64 s[30:31]
119;
120; GFX10-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
121; GFX10:       ; %bb.0:
122; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
124; GFX10-NEXT:    s_setpc_b64 s[30:31]
125;
126; GFX11-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
127; GFX11:       ; %bb.0:
128; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
130; GFX11-NEXT:    s_setpc_b64 s[30:31]
131  %neg.a = fneg <2 x half> %a
132  %neg.b = fneg <2 x half> %b
133  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
134  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
135  %sub = sub <2 x i16> %cast.neg.a, %cast.neg.b
136  ret <2 x i16> %sub
137}
138
; Vector subtraction of the constant splat <-64, -64>. The gfx900 expectations
; materialise the packed constant 0xffc0ffc0 in v1 with v_mov_b32 and feed it
; to v_pk_sub_i16; gfx1010 and gfx1100 expect the same constant as a literal
; operand of v_pk_sub_i16. The fiji expectations turn the subtraction into
; additions of 64: v_add_u16 on the low half and an SDWA v_add_u16 on the high
; half (taking 64 from v2), merged by v_or_b32.
139define <2 x i16> @v_sub_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
140; GFX9-LABEL: v_sub_v2i16_neg_inline_imm_splat:
141; GFX9:       ; %bb.0:
142; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffc0ffc0
144; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
145; GFX9-NEXT:    s_setpc_b64 s[30:31]
146;
147; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_splat:
148; GFX8:       ; %bb.0:
149; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
150; GFX8-NEXT:    v_mov_b32_e32 v2, 64
151; GFX8-NEXT:    v_add_u16_e32 v1, 64, v0
152; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
153; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
154; GFX8-NEXT:    s_setpc_b64 s[30:31]
155;
156; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_splat:
157; GFX10:       ; %bb.0:
158; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
159; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0xffc0ffc0
160; GFX10-NEXT:    s_setpc_b64 s[30:31]
161;
162; GFX11-LABEL: v_sub_v2i16_neg_inline_imm_splat:
163; GFX11:       ; %bb.0:
164; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0xffc0ffc0
166; GFX11-NEXT:    s_setpc_b64 s[30:31]
167  %sub = sub <2 x i16> %a, <i16 -64, i16 -64>
168  ret <2 x i16> %sub
169}
170
; Vector subtraction of the constant <-64, 4>, i.e. -64 in element 0 and 4 in
; element 1. The gfx900 expectations materialise the packed constant 0x4ffc0 in
; v1 with v_mov_b32; gfx1010 and gfx1100 expect it as a literal operand of
; v_pk_sub_i16. The fiji expectations use additions of the negated constants:
; v_add_u16 with 64 on the low half and an SDWA v_add_u16 with -4 (held in v2)
; on the high half, merged by v_or_b32.
171define <2 x i16> @v_sub_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
172; GFX9-LABEL: v_sub_v2i16_neg_inline_imm_lo:
173; GFX9:       ; %bb.0:
174; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
175; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4ffc0
176; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
177; GFX9-NEXT:    s_setpc_b64 s[30:31]
178;
179; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_lo:
180; GFX8:       ; %bb.0:
181; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182; GFX8-NEXT:    v_mov_b32_e32 v2, -4
183; GFX8-NEXT:    v_add_u16_e32 v1, 64, v0
184; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
185; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
186; GFX8-NEXT:    s_setpc_b64 s[30:31]
187;
188; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_lo:
189; GFX10:       ; %bb.0:
190; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
191; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x4ffc0
192; GFX10-NEXT:    s_setpc_b64 s[30:31]
193;
194; GFX11-LABEL: v_sub_v2i16_neg_inline_imm_lo:
195; GFX11:       ; %bb.0:
196; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
197; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0x4ffc0
198; GFX11-NEXT:    s_setpc_b64 s[30:31]
199  %sub = sub <2 x i16> %a, <i16 -64, i16 4>
200  ret <2 x i16> %sub
201}
202
; Vector subtraction of the constant <4, -64>, i.e. 4 in element 0 and -64 in
; element 1. The gfx900 expectations materialise the packed constant
; 0xffc00004 in v1 with v_mov_b32; gfx1010 and gfx1100 expect it as a literal
; operand of v_pk_sub_i16. The fiji expectations use additions of the negated
; constants: v_add_u16 with -4 on the low half and an SDWA v_add_u16 with 64
; (held in v2) on the high half, merged by v_or_b32.
203define <2 x i16> @v_sub_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
204; GFX9-LABEL: v_sub_v2i16_neg_inline_imm_hi:
205; GFX9:       ; %bb.0:
206; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffc00004
208; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
209; GFX9-NEXT:    s_setpc_b64 s[30:31]
210;
211; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_hi:
212; GFX8:       ; %bb.0:
213; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
214; GFX8-NEXT:    v_mov_b32_e32 v2, 64
215; GFX8-NEXT:    v_add_u16_e32 v1, -4, v0
216; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
217; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
218; GFX8-NEXT:    s_setpc_b64 s[30:31]
219;
220; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_hi:
221; GFX10:       ; %bb.0:
222; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
223; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0xffc00004
224; GFX10-NEXT:    s_setpc_b64 s[30:31]
225;
226; GFX11-LABEL: v_sub_v2i16_neg_inline_imm_hi:
227; GFX11:       ; %bb.0:
228; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0xffc00004
230; GFX11-NEXT:    s_setpc_b64 s[30:31]
231  %sub = sub <2 x i16> %a, <i16 4, i16 -64>
232  ret <2 x i16> %sub
233}
234
; Scalar variant: an amdgpu_ps function taking an inreg <2 x i16> (read from s0
; in the expected code), subtracting the splat <-64, -64> and returning the
; result bitcast to i32. The gfx900, gfx1010 and gfx1100 expectations shift the
; high half into s1, apply s_sub_i32 with 0xffc0ffc0 to s0 and with 0xffc0 to
; s1, and repack with s_pack_ll_b32_b16. The fiji expectations mask the low
; half, use s_add_i32 with 0xffff0040 on each half, and recombine with
; s_lshl_b32 / s_and_b32 / s_or_b32.
235define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
236; GFX9-LABEL: s_sub_v2i16_neg_inline_imm_splat:
237; GFX9:       ; %bb.0:
238; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
239; GFX9-NEXT:    s_sub_i32 s0, s0, 0xffc0ffc0
240; GFX9-NEXT:    s_sub_i32 s1, s1, 0xffc0
241; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
242; GFX9-NEXT:    ; return to shader part epilog
243;
244; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_splat:
245; GFX8:       ; %bb.0:
246; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
247; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
248; GFX8-NEXT:    s_add_i32 s0, s0, 0xffff0040
249; GFX8-NEXT:    s_add_i32 s1, s1, 0xffff0040
250; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
251; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
252; GFX8-NEXT:    s_or_b32 s0, s1, s0
253; GFX8-NEXT:    ; return to shader part epilog
254;
255; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_splat:
256; GFX10:       ; %bb.0:
257; GFX10-NEXT:    s_lshr_b32 s1, s0, 16
258; GFX10-NEXT:    s_sub_i32 s0, s0, 0xffc0ffc0
259; GFX10-NEXT:    s_sub_i32 s1, s1, 0xffc0
260; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
261; GFX10-NEXT:    ; return to shader part epilog
262;
263; GFX11-LABEL: s_sub_v2i16_neg_inline_imm_splat:
264; GFX11:       ; %bb.0:
265; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
266; GFX11-NEXT:    s_sub_i32 s0, s0, 0xffc0ffc0
267; GFX11-NEXT:    s_sub_i32 s1, s1, 0xffc0
268; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
269; GFX11-NEXT:    ; return to shader part epilog
270  %sub = sub <2 x i16> %a, <i16 -64, i16 -64>
271  %cast = bitcast <2 x i16> %sub to i32
272  ret i32 %cast
273}
274
; Scalar variant subtracting the constant <-64, 4> from an inreg <2 x i16> and
; returning the result bitcast to i32. The gfx900, gfx1010 and gfx1100
; expectations shift the high half into s1, apply s_sub_i32 with 0x4ffc0 to s0
; and with 4 to s1, and repack with s_pack_ll_b32_b16. The fiji expectations
; mask the low half, use s_add_i32 with 0xffff0040 on the low half and with -4
; on the high half, and recombine with s_lshl_b32 / s_and_b32 / s_or_b32.
275define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
276; GFX9-LABEL: s_sub_v2i16_neg_inline_imm_lo:
277; GFX9:       ; %bb.0:
278; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
279; GFX9-NEXT:    s_sub_i32 s0, s0, 0x4ffc0
280; GFX9-NEXT:    s_sub_i32 s1, s1, 4
281; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
282; GFX9-NEXT:    ; return to shader part epilog
283;
284; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_lo:
285; GFX8:       ; %bb.0:
286; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
287; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
288; GFX8-NEXT:    s_add_i32 s0, s0, 0xffff0040
289; GFX8-NEXT:    s_add_i32 s1, s1, -4
290; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
291; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
292; GFX8-NEXT:    s_or_b32 s0, s1, s0
293; GFX8-NEXT:    ; return to shader part epilog
294;
295; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_lo:
296; GFX10:       ; %bb.0:
297; GFX10-NEXT:    s_lshr_b32 s1, s0, 16
298; GFX10-NEXT:    s_sub_i32 s0, s0, 0x4ffc0
299; GFX10-NEXT:    s_sub_i32 s1, s1, 4
300; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
301; GFX10-NEXT:    ; return to shader part epilog
302;
303; GFX11-LABEL: s_sub_v2i16_neg_inline_imm_lo:
304; GFX11:       ; %bb.0:
305; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
306; GFX11-NEXT:    s_sub_i32 s0, s0, 0x4ffc0
307; GFX11-NEXT:    s_sub_i32 s1, s1, 4
308; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
309; GFX11-NEXT:    ; return to shader part epilog
310  %sub = sub <2 x i16> %a, <i16 -64, i16 4>
311  %cast = bitcast <2 x i16> %sub to i32
312  ret i32 %cast
313}
314
; Scalar variant subtracting the constant <4, -64> from an inreg <2 x i16> and
; returning the result bitcast to i32. The gfx900, gfx1010 and gfx1100
; expectations shift the high half into s1, apply s_sub_i32 with 0xffc00004 to
; s0 and with 0xffc0 to s1, and repack with s_pack_ll_b32_b16. The fiji
; expectations mask the low half, use s_add_i32 with -4 on the low half and
; with 0xffff0040 on the high half, and recombine with s_lshl_b32 / s_and_b32 /
; s_or_b32.
315define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
316; GFX9-LABEL: s_sub_v2i16_neg_inline_imm_hi:
317; GFX9:       ; %bb.0:
318; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
319; GFX9-NEXT:    s_sub_i32 s0, s0, 0xffc00004
320; GFX9-NEXT:    s_sub_i32 s1, s1, 0xffc0
321; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
322; GFX9-NEXT:    ; return to shader part epilog
323;
324; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_hi:
325; GFX8:       ; %bb.0:
326; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
327; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
328; GFX8-NEXT:    s_add_i32 s0, s0, -4
329; GFX8-NEXT:    s_add_i32 s1, s1, 0xffff0040
330; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
331; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
332; GFX8-NEXT:    s_or_b32 s0, s1, s0
333; GFX8-NEXT:    ; return to shader part epilog
334;
335; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_hi:
336; GFX10:       ; %bb.0:
337; GFX10-NEXT:    s_lshr_b32 s1, s0, 16
338; GFX10-NEXT:    s_sub_i32 s0, s0, 0xffc00004
339; GFX10-NEXT:    s_sub_i32 s1, s1, 0xffc0
340; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
341; GFX10-NEXT:    ; return to shader part epilog
342;
343; GFX11-LABEL: s_sub_v2i16_neg_inline_imm_hi:
344; GFX11:       ; %bb.0:
345; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
346; GFX11-NEXT:    s_sub_i32 s0, s0, 0xffc00004
347; GFX11-NEXT:    s_sub_i32 s1, s1, 0xffc0
348; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
349; GFX11-NEXT:    ; return to shader part epilog
350  %sub = sub <2 x i16> %a, <i16 4, i16 -64>
351  %cast = bitcast <2 x i16> %sub to i32
352  ret i32 %cast
353}
354
; Baseline scalar case: an amdgpu_ps function subtracting two inreg <2 x i16>
; arguments (read from s0 and s1 in the expected code) and returning the result
; bitcast to i32. The gfx900, gfx1010 and gfx1100 expectations shift both high
; halves out with s_lshr_b32, subtract the low and high parts with two
; s_sub_i32, and repack with s_pack_ll_b32_b16. The fiji expectations also mask
; both low halves with 0xffff before subtracting and recombine with
; s_lshl_b32 / s_and_b32 / s_or_b32.
355define amdgpu_ps i32 @s_sub_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
356; GFX9-LABEL: s_sub_v2i16:
357; GFX9:       ; %bb.0:
358; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
359; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
360; GFX9-NEXT:    s_sub_i32 s0, s0, s1
361; GFX9-NEXT:    s_sub_i32 s1, s2, s3
362; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
363; GFX9-NEXT:    ; return to shader part epilog
364;
365; GFX8-LABEL: s_sub_v2i16:
366; GFX8:       ; %bb.0:
367; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
368; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
369; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
370; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
371; GFX8-NEXT:    s_sub_i32 s0, s0, s1
372; GFX8-NEXT:    s_sub_i32 s1, s2, s3
373; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
374; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
375; GFX8-NEXT:    s_or_b32 s0, s1, s0
376; GFX8-NEXT:    ; return to shader part epilog
377;
378; GFX10-LABEL: s_sub_v2i16:
379; GFX10:       ; %bb.0:
380; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
381; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
382; GFX10-NEXT:    s_sub_i32 s0, s0, s1
383; GFX10-NEXT:    s_sub_i32 s1, s2, s3
384; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
385; GFX10-NEXT:    ; return to shader part epilog
386;
387; GFX11-LABEL: s_sub_v2i16:
388; GFX11:       ; %bb.0:
389; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
390; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
391; GFX11-NEXT:    s_sub_i32 s0, s0, s1
392; GFX11-NEXT:    s_sub_i32 s1, s2, s3
393; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
394; GFX11-NEXT:    ; return to shader part epilog
395  %sub = sub <2 x i16> %a, %b
396  %cast = bitcast <2 x i16> %sub to i32
397  ret i32 %cast
398}
399
; Scalar subtraction whose left operand is an inreg <2 x half> that is fneg'd
; and bitcast to <2 x i16>; the result is returned bitcast to i32. All four
; runs expect the fneg as an explicit s_xor_b32 of s0 with 0x80008000, followed
; by the same split-subtract-and-repack sequence as the plain scalar case. The
; gfx1010 and gfx1100 expectations differ from gfx900 only in the order of the
; two s_lshr_b32 instructions.
400define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg %b) {
401; GFX9-LABEL: s_sub_v2i16_fneg_lhs:
402; GFX9:       ; %bb.0:
403; GFX9-NEXT:    s_xor_b32 s0, s0, 0x80008000
404; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
405; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
406; GFX9-NEXT:    s_sub_i32 s0, s0, s1
407; GFX9-NEXT:    s_sub_i32 s1, s2, s3
408; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
409; GFX9-NEXT:    ; return to shader part epilog
410;
411; GFX8-LABEL: s_sub_v2i16_fneg_lhs:
412; GFX8:       ; %bb.0:
413; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
414; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
415; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
416; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
417; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
418; GFX8-NEXT:    s_sub_i32 s0, s0, s1
419; GFX8-NEXT:    s_sub_i32 s1, s2, s3
420; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
421; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
422; GFX8-NEXT:    s_or_b32 s0, s1, s0
423; GFX8-NEXT:    ; return to shader part epilog
424;
425; GFX10-LABEL: s_sub_v2i16_fneg_lhs:
426; GFX10:       ; %bb.0:
427; GFX10-NEXT:    s_xor_b32 s0, s0, 0x80008000
428; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
429; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
430; GFX10-NEXT:    s_sub_i32 s0, s0, s1
431; GFX10-NEXT:    s_sub_i32 s1, s2, s3
432; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
433; GFX10-NEXT:    ; return to shader part epilog
434;
435; GFX11-LABEL: s_sub_v2i16_fneg_lhs:
436; GFX11:       ; %bb.0:
437; GFX11-NEXT:    s_xor_b32 s0, s0, 0x80008000
438; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
439; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
440; GFX11-NEXT:    s_sub_i32 s0, s0, s1
441; GFX11-NEXT:    s_sub_i32 s1, s2, s3
442; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
443; GFX11-NEXT:    ; return to shader part epilog
444  %neg.a = fneg <2 x half> %a
445  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
446  %sub = sub <2 x i16> %cast.neg.a, %b
447  %cast = bitcast <2 x i16> %sub to i32
448  ret i32 %cast
449}
450
; Scalar subtraction whose right operand is an inreg <2 x half> that is fneg'd
; and bitcast to <2 x i16>; the result is returned bitcast to i32. All four
; runs expect the fneg as an explicit s_xor_b32 of s1 with 0x80008000, followed
; by the same split-subtract-and-repack sequence as the plain scalar case
; (s_pack_ll_b32_b16 on gfx900/gfx1010/gfx1100, mask/shift/or on fiji).
451define amdgpu_ps i32 @s_sub_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg %b) {
452; GFX9-LABEL: s_sub_v2i16_fneg_rhs:
453; GFX9:       ; %bb.0:
454; GFX9-NEXT:    s_xor_b32 s1, s1, 0x80008000
455; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
456; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
457; GFX9-NEXT:    s_sub_i32 s0, s0, s1
458; GFX9-NEXT:    s_sub_i32 s1, s2, s3
459; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
460; GFX9-NEXT:    ; return to shader part epilog
461;
462; GFX8-LABEL: s_sub_v2i16_fneg_rhs:
463; GFX8:       ; %bb.0:
464; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
465; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
466; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
467; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
468; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
469; GFX8-NEXT:    s_sub_i32 s0, s0, s1
470; GFX8-NEXT:    s_sub_i32 s1, s2, s3
471; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
472; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
473; GFX8-NEXT:    s_or_b32 s0, s1, s0
474; GFX8-NEXT:    ; return to shader part epilog
475;
476; GFX10-LABEL: s_sub_v2i16_fneg_rhs:
477; GFX10:       ; %bb.0:
478; GFX10-NEXT:    s_xor_b32 s1, s1, 0x80008000
479; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
480; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
481; GFX10-NEXT:    s_sub_i32 s0, s0, s1
482; GFX10-NEXT:    s_sub_i32 s1, s2, s3
483; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
484; GFX10-NEXT:    ; return to shader part epilog
485;
486; GFX11-LABEL: s_sub_v2i16_fneg_rhs:
487; GFX11:       ; %bb.0:
488; GFX11-NEXT:    s_xor_b32 s1, s1, 0x80008000
489; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
490; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
491; GFX11-NEXT:    s_sub_i32 s0, s0, s1
492; GFX11-NEXT:    s_sub_i32 s1, s2, s3
493; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
494; GFX11-NEXT:    ; return to shader part epilog
495  %neg.b = fneg <2 x half> %b
496  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
497  %sub = sub <2 x i16> %a, %cast.neg.b
498  %cast = bitcast <2 x i16> %sub to i32
499  ret i32 %cast
500}
501
; Scalar subtraction where both operands are inreg <2 x half> values that are
; fneg'd and bitcast to <2 x i16>; the result is returned bitcast to i32. All
; four runs expect two explicit s_xor_b32 instructions with 0x80008000 (one on
; s0, one on s1), followed by the same split-subtract-and-repack sequence as
; the plain scalar case (s_pack_ll_b32_b16 on gfx900/gfx1010/gfx1100,
; mask/shift/or on fiji).
502define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) {
503; GFX9-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs:
504; GFX9:       ; %bb.0:
505; GFX9-NEXT:    s_xor_b32 s0, s0, 0x80008000
506; GFX9-NEXT:    s_xor_b32 s1, s1, 0x80008000
507; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
508; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
509; GFX9-NEXT:    s_sub_i32 s0, s0, s1
510; GFX9-NEXT:    s_sub_i32 s1, s2, s3
511; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
512; GFX9-NEXT:    ; return to shader part epilog
513;
514; GFX8-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs:
515; GFX8:       ; %bb.0:
516; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
517; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
518; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
519; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
520; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
521; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
522; GFX8-NEXT:    s_sub_i32 s0, s0, s1
523; GFX8-NEXT:    s_sub_i32 s1, s2, s3
524; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
525; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
526; GFX8-NEXT:    s_or_b32 s0, s1, s0
527; GFX8-NEXT:    ; return to shader part epilog
528;
529; GFX10-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs:
530; GFX10:       ; %bb.0:
531; GFX10-NEXT:    s_xor_b32 s0, s0, 0x80008000
532; GFX10-NEXT:    s_xor_b32 s1, s1, 0x80008000
533; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
534; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
535; GFX10-NEXT:    s_sub_i32 s0, s0, s1
536; GFX10-NEXT:    s_sub_i32 s1, s2, s3
537; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
538; GFX10-NEXT:    ; return to shader part epilog
539;
540; GFX11-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs:
541; GFX11:       ; %bb.0:
542; GFX11-NEXT:    s_xor_b32 s0, s0, 0x80008000
543; GFX11-NEXT:    s_xor_b32 s1, s1, 0x80008000
544; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
545; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
546; GFX11-NEXT:    s_sub_i32 s0, s0, s1
547; GFX11-NEXT:    s_sub_i32 s1, s2, s3
548; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
549; GFX11-NEXT:    ; return to shader part epilog
550  %neg.a = fneg <2 x half> %a
551  %neg.b = fneg <2 x half> %b
552  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
553  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
554  %sub = sub <2 x i16> %cast.neg.a, %cast.neg.b
555  %cast = bitcast <2 x i16> %sub to i32
556  ret i32 %cast
557}
558