; xref: /llvm-project/llvm/test/CodeGen/AArch64/hadd-combine.ll (revision f78febf7a87832fb2078961a6d8881b527c917bb)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
3
; Unsigned halving-add pattern tests: trunc((zext(a) + zext(b)) >> 1) should
; combine to a single UHADD instruction.

; Two variable operands: combines to uhadd.
define <8 x i16> @haddu_base(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: haddu_base:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
  %add = add <8 x i32> %zextsrc1, %zextsrc2
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Splat-constant RHS: still combines, constant materialized with movi.
define <8 x i16> @haddu_const(<8 x i16> %src1) {
; CHECK-LABEL: haddu_const:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %add = add <8 x i32> %zextsrc1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Splat-constant LHS (commuted add): same uhadd result.
define <8 x i16> @haddu_const_lhs(<8 x i16> %src1) {
; CHECK-LABEL: haddu_const_lhs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %add = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %zextsrc1
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Adding zero: the add folds away, leaving a plain (x >> 1) in the wide type,
; lowered as widen + narrowing shifts rather than a uhadd.
define <8 x i16> @haddu_const_zero(<8 x i16> %src1) {
; CHECK-LABEL: haddu_const_zero:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushll v1.4s, v0.4h, #0
; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
; CHECK-NEXT:    shrn v0.4h, v1.4s, #1
; CHECK-NEXT:    shrn2 v0.8h, v2.4s, #1
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %add = add <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, %zextsrc1
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Both operands constant: fully constant-folded, (3+1)>>1 == 2.
define <8 x i16> @haddu_const_both() {
; CHECK-LABEL: haddu_const_both:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.8h, #2
; CHECK-NEXT:    ret
  %add = add <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Near-max constants: (65534+65535)>>1 == 65534 == ~1, materialized with mvni.
; NOTE(review): this unsigned test uses ashr; harmless here since the zext
; keeps the sign bit of the i32 sum clear, so ashr == lshr — confirm intent.
define <8 x i16> @haddu_const_bothhigh() {
; CHECK-LABEL: haddu_const_bothhigh:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvni v0.8h, #1
; CHECK-NEXT:    ret
  %ext1 = zext <8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534> to <8 x i32>
  %ext2 = zext <8 x i16> <i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535> to <8 x i32>
  %add = add <8 x i32> %ext1, %ext2
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; One operand is zext(undef): per the CHECK lines no uhadd is formed; the
; remaining (x >> 1) is lowered with widen + narrowing shifts.
define <8 x i16> @haddu_undef(<8 x i16> %src1) {
; CHECK-LABEL: haddu_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushll v1.4s, v0.4h, #0
; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
; CHECK-NEXT:    shrn v0.4h, v1.4s, #1
; CHECK-NEXT:    shrn2 v0.8h, v2.4s, #1
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = zext <8 x i16> undef to <8 x i32>
  %add = add <8 x i32> %zextsrc2, %zextsrc1
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}
97
98
99
; Same coverage as the haddu_* tests above, but driving the
; llvm.aarch64.neon.uhadd intrinsic directly instead of the IR pattern.

; Plain intrinsic call: selects uhadd.
define <8 x i16> @haddu_i_base(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: haddu_i_base:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %src1, <8 x i16> %src2)
  ret <8 x i16> %result
}

; Splat-constant RHS: constant materialized with movi, uhadd kept.
define <8 x i16> @haddu_i_const(<8 x i16> %src1) {
; CHECK-LABEL: haddu_i_const:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %src1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
  ret <8 x i16> %result
}

; Splat-constant LHS: canonicalized so the constant ends up on the RHS.
define <8 x i16> @haddu_i_const_lhs(<8 x i16> %src1) {
; CHECK-LABEL: haddu_i_const_lhs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16> %src1)
  ret <8 x i16> %result
}

; uhadd(0, x) == x >> 1 (unsigned): simplified to a single ushr.
define <8 x i16> @haddu_i_const_zero(<8 x i16> %src1) {
; CHECK-LABEL: haddu_i_const_zero:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushr v0.8h, v0.8h, #1
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
  ret <8 x i16> %result
}

; Both operands constant: folded, (3+1)>>1 == 2.
define <8 x i16> @haddu_i_const_both() {
; CHECK-LABEL: haddu_i_const_both:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.8h, #2
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
  ret <8 x i16> %result
}

; Near-max constants: folded to 65534 == ~1 (mvni #1).
define <8 x i16> @haddu_i_const_bothhigh() {
; CHECK-LABEL: haddu_i_const_bothhigh:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvni v0.8h, #1
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>, <8 x i16> <i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535>)
  ret <8 x i16> %result
}

; uhadd(undef, x): per the CHECK lines this folds to just x (register move).
define <8 x i16> @haddu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
; CHECK-LABEL: haddu_i_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
  ret <8 x i16> %result
}
164
165
166
167
168
; Signed halving-add pattern tests: trunc((sext(a) + sext(b)) >> 1) should
; combine to a single SHADD.
; NOTE(review): the locals are named %zextsrc* although they are sext — the
; names are leftovers from the unsigned tests above; behavior is unaffected.

; Two variable operands: combines to shadd.
define <8 x i16> @hadds_base(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: hadds_base:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = sext <8 x i16> %src2 to <8 x i32>
  %add = add <8 x i32> %zextsrc1, %zextsrc2
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Splat-constant RHS: still combines, constant materialized with movi.
define <8 x i16> @hadds_const(<8 x i16> %src1) {
; CHECK-LABEL: hadds_const:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %add = add <8 x i32> %zextsrc1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Splat-constant LHS (commuted add): same shadd result.
define <8 x i16> @hadds_const_lhs(<8 x i16> %src1) {
; CHECK-LABEL: hadds_const_lhs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %add = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %zextsrc1
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Adding zero: the add folds away, leaving a plain signed (x >> 1) lowered as
; widen + narrowing shifts rather than a shadd.
define <8 x i16> @hadds_const_zero(<8 x i16> %src1) {
; CHECK-LABEL: hadds_const_zero:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sshll v1.4s, v0.4h, #0
; CHECK-NEXT:    sshll2 v2.4s, v0.8h, #0
; CHECK-NEXT:    shrn v0.4h, v1.4s, #1
; CHECK-NEXT:    shrn2 v0.8h, v2.4s, #1
; CHECK-NEXT:    ret
  %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %add = add <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, %zextsrc1
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Both operands constant: folded, (3+1)>>1 == 2.
define <8 x i16> @hadds_const_both() {
; CHECK-LABEL: hadds_const_both:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.8h, #2
; CHECK-NEXT:    ret
  %add = add <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Near-max signed constants: (32766+32767)>>1 == 32766, no 8-bit movi/mvni
; encoding exists for it, so it is built via mov w8 + dup.
define <8 x i16> @hadds_const_bothhigh() {
; CHECK-LABEL: hadds_const_bothhigh:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #32766 // =0x7ffe
; CHECK-NEXT:    dup v0.8h, w8
; CHECK-NEXT:    ret
  %ext1 = sext <8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766> to <8 x i32>
  %ext2 = sext <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767> to <8 x i32>
  %add = add <8 x i32> %ext1, %ext2
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; One operand is sext(undef): per the CHECK lines no shadd is formed; the
; remaining signed (x >> 1) is lowered with widen + narrowing shifts.
define <8 x i16> @hadds_undef(<8 x i16> %src1) {
; CHECK-LABEL: hadds_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sshll v1.4s, v0.4h, #0
; CHECK-NEXT:    sshll2 v2.4s, v0.8h, #0
; CHECK-NEXT:    shrn v0.4h, v1.4s, #1
; CHECK-NEXT:    shrn2 v0.8h, v2.4s, #1
; CHECK-NEXT:    ret
  %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = sext <8 x i16> undef to <8 x i32>
  %add = add <8 x i32> %zextsrc2, %zextsrc1
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}
263
264
265
; Same coverage as the hadds_* tests above, but driving the
; llvm.aarch64.neon.shadd intrinsic directly.

; Plain intrinsic call: selects shadd.
define <8 x i16> @hadds_i_base(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: hadds_i_base:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %src1, <8 x i16> %src2)
  ret <8 x i16> %result
}

; Splat-constant RHS: constant materialized with movi, shadd kept.
define <8 x i16> @hadds_i_const(<8 x i16> %src1) {
; CHECK-LABEL: hadds_i_const:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %src1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
  ret <8 x i16> %result
}

; Splat-constant LHS: canonicalized so the constant ends up on the RHS.
define <8 x i16> @hadds_i_const_lhs(<8 x i16> %src1) {
; CHECK-LABEL: hadds_i_const_lhs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16> %src1)
  ret <8 x i16> %result
}

; shadd(0, x) == x >> 1 (signed): simplified to a single sshr.
define <8 x i16> @hadds_i_const_zero(<8 x i16> %src1) {
; CHECK-LABEL: hadds_i_const_zero:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sshr v0.8h, v0.8h, #1
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
  ret <8 x i16> %result
}

; Both operands constant: folded, (3+1)>>1 == 2.
define <8 x i16> @hadds_i_const_both() {
; CHECK-LABEL: hadds_i_const_both:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.8h, #2
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
  ret <8 x i16> %result
}

; Near-max signed constants: folded to 32766, built via mov w8 + dup.
define <8 x i16> @hadds_i_const_bothhigh() {
; CHECK-LABEL: hadds_i_const_bothhigh:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #32766 // =0x7ffe
; CHECK-NEXT:    dup v0.8h, w8
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>, <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>)
  ret <8 x i16> %result
}

; shadd(undef, x): per the CHECK lines this folds to just x (register move).
define <8 x i16> @hadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
; CHECK-LABEL: hadds_i_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
  ret <8 x i16> %result
}
331
; Rounding halving-add recognized from the fixed-width trick:
; (a | b) - ((a ^ b) >> 1) == ceil((a + b) / 2), with no wide intermediate.

; lshr variant: matched as unsigned rounding halving add (urhadd).
define <8 x i16> @sub_fixedwidth_v4i32(<8 x i16> %a0, <8 x i16> %a1)  {
; CHECK-LABEL: sub_fixedwidth_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %or = or <8 x i16> %a0, %a1
  %xor = xor <8 x i16> %a0, %a1
  %srl = lshr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = sub <8 x i16> %or, %srl
  ret <8 x i16> %res
}

; ashr variant: matched as signed rounding halving add (srhadd).
define <8 x i16> @srhadd_fixedwidth_v8i16(<8 x i16> %a0, <8 x i16> %a1)  {
; CHECK-LABEL: srhadd_fixedwidth_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %or = or <8 x i16> %a0, %a1
  %xor = xor <8 x i16> %a0, %a1
  %srl = ashr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = sub <8 x i16> %or, %srl
  ret <8 x i16> %res
}
355
; Unsigned ROUNDING halving-add pattern tests:
; trunc((zext(a) + zext(b) + 1) >> 1) should combine to a single URHADD.

; Two variable operands: combines to urhadd.
define <8 x i16> @rhaddu_base(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: rhaddu_base:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
  %add1 = add <8 x i32> %zextsrc1, %zextsrc2
  %add = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Splat-constant RHS: still combines to urhadd with a materialized vector.
define <8 x i16> @rhaddu_const(<8 x i16> %src1) {
; CHECK-LABEL: rhaddu_const:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %add1 = add <8 x i32> %zextsrc1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %add = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Splat-constant LHS (commuted add): same urhadd result.
define <8 x i16> @rhaddu_const_lhs(<8 x i16> %src1) {
; CHECK-LABEL: rhaddu_const_lhs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %add1 = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %zextsrc1
  %add = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Zero + rounding bit: (x + 0 + 1) >> 1 degenerates to the non-rounding form
; and is selected as uhadd(x, 1).
define <8 x i16> @rhaddu_const_zero(<8 x i16> %src1) {
; CHECK-LABEL: rhaddu_const_zero:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %add1 = add <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, %zextsrc1
  %add = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Both operands constant: fully folded, (3+1+1)>>1 == 2.
define <8 x i16> @rhaddu_const_both() {
; CHECK-LABEL: rhaddu_const_both:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.8h, #2
; CHECK-NEXT:    ret
  %add1 = add <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %add = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Near-max constants: (65534+65535+1)>>1 == 65535 == all-ones.
; NOTE(review): this unsigned test uses ashr; harmless since the zext keeps
; the i32 sum's sign bit clear — confirm intent.
define <8 x i16> @rhaddu_const_bothhigh() {
; CHECK-LABEL: rhaddu_const_bothhigh:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
; CHECK-NEXT:    ret
  %ext1 = zext <8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534> to <8 x i32>
  %ext2 = zext <8 x i16> <i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535> to <8 x i32>
  %add1 = add <8 x i32> %ext1, %ext2
  %add = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; One operand is zext(undef): per the CHECK lines the undef operand is dropped
; and the remainder becomes uhadd(x, 1).
define <8 x i16> @rhaddu_undef(<8 x i16> %src1) {
; CHECK-LABEL: rhaddu_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = zext <8 x i16> undef to <8 x i32>
  %add1 = add <8 x i32> %zextsrc2, %zextsrc1
  %add = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}
452
453
454
; Same coverage as the rhaddu_* tests above, but driving the
; llvm.aarch64.neon.urhadd intrinsic directly.

; Plain intrinsic call: selects urhadd.
define <8 x i16> @rhaddu_i_base(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: rhaddu_i_base:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %src1, <8 x i16> %src2)
  ret <8 x i16> %result
}

; Splat-constant RHS: constant materialized with movi, urhadd kept.
define <8 x i16> @rhaddu_i_const(<8 x i16> %src1) {
; CHECK-LABEL: rhaddu_i_const:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %src1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
  ret <8 x i16> %result
}

; Splat-constant LHS: canonicalized so the constant ends up on the RHS.
define <8 x i16> @rhaddu_i_const_lhs(<8 x i16> %src1) {
; CHECK-LABEL: rhaddu_i_const_lhs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16> %src1)
  ret <8 x i16> %result
}

; urhadd(0, x) rounds up, so unlike uhadd it is NOT a plain shift; per the
; CHECK lines it stays a urhadd against a zero vector.
define <8 x i16> @rhaddu_i_const_zero(<8 x i16> %src1) {
; CHECK-LABEL: rhaddu_i_const_zero:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.2d, #0000000000000000
; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
  ret <8 x i16> %result
}

; Both operands constant: folded, (3+1+1)>>1 == 2.
define <8 x i16> @rhaddu_i_const_both() {
; CHECK-LABEL: rhaddu_i_const_both:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.8h, #2
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
  ret <8 x i16> %result
}

; Near-max constants: folded to all-ones (65535 per lane).
define <8 x i16> @rhaddu_i_const_bothhigh() {
; CHECK-LABEL: rhaddu_i_const_bothhigh:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>, <8 x i16> <i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535>)
  ret <8 x i16> %result
}

; urhadd(undef, x): per the CHECK lines this folds to just x (register move).
define <8 x i16> @rhaddu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
; CHECK-LABEL: rhaddu_i_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
  ret <8 x i16> %result
}
520
521
522
523
524
; Signed ROUNDING halving-add pattern tests:
; trunc((sext(a) + sext(b) + 1) >> 1) should combine to a single SRHADD.
; NOTE(review): the locals are named %zextsrc* although they are sext — the
; names are leftovers from the unsigned tests; behavior is unaffected.

; Two variable operands: combines to srhadd.
define <8 x i16> @rhadds_base(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: rhadds_base:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = sext <8 x i16> %src2 to <8 x i32>
  %add1 = add <8 x i32> %zextsrc1, %zextsrc2
  %add = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Splat-constant RHS: still combines to srhadd with a materialized vector.
define <8 x i16> @rhadds_const(<8 x i16> %src1) {
; CHECK-LABEL: rhadds_const:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %add1 = add <8 x i32> %zextsrc1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %add = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Splat-constant LHS (commuted add): same srhadd result.
define <8 x i16> @rhadds_const_lhs(<8 x i16> %src1) {
; CHECK-LABEL: rhadds_const_lhs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %add1 = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %zextsrc1
  %add = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Zero + rounding bit: (x + 0 + 1) >> 1 degenerates to the non-rounding form
; and is selected as shadd(x, 1).
define <8 x i16> @rhadds_const_zero(<8 x i16> %src1) {
; CHECK-LABEL: rhadds_const_zero:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %add1 = add <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, %zextsrc1
  %add = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Both operands constant: fully folded, (3+1+1)>>1 == 2.
define <8 x i16> @rhadds_const_both() {
; CHECK-LABEL: rhadds_const_both:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.8h, #2
; CHECK-NEXT:    ret
  %add1 = add <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %add = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; Near-max signed constants: (32766+32767+1)>>1 == 32767 == 0x7fff,
; materialized as mvni #128, lsl #8 (~0x8000).
define <8 x i16> @rhadds_const_bothhigh() {
; CHECK-LABEL: rhadds_const_bothhigh:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
; CHECK-NEXT:    ret
  %ext1 = sext <8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766> to <8 x i32>
  %ext2 = sext <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767> to <8 x i32>
  %add1 = add <8 x i32> %ext1, %ext2
  %add = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}

; One operand is sext(undef): per the CHECK lines the undef operand is dropped
; and the remainder becomes shadd(x, 1).
define <8 x i16> @rhadds_undef(<8 x i16> %src1) {
; CHECK-LABEL: rhadds_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = sext <8 x i16> undef to <8 x i32>
  %add1 = add <8 x i32> %zextsrc2, %zextsrc1
  %add = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}
621
622
623
; Same coverage as the rhadds_* tests above, but driving the
; llvm.aarch64.neon.srhadd intrinsic directly.

; Plain intrinsic call: selects srhadd.
define <8 x i16> @rhadds_i_base(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: rhadds_i_base:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %src1, <8 x i16> %src2)
  ret <8 x i16> %result
}

; Splat-constant RHS: constant materialized with movi, srhadd kept.
define <8 x i16> @rhadds_i_const(<8 x i16> %src1) {
; CHECK-LABEL: rhadds_i_const:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %src1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
  ret <8 x i16> %result
}

; Splat-constant LHS: canonicalized so the constant ends up on the RHS.
define <8 x i16> @rhadds_i_const_lhs(<8 x i16> %src1) {
; CHECK-LABEL: rhadds_i_const_lhs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.8h, #1
; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16> %src1)
  ret <8 x i16> %result
}

; srhadd(0, x) rounds up, so unlike shadd it is NOT a plain shift; per the
; CHECK lines it stays a srhadd against a zero vector.
define <8 x i16> @rhadds_i_const_zero(<8 x i16> %src1) {
; CHECK-LABEL: rhadds_i_const_zero:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.2d, #0000000000000000
; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
  ret <8 x i16> %result
}

; Both operands constant: folded, (3+1+1)>>1 == 2.
define <8 x i16> @rhadds_i_const_both() {
; CHECK-LABEL: rhadds_i_const_both:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.8h, #2
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
  ret <8 x i16> %result
}

; Near-max signed constants: folded to 32767 == 0x7fff (mvni #128, lsl #8).
define <8 x i16> @rhadds_i_const_bothhigh() {
; CHECK-LABEL: rhadds_i_const_bothhigh:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>, <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>)
  ret <8 x i16> %result
}

; srhadd(undef, x): per the CHECK lines this folds to just x (register move).
define <8 x i16> @rhadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
; CHECK-LABEL: rhadds_i_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ret
  %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
  ret <8 x i16> %result
}
689
690
691define <8 x i8> @shadd_v8i8(<8 x i8> %x) {
692; CHECK-LABEL: shadd_v8i8:
693; CHECK:       // %bb.0:
694; CHECK-NEXT:    ret
695  %r = tail call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %x, <8 x i8> %x)
696  ret <8 x i8> %r
697}
698
699define <4 x i16> @shadd_v4i16(<4 x i16> %x) {
700; CHECK-LABEL: shadd_v4i16:
701; CHECK:       // %bb.0:
702; CHECK-NEXT:    ret
703  %r = tail call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %x, <4 x i16> %x)
704  ret <4 x i16> %r
705}
706
707define <2 x i32> @shadd_v2i32(<2 x i32> %x) {
708; CHECK-LABEL: shadd_v2i32:
709; CHECK:       // %bb.0:
710; CHECK-NEXT:    ret
711  %r = tail call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %x, <2 x i32> %x)
712  ret <2 x i32> %r
713}
714
715define <16 x i8> @shadd_v16i8(<16 x i8> %x) {
716; CHECK-LABEL: shadd_v16i8:
717; CHECK:       // %bb.0:
718; CHECK-NEXT:    ret
719  %r = tail call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %x, <16 x i8> %x)
720  ret <16 x i8> %r
721}
722
723define <8 x i16> @shadd_v8i16(<8 x i16> %x) {
724; CHECK-LABEL: shadd_v8i16:
725; CHECK:       // %bb.0:
726; CHECK-NEXT:    ret
727  %r = tail call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %x, <8 x i16> %x)
728  ret <8 x i16> %r
729}
730
731define <4 x i32> @shadd_v4i32(<4 x i32> %x) {
732; CHECK-LABEL: shadd_v4i32:
733; CHECK:       // %bb.0:
734; CHECK-NEXT:    ret
735  %r = tail call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %x, <4 x i32> %x)
736  ret <4 x i32> %r
737}
738
739define <8 x i8> @uhadd_v8i8(<8 x i8> %x) {
740; CHECK-LABEL: uhadd_v8i8:
741; CHECK:       // %bb.0:
742; CHECK-NEXT:    ret
743  %r = tail call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %x, <8 x i8> %x)
744  ret <8 x i8> %r
745}
746
747define <4 x i16> @uhadd_v4i16(<4 x i16> %x) {
748; CHECK-LABEL: uhadd_v4i16:
749; CHECK:       // %bb.0:
750; CHECK-NEXT:    ret
751  %r = tail call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %x, <4 x i16> %x)
752  ret <4 x i16> %r
753}
754
755define <2 x i32> @uhadd_v2i32(<2 x i32> %x) {
756; CHECK-LABEL: uhadd_v2i32:
757; CHECK:       // %bb.0:
758; CHECK-NEXT:    ret
759  %r = tail call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %x, <2 x i32> %x)
760  ret <2 x i32> %r
761}
762
define <16 x i8> @uhadd_v16i8(<16 x i8> %x) {
; CHECK-LABEL: uhadd_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; uhadd(x, x) folds to x; 128-bit byte-vector variant.
  %r = tail call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %x, <16 x i8> %x)
  ret <16 x i8> %r
}
770
define <8 x i16> @uhadd_v8i16(<8 x i16> %x) {
; CHECK-LABEL: uhadd_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; uhadd(x, x) folds to x; 128-bit halfword-vector variant.
  %r = tail call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %x, <8 x i16> %x)
  ret <8 x i16> %r
}
778
define <4 x i32> @uhadd_v4i32(<4 x i32> %x) {
; CHECK-LABEL: uhadd_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; uhadd(x, x) folds to x; 128-bit word-vector variant.
  %r = tail call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %x, <4 x i32> %x)
  ret <4 x i32> %r
}
define <8 x i8> @srhadd_v8i8(<8 x i8> %x) {
; CHECK-LABEL: srhadd_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; srhadd(x, x) = (x + x + 1) >> 1 = x (the rounding bit is shifted out),
  ; so the rounding-halving-add also folds to its operand.
  %r = tail call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %x, <8 x i8> %x)
  ret <8 x i8> %r
}
793
define <4 x i16> @srhadd_v4i16(<4 x i16> %x) {
; CHECK-LABEL: srhadd_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; srhadd(x, x) folds to x; 64-bit halfword-vector variant.
  %r = tail call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %x, <4 x i16> %x)
  ret <4 x i16> %r
}
801
define <2 x i32> @srhadd_v2i32(<2 x i32> %x) {
; CHECK-LABEL: srhadd_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; srhadd(x, x) folds to x; 64-bit word-vector variant.
  %r = tail call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %x, <2 x i32> %x)
  ret <2 x i32> %r
}
809
define <16 x i8> @srhadd_v16i8(<16 x i8> %x) {
; CHECK-LABEL: srhadd_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; srhadd(x, x) folds to x; 128-bit byte-vector variant.
  %r = tail call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %x, <16 x i8> %x)
  ret <16 x i8> %r
}
817
define <8 x i16> @srhadd_v8i16(<8 x i16> %x) {
; CHECK-LABEL: srhadd_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; srhadd(x, x) folds to x; 128-bit halfword-vector variant.
  %r = tail call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %x, <8 x i16> %x)
  ret <8 x i16> %r
}
825
define <4 x i32> @srhadd_v4i32(<4 x i32> %x) {
; CHECK-LABEL: srhadd_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; srhadd(x, x) folds to x; 128-bit word-vector variant.
  %r = tail call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %x, <4 x i32> %x)
  ret <4 x i32> %r
}
833
define <8 x i8> @urhadd_v8i8(<8 x i8> %x) {
; CHECK-LABEL: urhadd_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; urhadd(x, x) = (x + x + 1) >> 1 = x for the unsigned rounding form too,
  ; so the call folds away entirely.
  %r = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %x, <8 x i8> %x)
  ret <8 x i8> %r
}
841
define <4 x i16> @urhadd_v4i16(<4 x i16> %x) {
; CHECK-LABEL: urhadd_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; urhadd(x, x) folds to x; 64-bit halfword-vector variant.
  %r = tail call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %x, <4 x i16> %x)
  ret <4 x i16> %r
}
849
define <2 x i32> @urhadd_v2i32(<2 x i32> %x) {
; CHECK-LABEL: urhadd_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; urhadd(x, x) folds to x; 64-bit word-vector variant.
  %r = tail call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %x, <2 x i32> %x)
  ret <2 x i32> %r
}
857
define <16 x i8> @urhadd_v16i8(<16 x i8> %x) {
; CHECK-LABEL: urhadd_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; urhadd(x, x) folds to x; 128-bit byte-vector variant.
  %r = tail call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %x, <16 x i8> %x)
  ret <16 x i8> %r
}
865
define <8 x i16> @urhadd_v8i16(<8 x i16> %x) {
; CHECK-LABEL: urhadd_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; urhadd(x, x) folds to x; 128-bit halfword-vector variant.
  %r = tail call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %x, <8 x i16> %x)
  ret <8 x i16> %r
}
873
define <4 x i32> @urhadd_v4i32(<4 x i32> %x) {
; CHECK-LABEL: urhadd_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  ; urhadd(x, x) folds to x; 128-bit word-vector variant.
  %r = tail call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %x, <4 x i32> %x)
  ret <4 x i32> %r
}
881
define <8 x i16> @uhadd_fixedwidth_v4i32(<8 x i16> %a0, <8 x i16> %a1)  {
; CHECK-LABEL: uhadd_fixedwidth_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  ; (a0 & a1) + ((a0 ^ a1) >>u 1) computes the overflow-free unsigned
  ; average floor((a0 + a1) / 2); check it is recognized as UHADD.
  ; NOTE(review): name says v4i32 but the vectors are v8i16 — likely a
  ; copy-paste in the original test name; renaming would desync CHECK-LABEL.
  %and = and <8 x i16> %a0, %a1
  %xor = xor <8 x i16> %a0, %a1
  %srl = lshr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = add <8 x i16> %and, %srl
  ret <8 x i16> %res
}
893
define <8 x i16> @shadd_fixedwidth_v8i16(<8 x i16> %a0, <8 x i16> %a1)  {
; CHECK-LABEL: shadd_fixedwidth_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  ; Signed twin of uhadd_fixedwidth_v4i32: (a0 & a1) + ((a0 ^ a1) >>s 1)
  ; is the overflow-free signed average; check it is recognized as SHADD.
  %and = and <8 x i16> %a0, %a1
  %xor = xor <8 x i16> %a0, %a1
  %srl = ashr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = add <8 x i16> %and, %srl
  ret <8 x i16> %res
}
905
define <8 x i16> @shadd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: shadd_demandedelts:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.8h, v0.8h[0]
; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    dup v0.8h, v0.h[0]
; CHECK-NEXT:    ret
  ; The lane-0 splats before and after mean only element 0 of the shadd is
  ; demanded; check demanded-elements simplification keeps the intrinsic
  ; (dup + shadd + dup are all still emitted).
  %s0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
  %op = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %s0, <8 x i16> %a1)
  %r0 = shufflevector <8 x i16> %op, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %r0
}
918
define <8 x i16> @srhadd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: srhadd_demandedelts:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.8h, v0.h[0]
; CHECK-NEXT:    srhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    dup v0.8h, v0.h[0]
; CHECK-NEXT:    ret
  ; Only lane 0 of the srhadd is demanded (splat before and after);
  ; check demanded-elements simplification keeps the intrinsic.
  %s0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
  %op = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %s0, <8 x i16> %a1)
  %r0 = shufflevector <8 x i16> %op, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %r0
}
931
define <8 x i16> @uhadd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: uhadd_demandedelts:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.8h, v0.h[0]
; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    dup v0.8h, v0.h[0]
; CHECK-NEXT:    ret
  ; Only lane 0 of the uhadd is demanded (splat before and after);
  ; check demanded-elements simplification keeps the intrinsic.
  %s0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
  %op = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %s0, <8 x i16> %a1)
  %r0 = shufflevector <8 x i16> %op, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %r0
}
944
define <8 x i16> @urhadd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: urhadd_demandedelts:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.8h, v0.h[0]
; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    dup v0.8h, v0.h[0]
; CHECK-NEXT:    ret
  ; Only lane 0 of the urhadd is demanded (splat before and after);
  ; check demanded-elements simplification keeps the intrinsic.
  %s0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
  %op = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %s0, <8 x i16> %a1)
  %r0 = shufflevector <8 x i16> %op, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %r0
}
957
; Check that the unnecessary sign_extend_inreg (shl + ashr pair) after shadd is removed.
define <2 x i32> @shadd_signbits_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) {
; CHECK-LABEL: shadd_signbits_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sshr v0.2s, v0.2s, #17
; CHECK-NEXT:    sshr v1.2s, v1.2s, #17
; CHECK-NEXT:    shadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  ; Both inputs are ashr'd by 17, so the shadd result (built here as the
  ; and/xor/ashr average pattern) already has at least 18 sign bits; the
  ; shl+ashr-by-17 sign_extend_inreg pair below must fold away (no shl in CHECK).
  %x0 = ashr <2 x i32> %a0, <i32 17, i32 17>
  %x1 = ashr <2 x i32> %a1, <i32 17, i32 17>
  %m = and <2 x i32> %x0, %x1
  %s = xor <2 x i32> %x0, %x1
  %x = ashr <2 x i32> %s, <i32 1, i32 1>
  %avg = add <2 x i32> %m, %x
  %avg1 = shl <2 x i32> %avg, <i32 17, i32 17>
  %avg2 = ashr <2 x i32> %avg1, <i32 17, i32 17>
  store <2 x i32> %avg, ptr %p2 ; extra use
  ret <2 x i32> %avg2
}
978
; Check that the unnecessary sign_extend_inreg (shl + ashr pair) after srhadd is removed.
define <2 x i32> @srhadd_signbits_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) {
; CHECK-LABEL: srhadd_signbits_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sshr v0.2s, v0.2s, #17
; CHECK-NEXT:    sshr v1.2s, v1.2s, #17
; CHECK-NEXT:    srhadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  ; Rounding variant: (x0 | x1) - ((x0 ^ x1) >>s 1) is the rounded signed
  ; average. Inputs have >= 18 sign bits, so the shl+ashr-by-17
  ; sign_extend_inreg pair folds away (no shl in CHECK).
  %x0 = ashr <2 x i32> %a0, <i32 17, i32 17>
  %x1 = ashr <2 x i32> %a1, <i32 17, i32 17>
  %m = or <2 x i32> %x0, %x1
  %s = xor <2 x i32> %x0, %x1
  %x = ashr <2 x i32> %s, <i32 1, i32 1>
  %avg = sub <2 x i32> %m, %x
  %avg1 = shl <2 x i32> %avg, <i32 17, i32 17>
  %avg2 = ashr <2 x i32> %avg1, <i32 17, i32 17>
  store <2 x i32> %avg, ptr %p2 ; extra use
  ret <2 x i32> %avg2
}
999
; Negative test - not enough sign bits to remove the sign_extend_inreg after srhadd.
define <2 x i32> @srhadd_signbits_v2i32_negative(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) {
; CHECK-LABEL: srhadd_signbits_v2i32_negative:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sshr v0.2s, v0.2s, #17
; CHECK-NEXT:    sshr v1.2s, v1.2s, #17
; CHECK-NEXT:    srhadd v1.2s, v0.2s, v1.2s
; CHECK-NEXT:    shl v0.2s, v1.2s, #22
; CHECK-NEXT:    str d1, [x0]
; CHECK-NEXT:    sshr v0.2s, v0.2s, #22
; CHECK-NEXT:    ret
  ; Same pattern as srhadd_signbits_v2i32, but sign-extending over 22 bits
  ; while the inputs only guarantee ~18 sign bits: the shl+sshr-by-22 pair
  ; must NOT be removed (both remain in CHECK).
  %x0 = ashr <2 x i32> %a0, <i32 17, i32 17>
  %x1 = ashr <2 x i32> %a1, <i32 17, i32 17>
  %m = or <2 x i32> %x0, %x1
  %s = xor <2 x i32> %x0, %x1
  %x = ashr <2 x i32> %s, <i32 1, i32 1>
  %avg = sub <2 x i32> %m, %x
  %avg1 = shl <2 x i32> %avg, <i32 22, i32 22>
  %avg2 = ashr <2 x i32> %avg1, <i32 22, i32 22>
  store <2 x i32> %avg, ptr %p2 ; extra use
  ret <2 x i32> %avg2
}
1022
1023declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
1024declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>)
1025declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>)
1026declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>)
1027declare <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>)
1028declare <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>)
1029declare <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8>, <16 x i8>)
1030declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>)
1031declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>)
1032declare <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>)
1033declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>)
1034declare <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>)
1035
1036declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>)
1037declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>)
1038declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>)
1039declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>)
1040declare <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>)
1041declare <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>)
1042declare <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>)
1043declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>)
1044declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>)
1045declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>)
1046declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>)
1047declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>)
1048