; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s

; Check that the llvm.aarch64.neon.shadd intrinsic selects a single shadd
; instruction for each legal vector type (64-bit and 128-bit forms).
define <8 x i8> @shadd8b(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: shadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    shadd.8b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A, align 8
  %tmp2 = load <8 x i8>, ptr %B, align 8
  %tmp3 = tail call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @shadd16b(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: shadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    shadd.16b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, ptr %A, align 16
  %tmp2 = load <16 x i8>, ptr %B, align 16
  %tmp3 = tail call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @shadd4h(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: shadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    shadd.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A, align 8
  %tmp2 = load <4 x i16>, ptr %B, align 8
  %tmp3 = tail call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @shadd8h(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: shadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    shadd.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A, align 16
  %tmp2 = load <8 x i16>, ptr %B, align 16
  %tmp3 = tail call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @shadd2s(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: shadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    shadd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A, align 8
  %tmp2 = load <2 x i32>, ptr %B, align 8
  %tmp3 = tail call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @shadd4s(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: shadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    shadd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A, align 16
  %tmp2 = load <4 x i32>, ptr %B, align 16
  %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

; Check that the llvm.aarch64.neon.uhadd intrinsic selects a single uhadd
; instruction for each legal vector type (64-bit and 128-bit forms).
define <8 x i8> @uhadd8b(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: uhadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uhadd.8b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A, align 8
  %tmp2 = load <8 x i8>, ptr %B, align 8
  %tmp3 = tail call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @uhadd16b(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: uhadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uhadd.16b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, ptr %A, align 16
  %tmp2 = load <16 x i8>, ptr %B, align 16
  %tmp3 = tail call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @uhadd4h(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: uhadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A, align 8
  %tmp2 = load <4 x i16>, ptr %B, align 8
  %tmp3 = tail call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @uhadd8h(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: uhadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uhadd.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A, align 16
  %tmp2 = load <8 x i16>, ptr %B, align 16
  %tmp3 = tail call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @uhadd2s(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: uhadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A, align 8
  %tmp2 = load <2 x i32>, ptr %B, align 8
  %tmp3 = tail call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @uhadd4s(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: uhadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    uhadd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A, align 16
  %tmp2 = load <4 x i32>, ptr %B, align 16
  %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

; Declarations for the signed/unsigned halving-add intrinsics used above.
declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>)
declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>)
declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>)
declare <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>)
declare <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>)
declare <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8>, <16 x i8>)
declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>)
declare <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>)
declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>)

; Check that the llvm.aarch64.neon.srhadd intrinsic selects a single srhadd
; instruction for each legal vector type (64-bit and 128-bit forms).
define <8 x i8> @srhadd8b(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: srhadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    srhadd.8b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A, align 8
  %tmp2 = load <8 x i8>, ptr %B, align 8
  %tmp3 = tail call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @srhadd16b(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: srhadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    srhadd.16b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, ptr %A, align 16
  %tmp2 = load <16 x i8>, ptr %B, align 16
  %tmp3 = tail call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @srhadd4h(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: srhadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    srhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A, align 8
  %tmp2 = load <4 x i16>, ptr %B, align 8
  %tmp3 = tail call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @srhadd8h(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: srhadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    srhadd.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A, align 16
  %tmp2 = load <8 x i16>, ptr %B, align 16
  %tmp3 = tail call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @srhadd2s(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: srhadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    srhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A, align 8
  %tmp2 = load <2 x i32>, ptr %B, align 8
  %tmp3 = tail call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @srhadd4s(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: srhadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    srhadd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A, align 16
  %tmp2 = load <4 x i32>, ptr %B, align 16
  %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

; Check that the llvm.aarch64.neon.urhadd intrinsic selects a single urhadd
; instruction for each legal vector type (64-bit and 128-bit forms).
define <8 x i8> @urhadd8b(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: urhadd8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    urhadd.8b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A, align 8
  %tmp2 = load <8 x i8>, ptr %B, align 8
  %tmp3 = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @urhadd16b(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: urhadd16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    urhadd.16b v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, ptr %A, align 16
  %tmp2 = load <16 x i8>, ptr %B, align 16
  %tmp3 = tail call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @urhadd4h(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: urhadd4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    urhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A, align 8
  %tmp2 = load <4 x i16>, ptr %B, align 8
  %tmp3 = tail call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @urhadd8h(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: urhadd8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    urhadd.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A, align 16
  %tmp2 = load <8 x i16>, ptr %B, align 16
  %tmp3 = tail call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @urhadd2s(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: urhadd2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    urhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A, align 8
  %tmp2 = load <2 x i32>, ptr %B, align 8
  %tmp3 = tail call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @urhadd4s(ptr nocapture readonly %A, ptr nocapture readonly %B) {
; CHECK-LABEL: urhadd4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    urhadd.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A, align 16
  %tmp2 = load <4 x i32>, ptr %B, align 16
  %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

; Check that sext + add-of-1 + add + lshr-by-1 + trunc is recognized and
; lowered to a single srhadd (signed rounding halving add) instruction.
define void @testLowerToSRHADD8b(<8 x i8> %src1, <8 x i8> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSRHADD8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.8b v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
  %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
  %add1 = add nsw <8 x i16> %sextsrc1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %add2 = add nsw <8 x i16> %add1, %sextsrc2
  %resulti16 = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  store <8 x i8> %result, ptr %dest, align 8
  ret void
}

define void @testLowerToSRHADD4h(<4 x i16> %src1, <4 x i16> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSRHADD4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.4h v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
  %sextsrc2 = sext <4 x i16> %src2 to <4 x i32>
  %add1 = add nsw <4 x i32> %sextsrc1, <i32 1, i32 1, i32 1, i32 1>
  %add2 = add nsw <4 x i32> %add1, %sextsrc2
  %resulti16 = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %resulti16 to <4 x i16>
  store <4 x i16> %result, ptr %dest, align 8
  ret void
}

define void @testLowerToSRHADD2s(<2 x i32> %src1, <2 x i32> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSRHADD2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.2s v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
  %sextsrc2 = sext <2 x i32> %src2 to <2 x i64>
  %add1 = add nsw <2 x i64> %sextsrc1, <i64 1, i64 1>
  %add2 = add nsw <2 x i64> %add1, %sextsrc2
  %resulti16 = lshr <2 x i64> %add2, <i64 1, i64 1>
  %result = trunc <2 x i64> %resulti16 to <2 x i32>
  store <2 x i32> %result, ptr %dest, align 8
  ret void
}

define void @testLowerToSRHADD16b(<16 x i8> %src1, <16 x i8> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSRHADD16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.16b v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
  %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
  %add1 = add nsw <16 x i16> %sextsrc1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %add2 = add nsw <16 x i16> %add1, %sextsrc2
  %resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  store <16 x i8> %result, ptr %dest, align 16
  ret void
}

define void @testLowerToSRHADD8h(<8 x i16> %src1, <8 x i16> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSRHADD8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.8h v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
  %add1 = add nsw <8 x i32> %sextsrc1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %add2 = add nsw <8 x i32> %add1, %sextsrc2
  %resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  store <8 x i16> %result, ptr %dest, align 16
  ret void
}

define void @testLowerToSRHADD4s(<4 x i32> %src1, <4 x i32> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSRHADD4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    srhadd.4s v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
  %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
  %add1 = add nsw <4 x i64> %sextsrc1, <i64 1, i64 1, i64 1, i64 1>
  %add2 = add nsw <4 x i64> %add1, %sextsrc2
  %resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  store <4 x i32> %result, ptr %dest, align 16
  ret void
}

; Check that sext + add + lshr-by-1 + trunc is recognized and lowered to a
; single shadd (signed halving add) instruction.
define void @testLowerToSHADD8b(<8 x i8> %src1, <8 x i8> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSHADD8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd.8b v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
  %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
  %add = add nsw <8 x i16> %sextsrc1, %sextsrc2
  %resulti16 = lshr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  store <8 x i8> %result, ptr %dest, align 8
  ret void
}

define void @testLowerToSHADD4h(<4 x i16> %src1, <4 x i16> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSHADD4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd.4h v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
  %sextsrc2 = sext <4 x i16> %src2 to <4 x i32>
  %add = add nsw <4 x i32> %sextsrc1, %sextsrc2
  %resulti16 = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %resulti16 to <4 x i16>
  store <4 x i16> %result, ptr %dest, align 8
  ret void
}

define void @testLowerToSHADD2s(<2 x i32> %src1, <2 x i32> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSHADD2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd.2s v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
  %sextsrc2 = sext <2 x i32> %src2 to <2 x i64>
  %add = add nsw <2 x i64> %sextsrc1, %sextsrc2
  %resulti16 = lshr <2 x i64> %add, <i64 1, i64 1>
  %result = trunc <2 x i64> %resulti16 to <2 x i32>
  store <2 x i32> %result, ptr %dest, align 8
  ret void
}

define void @testLowerToSHADD16b(<16 x i8> %src1, <16 x i8> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSHADD16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd.16b v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
  %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
  %add = add nsw <16 x i16> %sextsrc1, %sextsrc2
  %resulti16 = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  store <16 x i8> %result, ptr %dest, align 16
  ret void
}

define void @testLowerToSHADD8h(<8 x i16> %src1, <8 x i16> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSHADD8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd.8h v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
  %add = add nsw <8 x i32> %sextsrc1, %sextsrc2
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  store <8 x i16> %result, ptr %dest, align 16
  ret void
}

define void @testLowerToSHADD4s(<4 x i32> %src1, <4 x i32> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSHADD4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd.4s v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
  %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
  %add = add nsw <4 x i64> %sextsrc1, %sextsrc2
  %resulti16 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  store <4 x i32> %result, ptr %dest, align 16
  ret void
}

; Check that zext + add-of-1 + add + lshr-by-1 + trunc is recognized and
; lowered to a single urhadd (unsigned rounding halving add) instruction.
define void @testLowerToURHADD8b(<8 x i8> %src1, <8 x i8> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToURHADD8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.8b v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
  %zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
  %add1 = add nuw nsw <8 x i16> %zextsrc1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %add2 = add nuw nsw <8 x i16> %add1, %zextsrc2
  %resulti16 = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  store <8 x i8> %result, ptr %dest, align 8
  ret void
}

define void @testLowerToURHADD4h(<4 x i16> %src1, <4 x i16> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToURHADD4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.4h v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
  %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
  %add1 = add nuw nsw <4 x i32> %zextsrc1, <i32 1, i32 1, i32 1, i32 1>
  %add2 = add nuw nsw <4 x i32> %add1, %zextsrc2
  %resulti16 = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %resulti16 to <4 x i16>
  store <4 x i16> %result, ptr %dest, align 8
  ret void
}

define void @testLowerToURHADD2s(<2 x i32> %src1, <2 x i32> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToURHADD2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.2s v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
  %zextsrc2 = zext <2 x i32> %src2 to <2 x i64>
  %add1 = add nuw nsw <2 x i64> %zextsrc1, <i64 1, i64 1>
  %add2 = add nuw nsw <2 x i64> %add1, %zextsrc2
  %resulti16 = lshr <2 x i64> %add2, <i64 1, i64 1>
  %result = trunc <2 x i64> %resulti16 to <2 x i32>
  store <2 x i32> %result, ptr %dest, align 8
  ret void
}

define void @testLowerToURHADD16b(<16 x i8> %src1, <16 x i8> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToURHADD16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.16b v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
  %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
  %add1 = add nuw nsw <16 x i16> %zextsrc1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %add2 = add nuw nsw <16 x i16> %add1, %zextsrc2
  %resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  store <16 x i8> %result, ptr %dest, align 16
  ret void
}

define void @testLowerToURHADD8h(<8 x i16> %src1, <8 x i16> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToURHADD8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.8h v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
  %add1 = add nuw nsw <8 x i32> %zextsrc1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %add2 = add nuw nsw <8 x i32> %add1, %zextsrc2
  %resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  store <8 x i16> %result, ptr %dest, align 16
  ret void
}

define void @testLowerToURHADD4s(<4 x i32> %src1, <4 x i32> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToURHADD4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    urhadd.4s v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
  %add1 = add nuw nsw <4 x i64> %zextsrc1, <i64 1, i64 1, i64 1, i64 1>
  %add2 = add nuw nsw <4 x i64> %add1, %zextsrc2
  %resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  store <4 x i32> %result, ptr %dest, align 16
  ret void
}

; Check that zext + add + lshr-by-1 + trunc is recognized and lowered to a
; single uhadd (unsigned halving add) instruction.
define void @testLowerToUHADD8b(<8 x i8> %src1, <8 x i8> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToUHADD8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.8b v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
  %zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
  %add = add nuw nsw <8 x i16> %zextsrc1, %zextsrc2
  %resulti16 = lshr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  store <8 x i8> %result, ptr %dest, align 8
  ret void
}

define void @testLowerToUHADD4h(<4 x i16> %src1, <4 x i16> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToUHADD4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.4h v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
  %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
  %add = add nuw nsw <4 x i32> %zextsrc1, %zextsrc2
  %resulti16 = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %resulti16 to <4 x i16>
  store <4 x i16> %result, ptr %dest, align 8
  ret void
}

define void @testLowerToUHADD2s(<2 x i32> %src1, <2 x i32> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToUHADD2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.2s v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
  %zextsrc2 = zext <2 x i32> %src2 to <2 x i64>
  %add = add nuw nsw <2 x i64> %zextsrc1, %zextsrc2
  %resulti16 = lshr <2 x i64> %add, <i64 1, i64 1>
  %result = trunc <2 x i64> %resulti16 to <2 x i32>
  store <2 x i32> %result, ptr %dest, align 8
  ret void
}

define void @testLowerToUHADD16b(<16 x i8> %src1, <16 x i8> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToUHADD16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.16b v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
  %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
  %add = add nuw nsw <16 x i16> %zextsrc1, %zextsrc2
  %resulti16 = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  store <16 x i8> %result, ptr %dest, align 16
  ret void
}

define void @testLowerToUHADD8h(<8 x i16> %src1, <8 x i16> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToUHADD8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.8h v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
  %add = add nuw nsw <8 x i32> %zextsrc1, %zextsrc2
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  store <8 x i16> %result, ptr %dest, align 16
  ret void
}

define void @testLowerToUHADD4s(<4 x i32> %src1, <4 x i32> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToUHADD4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.4s v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
  %add = add nuw nsw <4 x i64> %zextsrc1, %zextsrc2
  %resulti16 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  store <4 x i32> %result, ptr %dest, align 16
  ret void
}

; i16 -> i32 widening cases where the halved result is kept at the wide type.
; sext+asr and zext+lshr fold to shadd/uhadd followed by an extend; sext+lshr
; must not become shadd (lshr of a signed sum is not a signed halving add).
define <4 x i32> @hadd16_sext_asr(<4 x i16> %src1, <4 x i16> %src2) {
; CHECK-LABEL: hadd16_sext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shadd.4h v0, v0, v1
; CHECK-NEXT:    sshll.4s v0, v0, #0
; CHECK-NEXT:    ret
  %zextsrc1 = sext <4 x i16> %src1 to <4 x i32>
  %zextsrc2 = sext <4 x i16> %src2 to <4 x i32>
  %add = add nsw <4 x i32> %zextsrc1, %zextsrc2
  %resulti16 = ashr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %resulti16
}

define <4 x i32> @hadd16_zext_asr(<4 x i16> %src1, <4 x i16> %src2) {
; CHECK-LABEL: hadd16_zext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.4h v0, v0, v1
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
  %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
  %add = add nuw nsw <4 x i32> %zextsrc1, %zextsrc2
  %resulti16 = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %resulti16
}

define <4 x i32> @hadd16_sext_lsr(<4 x i16> %src1, <4 x i16> %src2) {
; CHECK-LABEL: hadd16_sext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    saddl.4s v0, v0, v1
; CHECK-NEXT:    ushr.4s v0, v0, #1
; CHECK-NEXT:    ret
  %zextsrc1 = sext <4 x i16> %src1 to <4 x i32>
  %zextsrc2 = sext <4 x i16> %src2 to <4 x i32>
  %add = add nsw <4 x i32> %zextsrc1, %zextsrc2
  %resulti16 = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %resulti16
}

define <4 x i32> @hadd16_zext_lsr(<4 x i16> %src1, <4 x i16> %src2) {
; CHECK-LABEL: hadd16_zext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.4h v0, v0, v1
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    ret
  %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
  %zextsrc2 = zext <4 x i16> %src2 to <4 x i32>
  %add = add nuw nsw <4 x i32> %zextsrc1, %zextsrc2
  %resulti16 = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %resulti16
}

753define <4 x i64> @hadd32_sext_asr(<4 x i32> %src1, <4 x i32> %src2) {
754; CHECK-LABEL: hadd32_sext_asr:
755; CHECK:       // %bb.0:
756; CHECK-NEXT:    shadd.4s v0, v0, v1
757; CHECK-NEXT:    sshll2.2d v1, v0, #0
758; CHECK-NEXT:    sshll.2d v0, v0, #0
759; CHECK-NEXT:    ret
760  %zextsrc1 = sext <4 x i32> %src1 to <4 x i64>
761  %zextsrc2 = sext <4 x i32> %src2 to <4 x i64>
762  %add = add nsw <4 x i64> %zextsrc1, %zextsrc2
763  %resulti32 = ashr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
764  ret <4 x i64> %resulti32
765}
766
define <4 x i64> @hadd32_zext_asr(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: hadd32_zext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.4s v0, v0, v1
; CHECK-NEXT:    ushll2.2d v1, v0, #0
; CHECK-NEXT:    ushll.2d v0, v0, #0
; CHECK-NEXT:    ret
; NOTE(review): despite the _asr name the IR below uses lshr — equivalent
; here because the nuw sum of two zexts is non-negative; it still folds to
; uhadd.4s, making this effectively the same test as hadd32_zext_lsr.
  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
  %add = add nuw nsw <4 x i64> %zextsrc1, %zextsrc2
  %resulti32 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %resulti32
}
780
define <4 x i64> @hadd32_sext_lsr(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: hadd32_sext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    saddl.2d v2, v0, v1
; CHECK-NEXT:    saddl2.2d v0, v0, v1
; CHECK-NEXT:    ushr.2d v1, v0, #1
; CHECK-NEXT:    ushr.2d v0, v2, #1
; CHECK-NEXT:    ret
; Negative test: lshr of a sign-extended sum cannot become shadd, so the
; widening adds and logical shifts are kept as-is.
  %zextsrc1 = sext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = sext <4 x i32> %src2 to <4 x i64>
  %add = add nsw <4 x i64> %zextsrc1, %zextsrc2
  %resulti32 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %resulti32
}
795
define <4 x i64> @hadd32_zext_lsr(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: hadd32_zext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uhadd.4s v0, v0, v1
; CHECK-NEXT:    ushll2.2d v1, v0, #0
; CHECK-NEXT:    ushll.2d v0, v0, #0
; CHECK-NEXT:    ret
; zext + add + lshr folds to uhadd.4s; only the final widening remains.
  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
  %add = add nuw nsw <4 x i64> %zextsrc1, %zextsrc2
  %resulti32 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %resulti32
}
809
define <4 x i16> @hadd8_sext_asr(<4 x i8> %src1, <4 x i8> %src2) {
; CHECK-LABEL: hadd8_sext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shl.4h v1, v1, #8
; CHECK-NEXT:    shl.4h v0, v0, #8
; CHECK-NEXT:    sshr.4h v1, v1, #8
; CHECK-NEXT:    sshr.4h v0, v0, #8
; CHECK-NEXT:    shadd.4h v0, v0, v1
; CHECK-NEXT:    ret
; v4i8 is not a legal register type, so each input is sign-extended
; in-register (shl/sshr by 8) before the shadd.4h is formed.
  %zextsrc1 = sext <4 x i8> %src1 to <4 x i16>
  %zextsrc2 = sext <4 x i8> %src2 to <4 x i16>
  %add = add nsw <4 x i16> %zextsrc1, %zextsrc2
  %resulti8 = ashr <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1>
  ret <4 x i16> %resulti8
}
825
define <4 x i16> @hadd8_zext_asr(<4 x i8> %src1, <4 x i8> %src2) {
; CHECK-LABEL: hadd8_zext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    bic.4h v1, #255, lsl #8
; CHECK-NEXT:    bic.4h v0, #255, lsl #8
; CHECK-NEXT:    uhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
; NOTE(review): lshr is used below despite the _asr name — equivalent since
; the nuw sum of zexts is non-negative. The v4i8 zexts become bic masks and
; the average folds to uhadd.4h.
  %zextsrc1 = zext <4 x i8> %src1 to <4 x i16>
  %zextsrc2 = zext <4 x i8> %src2 to <4 x i16>
  %add = add nuw nsw <4 x i16> %zextsrc1, %zextsrc2
  %resulti8 = lshr <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1>
  ret <4 x i16> %resulti8
}
839
define <4 x i16> @hadd8_sext_lsr(<4 x i8> %src1, <4 x i8> %src2) {
; CHECK-LABEL: hadd8_sext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shl.4h v0, v0, #8
; CHECK-NEXT:    shl.4h v1, v1, #8
; CHECK-NEXT:    sshr.4h v0, v0, #8
; CHECK-NEXT:    ssra.4h v0, v1, #8
; CHECK-NEXT:    ushr.4h v0, v0, #1
; CHECK-NEXT:    ret
; Negative test: lshr of a sign-extended sum is not shadd, so the
; sign-extensions, add (folded into ssra) and ushr remain separate.
  %zextsrc1 = sext <4 x i8> %src1 to <4 x i16>
  %zextsrc2 = sext <4 x i8> %src2 to <4 x i16>
  %add = add nsw <4 x i16> %zextsrc1, %zextsrc2
  %resulti8 = lshr <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1>
  ret <4 x i16> %resulti8
}
855
define <4 x i16> @hadd8_zext_lsr(<4 x i8> %src1, <4 x i8> %src2) {
; CHECK-LABEL: hadd8_zext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    bic.4h v1, #255, lsl #8
; CHECK-NEXT:    bic.4h v0, #255, lsl #8
; CHECK-NEXT:    uhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
; v4i8 zexts become bic masks; zext + add + lshr folds to uhadd.4h.
  %zextsrc1 = zext <4 x i8> %src1 to <4 x i16>
  %zextsrc2 = zext <4 x i8> %src2 to <4 x i16>
  %add = add nuw nsw <4 x i16> %zextsrc1, %zextsrc2
  %resulti8 = lshr <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1>
  ret <4 x i16> %resulti8
}
869
define <2 x i16> @hadd8x2_sext_asr(<2 x i8> %src1, <2 x i8> %src2) {
; CHECK-LABEL: hadd8x2_sext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shl.2s v1, v1, #24
; CHECK-NEXT:    shl.2s v0, v0, #24
; CHECK-NEXT:    sshr.2s v1, v1, #24
; CHECK-NEXT:    sshr.2s v0, v0, #24
; CHECK-NEXT:    shadd.2s v0, v0, v1
; CHECK-NEXT:    ret
; Two-element variant: v2i8 inputs are sign-extended in-register (shl/sshr
; by 24 within 2s lanes) and then shadd.2s is formed.
  %zextsrc1 = sext <2 x i8> %src1 to <2 x i16>
  %zextsrc2 = sext <2 x i8> %src2 to <2 x i16>
  %add = add nsw <2 x i16> %zextsrc1, %zextsrc2
  %resulti8 = ashr <2 x i16> %add, <i16 1, i16 1>
  ret <2 x i16> %resulti8
}
885
define <2 x i16> @hadd8x2_zext_asr(<2 x i8> %src1, <2 x i8> %src2) {
; CHECK-LABEL: hadd8x2_zext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi d2, #0x0000ff000000ff
; CHECK-NEXT:    and.8b v1, v1, v2
; CHECK-NEXT:    and.8b v0, v0, v2
; CHECK-NEXT:    uhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
; NOTE(review): lshr is used below despite the _asr name — equivalent since
; the nuw sum of zexts is non-negative. Zexts become and-masks; uhadd.2s.
  %zextsrc1 = zext <2 x i8> %src1 to <2 x i16>
  %zextsrc2 = zext <2 x i8> %src2 to <2 x i16>
  %add = add nuw nsw <2 x i16> %zextsrc1, %zextsrc2
  %resulti8 = lshr <2 x i16> %add, <i16 1, i16 1>
  ret <2 x i16> %resulti8
}
900
define <2 x i16> @hadd8x2_sext_lsr(<2 x i8> %src1, <2 x i8> %src2) {
; CHECK-LABEL: hadd8x2_sext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shl.2s v0, v0, #24
; CHECK-NEXT:    shl.2s v1, v1, #24
; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
; CHECK-NEXT:    sshr.2s v0, v0, #24
; CHECK-NEXT:    ssra.2s v0, v1, #24
; CHECK-NEXT:    and.8b v0, v0, v2
; CHECK-NEXT:    ushr.2s v0, v0, #1
; CHECK-NEXT:    ret
; Negative test: lshr of a sign-extended sum is not shadd, so the extension,
; add and shift stay separate (plus a mask to model the i16 lanes in 2s).
  %zextsrc1 = sext <2 x i8> %src1 to <2 x i16>
  %zextsrc2 = sext <2 x i8> %src2 to <2 x i16>
  %add = add nsw <2 x i16> %zextsrc1, %zextsrc2
  %resulti8 = lshr <2 x i16> %add, <i16 1, i16 1>
  ret <2 x i16> %resulti8
}
918
define <2 x i16> @hadd8x2_zext_lsr(<2 x i8> %src1, <2 x i8> %src2) {
; CHECK-LABEL: hadd8x2_zext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi d2, #0x0000ff000000ff
; CHECK-NEXT:    and.8b v1, v1, v2
; CHECK-NEXT:    and.8b v0, v0, v2
; CHECK-NEXT:    uhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
; v2i8 zexts become and-masks; zext + add + lshr folds to uhadd.2s.
  %zextsrc1 = zext <2 x i8> %src1 to <2 x i16>
  %zextsrc2 = zext <2 x i8> %src2 to <2 x i16>
  %add = add nuw nsw <2 x i16> %zextsrc1, %zextsrc2
  %resulti8 = lshr <2 x i16> %add, <i16 1, i16 1>
  ret <2 x i16> %resulti8
}
933
define <4 x i16> @rhadd8_sext_asr(<4 x i8> %src1, <4 x i8> %src2) {
; CHECK-LABEL: rhadd8_sext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shl.4h v1, v1, #8
; CHECK-NEXT:    shl.4h v0, v0, #8
; CHECK-NEXT:    sshr.4h v1, v1, #8
; CHECK-NEXT:    sshr.4h v0, v0, #8
; CHECK-NEXT:    srhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
; Adding +1 before the ashr makes this the rounding halving add, so srhadd
; (not shadd) is formed after the in-register sign extensions.
  %zextsrc1 = sext <4 x i8> %src1 to <4 x i16>
  %zextsrc2 = sext <4 x i8> %src2 to <4 x i16>
  %add = add nsw <4 x i16> %zextsrc1, %zextsrc2
  %add2 = add nsw <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1>
  %resulti8 = ashr <4 x i16> %add2, <i16 1, i16 1, i16 1, i16 1>
  ret <4 x i16> %resulti8
}
950
define <4 x i16> @rhadd8_zext_asr(<4 x i8> %src1, <4 x i8> %src2) {
; CHECK-LABEL: rhadd8_zext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    bic.4h v1, #255, lsl #8
; CHECK-NEXT:    bic.4h v0, #255, lsl #8
; CHECK-NEXT:    urhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
; NOTE(review): lshr despite the _asr name — equivalent as the nuw sum of
; zexts is non-negative. The +1 makes this the rounding form, urhadd.4h.
  %zextsrc1 = zext <4 x i8> %src1 to <4 x i16>
  %zextsrc2 = zext <4 x i8> %src2 to <4 x i16>
  %add = add nuw nsw <4 x i16> %zextsrc1, %zextsrc2
  %add2 = add nuw nsw <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1>
  %resulti8 = lshr <4 x i16> %add2, <i16 1, i16 1, i16 1, i16 1>
  ret <4 x i16> %resulti8
}
965
define <4 x i16> @rhadd8_sext_lsr(<4 x i8> %src1, <4 x i8> %src2) {
; CHECK-LABEL: rhadd8_sext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shl.4h v0, v0, #8
; CHECK-NEXT:    shl.4h v1, v1, #8
; CHECK-NEXT:    movi.4h v2, #1
; CHECK-NEXT:    sshr.4h v0, v0, #8
; CHECK-NEXT:    ssra.4h v0, v1, #8
; CHECK-NEXT:    add.4h v0, v0, v2
; CHECK-NEXT:    ushr.4h v0, v0, #1
; CHECK-NEXT:    ret
; Negative test: lshr of a sign-extended rounded sum is not srhadd, so the
; whole sequence (extends, adds, ushr) is kept expanded.
  %zextsrc1 = sext <4 x i8> %src1 to <4 x i16>
  %zextsrc2 = sext <4 x i8> %src2 to <4 x i16>
  %add = add nsw <4 x i16> %zextsrc1, %zextsrc2
  %add2 = add nsw <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1>
  %resulti8 = lshr <4 x i16> %add2, <i16 1, i16 1, i16 1, i16 1>
  ret <4 x i16> %resulti8
}
984
define <4 x i16> @rhadd8_zext_lsr(<4 x i8> %src1, <4 x i8> %src2) {
; CHECK-LABEL: rhadd8_zext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    bic.4h v1, #255, lsl #8
; CHECK-NEXT:    bic.4h v0, #255, lsl #8
; CHECK-NEXT:    urhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
; zext + add + (+1) + lshr is the unsigned rounding halving add: urhadd.4h.
  %zextsrc1 = zext <4 x i8> %src1 to <4 x i16>
  %zextsrc2 = zext <4 x i8> %src2 to <4 x i16>
  %add = add nuw nsw <4 x i16> %zextsrc1, %zextsrc2
  %add2 = add nuw nsw <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1>
  %resulti8 = lshr <4 x i16> %add2, <i16 1, i16 1, i16 1, i16 1>
  ret <4 x i16> %resulti8
}
999
define <2 x i16> @rhadd8x2_sext_asr(<2 x i8> %src1, <2 x i8> %src2) {
; CHECK-LABEL: rhadd8x2_sext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shl.2s v1, v1, #24
; CHECK-NEXT:    shl.2s v0, v0, #24
; CHECK-NEXT:    sshr.2s v1, v1, #24
; CHECK-NEXT:    sshr.2s v0, v0, #24
; CHECK-NEXT:    srhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
; Two-element rounding variant: in-register sign extensions then srhadd.2s.
  %zextsrc1 = sext <2 x i8> %src1 to <2 x i16>
  %zextsrc2 = sext <2 x i8> %src2 to <2 x i16>
  %add = add nsw <2 x i16> %zextsrc1, %zextsrc2
  %add2 = add nsw <2 x i16> %add, <i16 1, i16 1>
  %resulti8 = ashr <2 x i16> %add2, <i16 1, i16 1>
  ret <2 x i16> %resulti8
}
1016
define <2 x i16> @rhadd8x2_zext_asr(<2 x i8> %src1, <2 x i8> %src2) {
; CHECK-LABEL: rhadd8x2_zext_asr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi d2, #0x0000ff000000ff
; CHECK-NEXT:    and.8b v1, v1, v2
; CHECK-NEXT:    and.8b v0, v0, v2
; CHECK-NEXT:    urhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
; NOTE(review): lshr despite the _asr name — equivalent for a non-negative
; nuw zext sum. Zexts become and-masks and the rounding form is urhadd.2s.
  %zextsrc1 = zext <2 x i8> %src1 to <2 x i16>
  %zextsrc2 = zext <2 x i8> %src2 to <2 x i16>
  %add = add nuw nsw <2 x i16> %zextsrc1, %zextsrc2
  %add2 = add nuw nsw <2 x i16> %add, <i16 1, i16 1>
  %resulti8 = lshr <2 x i16> %add2, <i16 1, i16 1>
  ret <2 x i16> %resulti8
}
1032
define <2 x i16> @rhadd8x2_sext_lsr(<2 x i8> %src1, <2 x i8> %src2) {
; CHECK-LABEL: rhadd8x2_sext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shl.2s v0, v0, #24
; CHECK-NEXT:    shl.2s v1, v1, #24
; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
; CHECK-NEXT:    sshr.2s v0, v0, #24
; CHECK-NEXT:    sshr.2s v1, v1, #24
; CHECK-NEXT:    mvn.8b v0, v0
; CHECK-NEXT:    sub.2s v0, v1, v0
; CHECK-NEXT:    and.8b v0, v0, v2
; CHECK-NEXT:    ushr.2s v0, v0, #1
; CHECK-NEXT:    ret
; Negative test: lshr of a sign-extended rounded sum is not srhadd; note the
; a + b + 1 is rewritten as b - ~a (mvn/sub) by the DAG combiner.
  %zextsrc1 = sext <2 x i8> %src1 to <2 x i16>
  %zextsrc2 = sext <2 x i8> %src2 to <2 x i16>
  %add = add nsw <2 x i16> %zextsrc1, %zextsrc2
  %add2 = add nsw <2 x i16> %add, <i16 1, i16 1>
  %resulti8 = lshr <2 x i16> %add2, <i16 1, i16 1>
  ret <2 x i16> %resulti8
}
1053
define <2 x i16> @rhadd8x2_zext_lsr(<2 x i8> %src1, <2 x i8> %src2) {
; CHECK-LABEL: rhadd8x2_zext_lsr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi d2, #0x0000ff000000ff
; CHECK-NEXT:    and.8b v1, v1, v2
; CHECK-NEXT:    and.8b v0, v0, v2
; CHECK-NEXT:    urhadd.2s v0, v0, v1
; CHECK-NEXT:    ret
; v2i8 zexts become and-masks; the rounded average folds to urhadd.2s.
  %zextsrc1 = zext <2 x i8> %src1 to <2 x i16>
  %zextsrc2 = zext <2 x i8> %src2 to <2 x i16>
  %add = add nuw nsw <2 x i16> %zextsrc1, %zextsrc2
  %add2 = add nuw nsw <2 x i16> %add, <i16 1, i16 1>
  %resulti8 = lshr <2 x i16> %add2, <i16 1, i16 1>
  ret <2 x i16> %resulti8
}
1069
1070
define void @testLowerToSHADD8b_c(<8 x i8> %src1, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSHADD8b_c:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.8b v1, #10
; CHECK-NEXT:    shadd.8b v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
; Constant-splat operand: shadd.8b is formed against a movi #10 vector. The
; trunc to i8 discards the bit where lshr and ashr of the i16 sum differ,
; which is what allows the signed halving add despite the lshr.
  %sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
  %add = add nsw <8 x i16> %sextsrc1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %resulti16 = lshr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  store <8 x i8> %result, ptr %dest, align 8
  ret void
}
1085
define void @testLowerToSHADD4h_c(<4 x i16> %src1, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSHADD4h_c:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.4h v1, #10
; CHECK-NEXT:    shadd.4h v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
; Same constant-splat shadd pattern as the 8b case, at 4h width.
  %sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
  %add = add nsw <4 x i32> %sextsrc1, <i32 10, i32 10, i32 10, i32 10>
  %resulti16 = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %resulti16 to <4 x i16>
  store <4 x i16> %result, ptr %dest, align 8
  ret void
}
1100
define void @testLowerToSHADD2s_c(<2 x i32> %src1, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSHADD2s_c:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.2s v1, #10
; CHECK-NEXT:    shadd.2s v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
; Same constant-splat shadd pattern, at 2s width.
  %sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
  %add = add nsw <2 x i64> %sextsrc1, <i64 10, i64 10>
  %resulti16 = lshr <2 x i64> %add, <i64 1, i64 1>
  %result = trunc <2 x i64> %resulti16 to <2 x i32>
  store <2 x i32> %result, ptr %dest, align 8
  ret void
}
1115
define void @testLowerToSHADD16b_c(<16 x i8> %src1, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSHADD16b_c:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.16b v1, #10
; CHECK-NEXT:    shadd.16b v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
; Constant-splat shadd pattern at the 128-bit 16b width.
  %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
  %add = add nsw <16 x i16> %sextsrc1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %resulti16 = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  store <16 x i8> %result, ptr %dest, align 16
  ret void
}
1130
define void @testLowerToSHADD8h_c(<8 x i16> %src1, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSHADD8h_c:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.8h v1, #10
; CHECK-NEXT:    shadd.8h v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
; Constant-splat shadd pattern at the 128-bit 8h width.
  %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %add = add nsw <8 x i32> %sextsrc1, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  store <8 x i16> %result, ptr %dest, align 16
  ret void
}
1145
define void @testLowerToSHADD4s_c(<4 x i32> %src1, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSHADD4s_c:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.4s v1, #10
; CHECK-NEXT:    shadd.4s v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
; Constant-splat shadd pattern at the 128-bit 4s width.
  %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
  %add = add nsw <4 x i64> %sextsrc1, <i64 10, i64 10, i64 10, i64 10>
  %resulti16 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  store <4 x i32> %result, ptr %dest, align 16
  ret void
}
1160
define void @testLowerToUHADD8b_c(<8 x i8> %src1, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToUHADD8b_c:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.8b v1, #10
; CHECK-NEXT:    uhadd.8b v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
; Unsigned constant-splat variant: zext + add 10 + lshr + trunc folds to
; uhadd.8b against a movi #10 vector.
  %zextsrc1 = zext <8 x i8> %src1 to <8 x i16>
  %add = add nuw nsw <8 x i16> %zextsrc1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %resulti16 = lshr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  store <8 x i8> %result, ptr %dest, align 8
  ret void
}
1175
define void @testLowerToUHADD4h_c(<4 x i16> %src1, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToUHADD4h_c:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.4h v1, #10
; CHECK-NEXT:    uhadd.4h v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
; Constant-splat uhadd pattern at 4h width.
  %zextsrc1 = zext <4 x i16> %src1 to <4 x i32>
  %add = add nuw nsw <4 x i32> %zextsrc1, <i32 10, i32 10, i32 10, i32 10>
  %resulti16 = lshr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %resulti16 to <4 x i16>
  store <4 x i16> %result, ptr %dest, align 8
  ret void
}
1190
define void @testLowerToUHADD2s_c(<2 x i32> %src1, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToUHADD2s_c:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.2s v1, #10
; CHECK-NEXT:    uhadd.2s v0, v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
; Constant-splat uhadd pattern at 2s width.
  %zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
  %add = add nuw nsw <2 x i64> %zextsrc1, <i64 10, i64 10>
  %resulti16 = lshr <2 x i64> %add, <i64 1, i64 1>
  %result = trunc <2 x i64> %resulti16 to <2 x i32>
  store <2 x i32> %result, ptr %dest, align 8
  ret void
}
1205
define void @testLowerToUHADD16b_c(<16 x i8> %src1, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToUHADD16b_c:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.16b v1, #10
; CHECK-NEXT:    uhadd.16b v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
; Constant-splat uhadd pattern at the 128-bit 16b width.
  %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
  %add = add nuw nsw <16 x i16> %zextsrc1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %resulti16 = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  store <16 x i8> %result, ptr %dest, align 16
  ret void
}
1220
define void @testLowerToUHADD8h_c(<8 x i16> %src1, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToUHADD8h_c:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.8h v1, #10
; CHECK-NEXT:    uhadd.8h v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
; Constant-splat uhadd pattern at the 128-bit 8h width.
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %add = add nuw nsw <8 x i32> %zextsrc1, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  store <8 x i16> %result, ptr %dest, align 16
  ret void
}
1235
define void @testLowerToUHADD4s_c(<4 x i32> %src1, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToUHADD4s_c:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.4s v1, #10
; CHECK-NEXT:    uhadd.4s v0, v0, v1
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
; Constant-splat uhadd pattern at the 128-bit 4s width.
  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
  %add = add nuw nsw <4 x i64> %zextsrc1, <i64 10, i64 10, i64 10, i64 10>
  %resulti16 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  store <4 x i32> %result, ptr %dest, align 16
  ret void
}
1250
define <8 x i8> @andmaskv8i8(<8 x i16> %src1, <8 x i8> %src2) {
; CHECK-LABEL: andmaskv8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.8b v2, #7
; CHECK-NEXT:    xtn.8b v0, v0
; CHECK-NEXT:    and.8b v0, v0, v2
; CHECK-NEXT:    uhadd.8b v0, v0, v1
; CHECK-NEXT:    ret
; The `and` with 7 leaves only low bits set, acting like a zero-extend, so
; the halving add can be done on i8 (xtn + mask + uhadd.8b).
  %zextsrc1 = and <8 x i16> %src1, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %zextsrc2 = zext <8 x i8> %src2 to <8 x i16>
  %add = add nuw nsw <8 x i16> %zextsrc1, %zextsrc2
  %resulti16 = lshr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  ret <8 x i8> %result
}
1266
define <16 x i8> @andmaskv16i8(<16 x i16> %src1, <16 x i8> %src2) {
; CHECK-LABEL: andmaskv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.16b v3, #7
; CHECK-NEXT:    uzp1.16b v0, v0, v1
; CHECK-NEXT:    and.16b v0, v0, v3
; CHECK-NEXT:    uhadd.16b v0, v0, v2
; CHECK-NEXT:    ret
; Same and-as-zext trick at 16b width; the two 8h halves are truncated with
; uzp1 before masking and uhadd.
  %zextsrc1 = and <16 x i16> %src1, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
  %add = add nuw nsw <16 x i16> %zextsrc1, %zextsrc2
  %resulti16 = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  ret <16 x i8> %result
}
1282
define <16 x i8> @andmask2v16i8(<16 x i16> %src1, <16 x i16> %src2) {
; CHECK-LABEL: andmask2v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp1.16b v2, v2, v3
; CHECK-NEXT:    movi.16b v3, #3
; CHECK-NEXT:    uzp1.16b v0, v0, v1
; CHECK-NEXT:    movi.16b v1, #7
; CHECK-NEXT:    and.16b v2, v2, v3
; CHECK-NEXT:    and.16b v0, v0, v2
; CHECK-NEXT:    uhadd.16b v0, v0, v2
; CHECK-NEXT:    ret
; Both operands are masked (7 and 3), so both count as zero-extended narrow
; values and the add/shift still folds to uhadd.16b.
  %zextsrc1 = and <16 x i16> %src1, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %zextsrc2 = and <16 x i16> %src2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  %add = add nuw nsw <16 x i16> %zextsrc1, %zextsrc2
  %resulti16 = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  ret <16 x i8> %result
}
1301
define <8 x i8> @andmask2v8i8(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: andmask2v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.8b v2, #7
; CHECK-NEXT:    xtn.8b v0, v0
; CHECK-NEXT:    xtn.8b v1, v1
; CHECK-NEXT:    and.8b v0, v0, v2
; CHECK-NEXT:    uhadd.8b v0, v0, v1
; CHECK-NEXT:    ret
; Masks of 7 and 255 both fit in i8; the 255 mask needs no `and` after the
; xtn truncation, so only one operand is masked before uhadd.8b.
  %zextsrc1 = and <8 x i16> %src1, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %zextsrc2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %add = add nuw nsw <8 x i16> %zextsrc1, %zextsrc2
  %resulti16 = lshr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %resulti16 to <8 x i8>
  ret <8 x i8> %result
}
1318
define <8 x i16> @andmask3v8i8(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: andmask3v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.8h v2, #7
; CHECK-NEXT:    bic.8h v1, #254, lsl #8
; CHECK-NEXT:    and.16b v0, v0, v2
; CHECK-NEXT:    uhadd.8h v0, v0, v1
; CHECK-NEXT:    ret
; The 511 mask (9 bits) does not fit i8, so the halving add stays at 8h
; width and the result is returned as i16 without truncation.
  %zextsrc1 = and <8 x i16> %src1, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %zextsrc2 = and <8 x i16> %src2, <i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511, i16 511>
  %add = add nuw nsw <8 x i16> %zextsrc1, %zextsrc2
  %resulti16 = lshr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <8 x i16> %resulti16
}
1333
define <16 x i8> @sextmaskv16i8(<16 x i16> %src1, <16 x i8> %src2) {
; CHECK-LABEL: sextmaskv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sshr.8h v1, v1, #11
; CHECK-NEXT:    sshr.8h v0, v0, #11
; CHECK-NEXT:    uzp1.16b v0, v0, v1
; CHECK-NEXT:    shadd.16b v0, v0, v2
; CHECK-NEXT:    ret
; ashr by 11 leaves values in [-16,15], which fit in i8 after truncation,
; so the signed halving add can be formed as shadd.16b.
  %sextsrc1 = ashr <16 x i16> %src1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
  %add = add nsw <16 x i16> %sextsrc1, %sextsrc2
  %1 = ashr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %result
}
1349
define <8 x i8> @sextmaskv8i8(<8 x i16> %src1, <8 x i8> %src2) {
; CHECK-LABEL: sextmaskv8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sshr.8h v0, v0, #11
; CHECK-NEXT:    xtn.8b v0, v0
; CHECK-NEXT:    shadd.8b v0, v0, v1
; CHECK-NEXT:    ret
; Same ashr-narrowed signed pattern as sextmaskv16i8, at 8b width.
  %sextsrc1 = ashr <8 x i16> %src1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
  %add = add nsw <8 x i16> %sextsrc1, %sextsrc2
  %1 = ashr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %1 to <8 x i8>
  ret <8 x i8> %result
}
1364
define <8 x i8> @sextmask2v8i8(<8 x i16> %src1, <8 x i8> %src2) {
; CHECK-LABEL: sextmask2v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shrn.8b v0, v0, #8
; CHECK-NEXT:    shadd.8b v0, v0, v1
; CHECK-NEXT:    ret
; ashr by exactly 8 plus trunc becomes a single shrn, then shadd.8b.
  %sextsrc1 = ashr <8 x i16> %src1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
  %add = add nsw <8 x i16> %sextsrc1, %sextsrc2
  %1 = ashr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %1 to <8 x i8>
  ret <8 x i8> %result
}
1378
define <8 x i8> @sextmask3v8i8(<8 x i16> %src1, <8 x i8> %src2) {
; CHECK-LABEL: sextmask3v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushr.8h v0, v0, #7
; CHECK-NEXT:    sshll.8h v1, v1, #0
; CHECK-NEXT:    shadd.8h v0, v0, v1
; CHECK-NEXT:    xtn.8b v0, v0
; CHECK-NEXT:    ret
; ashr by 7 leaves 9 significant bits — too wide for an i8 halving add — so
; shadd is formed at 8h width and only truncated to 8b afterwards.
  %1 = ashr <8 x i16> %src1, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %sextsrc2 = sext <8 x i8> %src2 to <8 x i16>
  %add = add nsw <8 x i16> %1, %sextsrc2
  %2 = ashr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %2 to <8 x i8>
  ret <8 x i8> %result
}
1394
define <4 x i16> @ext_via_i19(<4 x i16> %a) {
; CHECK-LABEL: ext_via_i19:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.4h v1, #1
; CHECK-NEXT:    urhadd.4h v0, v0, v1
; CHECK-NEXT:    ret
; The round trip through the illegal <4 x i19> type must not block the
; combine: the whole sequence still folds to urhadd.4h with a splat of 1.
  %t3 = zext <4 x i16> %a to <4 x i32>
  %t4 = add <4 x i32> %t3, <i32 1, i32 1, i32 1, i32 1>
  %t5 = trunc <4 x i32> %t4 to <4 x i19>
  %new0 = add <4 x i19> %t5, <i19 1, i19 1, i19 1, i19 1>
  %new1 = lshr <4 x i19> %new0, <i19 1, i19 1, i19 1, i19 1>
  %last = zext <4 x i19> %new1 to <4 x i32>
  %t6 = trunc <4 x i32> %last to <4 x i16>
  ret <4 x i16> %t6
}
1410
1411declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>)
1412declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>)
1413declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>)
1414declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>)
1415declare <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>)
1416declare <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>)
1417declare <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>)
1418declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>)
1419declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>)
1420declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>)
1421declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>)
1422declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>)
1423