1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -mattr=+aes | FileCheck %s
3
4define <8 x i16> @smull8h(ptr %A, ptr %B) nounwind {
5; CHECK-LABEL: smull8h:
6; CHECK:       // %bb.0:
7; CHECK-NEXT:    ldr d0, [x0]
8; CHECK-NEXT:    ldr d1, [x1]
9; CHECK-NEXT:    smull.8h v0, v0, v1
10; CHECK-NEXT:    ret
11  %tmp1 = load <8 x i8>, ptr %A
12  %tmp2 = load <8 x i8>, ptr %B
13  %tmp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
14  ret <8 x i16> %tmp3
15}
16
17define <4 x i32> @smull4s(ptr %A, ptr %B) nounwind {
18; CHECK-LABEL: smull4s:
19; CHECK:       // %bb.0:
20; CHECK-NEXT:    ldr d0, [x0]
21; CHECK-NEXT:    ldr d1, [x1]
22; CHECK-NEXT:    smull.4s v0, v0, v1
23; CHECK-NEXT:    ret
24  %tmp1 = load <4 x i16>, ptr %A
25  %tmp2 = load <4 x i16>, ptr %B
26  %tmp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
27  ret <4 x i32> %tmp3
28}
29
30define <2 x i64> @smull2d(ptr %A, ptr %B) nounwind {
31; CHECK-LABEL: smull2d:
32; CHECK:       // %bb.0:
33; CHECK-NEXT:    ldr d0, [x0]
34; CHECK-NEXT:    ldr d1, [x1]
35; CHECK-NEXT:    smull.2d v0, v0, v1
36; CHECK-NEXT:    ret
37  %tmp1 = load <2 x i32>, ptr %A
38  %tmp2 = load <2 x i32>, ptr %B
39  %tmp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
40  ret <2 x i64> %tmp3
41}
42
43declare <8 x i16>  @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
44declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
45declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
46
47define <8 x i16> @umull8h(ptr %A, ptr %B) nounwind {
48; CHECK-LABEL: umull8h:
49; CHECK:       // %bb.0:
50; CHECK-NEXT:    ldr d0, [x0]
51; CHECK-NEXT:    ldr d1, [x1]
52; CHECK-NEXT:    umull.8h v0, v0, v1
53; CHECK-NEXT:    ret
54  %tmp1 = load <8 x i8>, ptr %A
55  %tmp2 = load <8 x i8>, ptr %B
56  %tmp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
57  ret <8 x i16> %tmp3
58}
59
60define <4 x i32> @umull4s(ptr %A, ptr %B) nounwind {
61; CHECK-LABEL: umull4s:
62; CHECK:       // %bb.0:
63; CHECK-NEXT:    ldr d0, [x0]
64; CHECK-NEXT:    ldr d1, [x1]
65; CHECK-NEXT:    umull.4s v0, v0, v1
66; CHECK-NEXT:    ret
67  %tmp1 = load <4 x i16>, ptr %A
68  %tmp2 = load <4 x i16>, ptr %B
69  %tmp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
70  ret <4 x i32> %tmp3
71}
72
73define <2 x i64> @umull2d(ptr %A, ptr %B) nounwind {
74; CHECK-LABEL: umull2d:
75; CHECK:       // %bb.0:
76; CHECK-NEXT:    ldr d0, [x0]
77; CHECK-NEXT:    ldr d1, [x1]
78; CHECK-NEXT:    umull.2d v0, v0, v1
79; CHECK-NEXT:    ret
80  %tmp1 = load <2 x i32>, ptr %A
81  %tmp2 = load <2 x i32>, ptr %B
82  %tmp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
83  ret <2 x i64> %tmp3
84}
85
86declare <8 x i16>  @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
87declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
88declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
89
90define <4 x i32> @sqdmull4s(ptr %A, ptr %B) nounwind {
91; CHECK-LABEL: sqdmull4s:
92; CHECK:       // %bb.0:
93; CHECK-NEXT:    ldr d0, [x0]
94; CHECK-NEXT:    ldr d1, [x1]
95; CHECK-NEXT:    sqdmull.4s v0, v0, v1
96; CHECK-NEXT:    ret
97  %tmp1 = load <4 x i16>, ptr %A
98  %tmp2 = load <4 x i16>, ptr %B
99  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
100  ret <4 x i32> %tmp3
101}
102
103define <2 x i64> @sqdmull2d(ptr %A, ptr %B) nounwind {
104; CHECK-LABEL: sqdmull2d:
105; CHECK:       // %bb.0:
106; CHECK-NEXT:    ldr d0, [x0]
107; CHECK-NEXT:    ldr d1, [x1]
108; CHECK-NEXT:    sqdmull.2d v0, v0, v1
109; CHECK-NEXT:    ret
110  %tmp1 = load <2 x i32>, ptr %A
111  %tmp2 = load <2 x i32>, ptr %B
112  %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
113  ret <2 x i64> %tmp3
114}
115
116define <4 x i32> @sqdmull2_4s(ptr %A, ptr %B) nounwind {
117; CHECK-LABEL: sqdmull2_4s:
118; CHECK:       // %bb.0:
119; CHECK-NEXT:    ldr d0, [x0, #8]
120; CHECK-NEXT:    ldr d1, [x1, #8]
121; CHECK-NEXT:    sqdmull.4s v0, v0, v1
122; CHECK-NEXT:    ret
123  %load1 = load <8 x i16>, ptr %A
124  %load2 = load <8 x i16>, ptr %B
125  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
126  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
127  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
128  ret <4 x i32> %tmp3
129}
130
131define <2 x i64> @sqdmull2_2d(ptr %A, ptr %B) nounwind {
132; CHECK-LABEL: sqdmull2_2d:
133; CHECK:       // %bb.0:
134; CHECK-NEXT:    ldr d0, [x0, #8]
135; CHECK-NEXT:    ldr d1, [x1, #8]
136; CHECK-NEXT:    sqdmull.2d v0, v0, v1
137; CHECK-NEXT:    ret
138  %load1 = load <4 x i32>, ptr %A
139  %load2 = load <4 x i32>, ptr %B
140  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
141  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
142  %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
143  ret <2 x i64> %tmp3
144}
145
146
147declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
148declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
149
150define <8 x i16> @pmull8h(ptr %A, ptr %B) nounwind {
151; CHECK-LABEL: pmull8h:
152; CHECK:       // %bb.0:
153; CHECK-NEXT:    ldr d0, [x0]
154; CHECK-NEXT:    ldr d1, [x1]
155; CHECK-NEXT:    pmull.8h v0, v0, v1
156; CHECK-NEXT:    ret
157  %tmp1 = load <8 x i8>, ptr %A
158  %tmp2 = load <8 x i8>, ptr %B
159  %tmp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
160  ret <8 x i16> %tmp3
161}
162
163declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
164
165define <4 x i16> @sqdmulh_4h(ptr %A, ptr %B) nounwind {
166; CHECK-LABEL: sqdmulh_4h:
167; CHECK:       // %bb.0:
168; CHECK-NEXT:    ldr d0, [x0]
169; CHECK-NEXT:    ldr d1, [x1]
170; CHECK-NEXT:    sqdmulh.4h v0, v0, v1
171; CHECK-NEXT:    ret
172  %tmp1 = load <4 x i16>, ptr %A
173  %tmp2 = load <4 x i16>, ptr %B
174  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
175  ret <4 x i16> %tmp3
176}
177
178define <8 x i16> @sqdmulh_8h(ptr %A, ptr %B) nounwind {
179; CHECK-LABEL: sqdmulh_8h:
180; CHECK:       // %bb.0:
181; CHECK-NEXT:    ldr q0, [x0]
182; CHECK-NEXT:    ldr q1, [x1]
183; CHECK-NEXT:    sqdmulh.8h v0, v0, v1
184; CHECK-NEXT:    ret
185  %tmp1 = load <8 x i16>, ptr %A
186  %tmp2 = load <8 x i16>, ptr %B
187  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
188  ret <8 x i16> %tmp3
189}
190
191define <2 x i32> @sqdmulh_2s(ptr %A, ptr %B) nounwind {
192; CHECK-LABEL: sqdmulh_2s:
193; CHECK:       // %bb.0:
194; CHECK-NEXT:    ldr d0, [x0]
195; CHECK-NEXT:    ldr d1, [x1]
196; CHECK-NEXT:    sqdmulh.2s v0, v0, v1
197; CHECK-NEXT:    ret
198  %tmp1 = load <2 x i32>, ptr %A
199  %tmp2 = load <2 x i32>, ptr %B
200  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
201  ret <2 x i32> %tmp3
202}
203
204define <4 x i32> @sqdmulh_4s(ptr %A, ptr %B) nounwind {
205; CHECK-LABEL: sqdmulh_4s:
206; CHECK:       // %bb.0:
207; CHECK-NEXT:    ldr q0, [x0]
208; CHECK-NEXT:    ldr q1, [x1]
209; CHECK-NEXT:    sqdmulh.4s v0, v0, v1
210; CHECK-NEXT:    ret
211  %tmp1 = load <4 x i32>, ptr %A
212  %tmp2 = load <4 x i32>, ptr %B
213  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
214  ret <4 x i32> %tmp3
215}
216
217define i32 @sqdmulh_1s(ptr %A, ptr %B) nounwind {
218; CHECK-LABEL: sqdmulh_1s:
219; CHECK:       // %bb.0:
220; CHECK-NEXT:    ldr w8, [x0]
221; CHECK-NEXT:    ldr w9, [x1]
222; CHECK-NEXT:    fmov s0, w8
223; CHECK-NEXT:    fmov s1, w9
224; CHECK-NEXT:    sqdmulh s0, s0, s1
225; CHECK-NEXT:    fmov w0, s0
226; CHECK-NEXT:    ret
227  %tmp1 = load i32, ptr %A
228  %tmp2 = load i32, ptr %B
229  %tmp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2)
230  ret i32 %tmp3
231}
232
233declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
234declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
235declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
236declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
237declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone
238
239define <4 x i16> @sqrdmulh_4h(ptr %A, ptr %B) nounwind {
240; CHECK-LABEL: sqrdmulh_4h:
241; CHECK:       // %bb.0:
242; CHECK-NEXT:    ldr d0, [x0]
243; CHECK-NEXT:    ldr d1, [x1]
244; CHECK-NEXT:    sqrdmulh.4h v0, v0, v1
245; CHECK-NEXT:    ret
246  %tmp1 = load <4 x i16>, ptr %A
247  %tmp2 = load <4 x i16>, ptr %B
248  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
249  ret <4 x i16> %tmp3
250}
251
252define <8 x i16> @sqrdmulh_8h(ptr %A, ptr %B) nounwind {
253; CHECK-LABEL: sqrdmulh_8h:
254; CHECK:       // %bb.0:
255; CHECK-NEXT:    ldr q0, [x0]
256; CHECK-NEXT:    ldr q1, [x1]
257; CHECK-NEXT:    sqrdmulh.8h v0, v0, v1
258; CHECK-NEXT:    ret
259  %tmp1 = load <8 x i16>, ptr %A
260  %tmp2 = load <8 x i16>, ptr %B
261  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
262  ret <8 x i16> %tmp3
263}
264
265define <2 x i32> @sqrdmulh_2s(ptr %A, ptr %B) nounwind {
266; CHECK-LABEL: sqrdmulh_2s:
267; CHECK:       // %bb.0:
268; CHECK-NEXT:    ldr d0, [x0]
269; CHECK-NEXT:    ldr d1, [x1]
270; CHECK-NEXT:    sqrdmulh.2s v0, v0, v1
271; CHECK-NEXT:    ret
272  %tmp1 = load <2 x i32>, ptr %A
273  %tmp2 = load <2 x i32>, ptr %B
274  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
275  ret <2 x i32> %tmp3
276}
277
278define <4 x i32> @sqrdmulh_4s(ptr %A, ptr %B) nounwind {
279; CHECK-LABEL: sqrdmulh_4s:
280; CHECK:       // %bb.0:
281; CHECK-NEXT:    ldr q0, [x0]
282; CHECK-NEXT:    ldr q1, [x1]
283; CHECK-NEXT:    sqrdmulh.4s v0, v0, v1
284; CHECK-NEXT:    ret
285  %tmp1 = load <4 x i32>, ptr %A
286  %tmp2 = load <4 x i32>, ptr %B
287  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
288  ret <4 x i32> %tmp3
289}
290
291define i32 @sqrdmulh_1s(ptr %A, ptr %B) nounwind {
292; CHECK-LABEL: sqrdmulh_1s:
293; CHECK:       // %bb.0:
294; CHECK-NEXT:    ldr w8, [x0]
295; CHECK-NEXT:    ldr w9, [x1]
296; CHECK-NEXT:    fmov s0, w8
297; CHECK-NEXT:    fmov s1, w9
298; CHECK-NEXT:    sqrdmulh s0, s0, s1
299; CHECK-NEXT:    fmov w0, s0
300; CHECK-NEXT:    ret
301  %tmp1 = load i32, ptr %A
302  %tmp2 = load i32, ptr %B
303  %tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2)
304  ret i32 %tmp3
305}
306
307declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
308declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
309declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
310declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
311declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone
312
313define <2 x float> @fmulx_2s(ptr %A, ptr %B) nounwind {
314; CHECK-LABEL: fmulx_2s:
315; CHECK:       // %bb.0:
316; CHECK-NEXT:    ldr d0, [x0]
317; CHECK-NEXT:    ldr d1, [x1]
318; CHECK-NEXT:    fmulx.2s v0, v0, v1
319; CHECK-NEXT:    ret
320  %tmp1 = load <2 x float>, ptr %A
321  %tmp2 = load <2 x float>, ptr %B
322  %tmp3 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
323  ret <2 x float> %tmp3
324}
325
326define <4 x float> @fmulx_4s(ptr %A, ptr %B) nounwind {
327; CHECK-LABEL: fmulx_4s:
328; CHECK:       // %bb.0:
329; CHECK-NEXT:    ldr q0, [x0]
330; CHECK-NEXT:    ldr q1, [x1]
331; CHECK-NEXT:    fmulx.4s v0, v0, v1
332; CHECK-NEXT:    ret
333  %tmp1 = load <4 x float>, ptr %A
334  %tmp2 = load <4 x float>, ptr %B
335  %tmp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
336  ret <4 x float> %tmp3
337}
338
339define <2 x double> @fmulx_2d(ptr %A, ptr %B) nounwind {
340; CHECK-LABEL: fmulx_2d:
341; CHECK:       // %bb.0:
342; CHECK-NEXT:    ldr q0, [x0]
343; CHECK-NEXT:    ldr q1, [x1]
344; CHECK-NEXT:    fmulx.2d v0, v0, v1
345; CHECK-NEXT:    ret
346  %tmp1 = load <2 x double>, ptr %A
347  %tmp2 = load <2 x double>, ptr %B
348  %tmp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
349  ret <2 x double> %tmp3
350}
351
352declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
353declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
354declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
355
356define <4 x i32> @smlal4s(ptr %A, ptr %B, ptr %C) nounwind {
357; CHECK-LABEL: smlal4s:
358; CHECK:       // %bb.0:
359; CHECK-NEXT:    ldr d1, [x0]
360; CHECK-NEXT:    ldr d2, [x1]
361; CHECK-NEXT:    ldr q0, [x2]
362; CHECK-NEXT:    smlal.4s v0, v1, v2
363; CHECK-NEXT:    ret
364  %tmp1 = load <4 x i16>, ptr %A
365  %tmp2 = load <4 x i16>, ptr %B
366  %tmp3 = load <4 x i32>, ptr %C
367  %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
368  %tmp5 = add <4 x i32> %tmp3, %tmp4
369  ret <4 x i32> %tmp5
370}
371
372define <2 x i64> @smlal2d(ptr %A, ptr %B, ptr %C) nounwind {
373; CHECK-LABEL: smlal2d:
374; CHECK:       // %bb.0:
375; CHECK-NEXT:    ldr d1, [x0]
376; CHECK-NEXT:    ldr d2, [x1]
377; CHECK-NEXT:    ldr q0, [x2]
378; CHECK-NEXT:    smlal.2d v0, v1, v2
379; CHECK-NEXT:    ret
380  %tmp1 = load <2 x i32>, ptr %A
381  %tmp2 = load <2 x i32>, ptr %B
382  %tmp3 = load <2 x i64>, ptr %C
383  %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
384  %tmp5 = add <2 x i64> %tmp3, %tmp4
385  ret <2 x i64> %tmp5
386}
387
388define void @smlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
389; CHECK-LABEL: smlal8h_chain_with_constant:
390; CHECK:       // %bb.0:
391; CHECK-NEXT:    movi.16b v3, #1
392; CHECK-NEXT:    smlal.8h v3, v0, v2
393; CHECK-NEXT:    mvn.8b v0, v2
394; CHECK-NEXT:    smlal.8h v3, v1, v0
395; CHECK-NEXT:    str q3, [x0]
396; CHECK-NEXT:    ret
397  %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
398  %smull.1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
399  %add.1 = add <8 x i16> %smull.1, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
400  %smull.2 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
401  %add.2 = add <8 x i16> %add.1, %smull.2
402  store <8 x i16> %add.2, ptr %dst
403  ret void
404}
405
406define void @smlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
407; CHECK-LABEL: smlal2d_chain_with_constant:
408; CHECK:       // %bb.0:
409; CHECK-NEXT:    mov w8, #257 // =0x101
410; CHECK-NEXT:    dup.2d v3, x8
411; CHECK-NEXT:    smlal.2d v3, v0, v2
412; CHECK-NEXT:    mvn.8b v0, v2
413; CHECK-NEXT:    smlal.2d v3, v1, v0
414; CHECK-NEXT:    str q3, [x0]
415; CHECK-NEXT:    ret
416  %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
417  %smull.1 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
418  %add.1 = add <2 x i64> %smull.1, <i64 257, i64 257>
419  %smull.2 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
420  %add.2 = add <2 x i64> %add.1, %smull.2
421  store <2 x i64> %add.2, ptr %dst
422  ret void
423}
424
425define <4 x i32> @smlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
426; CHECK-LABEL: smlsl4s:
427; CHECK:       // %bb.0:
428; CHECK-NEXT:    ldr d1, [x0]
429; CHECK-NEXT:    ldr d2, [x1]
430; CHECK-NEXT:    ldr q0, [x2]
431; CHECK-NEXT:    smlsl.4s v0, v1, v2
432; CHECK-NEXT:    ret
433  %tmp1 = load <4 x i16>, ptr %A
434  %tmp2 = load <4 x i16>, ptr %B
435  %tmp3 = load <4 x i32>, ptr %C
436  %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
437  %tmp5 = sub <4 x i32> %tmp3, %tmp4
438  ret <4 x i32> %tmp5
439}
440
441define <2 x i64> @smlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
442; CHECK-LABEL: smlsl2d:
443; CHECK:       // %bb.0:
444; CHECK-NEXT:    ldr d1, [x0]
445; CHECK-NEXT:    ldr d2, [x1]
446; CHECK-NEXT:    ldr q0, [x2]
447; CHECK-NEXT:    smlsl.2d v0, v1, v2
448; CHECK-NEXT:    ret
449  %tmp1 = load <2 x i32>, ptr %A
450  %tmp2 = load <2 x i32>, ptr %B
451  %tmp3 = load <2 x i64>, ptr %C
452  %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
453  %tmp5 = sub <2 x i64> %tmp3, %tmp4
454  ret <2 x i64> %tmp5
455}
456
457define void @smlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
458; CHECK-LABEL: smlsl8h_chain_with_constant:
459; CHECK:       // %bb.0:
460; CHECK-NEXT:    movi.16b v3, #1
461; CHECK-NEXT:    smlsl.8h v3, v0, v2
462; CHECK-NEXT:    mvn.8b v0, v2
463; CHECK-NEXT:    smlsl.8h v3, v1, v0
464; CHECK-NEXT:    str q3, [x0]
465; CHECK-NEXT:    ret
466  %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
467  %smull.1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
468  %sub.1 = sub <8 x i16> <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>, %smull.1
469  %smull.2 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
470  %sub.2 = sub <8 x i16> %sub.1, %smull.2
471  store <8 x i16> %sub.2, ptr %dst
472  ret void
473}
474
475define void @smlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
476; CHECK-LABEL: smlsl2d_chain_with_constant:
477; CHECK:       // %bb.0:
478; CHECK-NEXT:    mov w8, #257 // =0x101
479; CHECK-NEXT:    dup.2d v3, x8
480; CHECK-NEXT:    smlsl.2d v3, v0, v2
481; CHECK-NEXT:    mvn.8b v0, v2
482; CHECK-NEXT:    smlsl.2d v3, v1, v0
483; CHECK-NEXT:    str q3, [x0]
484; CHECK-NEXT:    ret
485  %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
486  %smull.1 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
487  %sub.1 = sub <2 x i64> <i64 257, i64 257>, %smull.1
488  %smull.2 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
489  %sub.2 = sub <2 x i64> %sub.1, %smull.2
490  store <2 x i64> %sub.2, ptr %dst
491  ret void
492}
493
494declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
495declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
496declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
497declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
498
499define <4 x i32> @sqdmlal4s(ptr %A, ptr %B, ptr %C) nounwind {
500; CHECK-LABEL: sqdmlal4s:
501; CHECK:       // %bb.0:
502; CHECK-NEXT:    ldr d1, [x0]
503; CHECK-NEXT:    ldr d2, [x1]
504; CHECK-NEXT:    ldr q0, [x2]
505; CHECK-NEXT:    sqdmlal.4s v0, v1, v2
506; CHECK-NEXT:    ret
507  %tmp1 = load <4 x i16>, ptr %A
508  %tmp2 = load <4 x i16>, ptr %B
509  %tmp3 = load <4 x i32>, ptr %C
510  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
511  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
512  ret <4 x i32> %tmp5
513}
514
515define <2 x i64> @sqdmlal2d(ptr %A, ptr %B, ptr %C) nounwind {
516; CHECK-LABEL: sqdmlal2d:
517; CHECK:       // %bb.0:
518; CHECK-NEXT:    ldr d1, [x0]
519; CHECK-NEXT:    ldr d2, [x1]
520; CHECK-NEXT:    ldr q0, [x2]
521; CHECK-NEXT:    sqdmlal.2d v0, v1, v2
522; CHECK-NEXT:    ret
523  %tmp1 = load <2 x i32>, ptr %A
524  %tmp2 = load <2 x i32>, ptr %B
525  %tmp3 = load <2 x i64>, ptr %C
526  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
527  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
528  ret <2 x i64> %tmp5
529}
530
531define <4 x i32> @sqdmlal2_4s(ptr %A, ptr %B, ptr %C) nounwind {
532; CHECK-LABEL: sqdmlal2_4s:
533; CHECK:       // %bb.0:
534; CHECK-NEXT:    ldr q0, [x2]
535; CHECK-NEXT:    ldr d1, [x0, #8]
536; CHECK-NEXT:    ldr d2, [x1, #8]
537; CHECK-NEXT:    sqdmlal.4s v0, v1, v2
538; CHECK-NEXT:    ret
539  %load1 = load <8 x i16>, ptr %A
540  %load2 = load <8 x i16>, ptr %B
541  %tmp3 = load <4 x i32>, ptr %C
542  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
543  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
544  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
545  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
546  ret <4 x i32> %tmp5
547}
548
549define <2 x i64> @sqdmlal2_2d(ptr %A, ptr %B, ptr %C) nounwind {
550; CHECK-LABEL: sqdmlal2_2d:
551; CHECK:       // %bb.0:
552; CHECK-NEXT:    ldr q0, [x2]
553; CHECK-NEXT:    ldr d1, [x0, #8]
554; CHECK-NEXT:    ldr d2, [x1, #8]
555; CHECK-NEXT:    sqdmlal.2d v0, v1, v2
556; CHECK-NEXT:    ret
557  %load1 = load <4 x i32>, ptr %A
558  %load2 = load <4 x i32>, ptr %B
559  %tmp3 = load <2 x i64>, ptr %C
560  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
561  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
562  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
563  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
564  ret <2 x i64> %tmp5
565}
566
567define <4 x i32> @sqdmlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
568; CHECK-LABEL: sqdmlsl4s:
569; CHECK:       // %bb.0:
570; CHECK-NEXT:    ldr d1, [x0]
571; CHECK-NEXT:    ldr d2, [x1]
572; CHECK-NEXT:    ldr q0, [x2]
573; CHECK-NEXT:    sqdmlsl.4s v0, v1, v2
574; CHECK-NEXT:    ret
575  %tmp1 = load <4 x i16>, ptr %A
576  %tmp2 = load <4 x i16>, ptr %B
577  %tmp3 = load <4 x i32>, ptr %C
578  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
579  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
580  ret <4 x i32> %tmp5
581}
582
583define <2 x i64> @sqdmlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
584; CHECK-LABEL: sqdmlsl2d:
585; CHECK:       // %bb.0:
586; CHECK-NEXT:    ldr d1, [x0]
587; CHECK-NEXT:    ldr d2, [x1]
588; CHECK-NEXT:    ldr q0, [x2]
589; CHECK-NEXT:    sqdmlsl.2d v0, v1, v2
590; CHECK-NEXT:    ret
591  %tmp1 = load <2 x i32>, ptr %A
592  %tmp2 = load <2 x i32>, ptr %B
593  %tmp3 = load <2 x i64>, ptr %C
594  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
595  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
596  ret <2 x i64> %tmp5
597}
598
599define <4 x i32> @sqdmlsl2_4s(ptr %A, ptr %B, ptr %C) nounwind {
600; CHECK-LABEL: sqdmlsl2_4s:
601; CHECK:       // %bb.0:
602; CHECK-NEXT:    ldr q0, [x2]
603; CHECK-NEXT:    ldr d1, [x0, #8]
604; CHECK-NEXT:    ldr d2, [x1, #8]
605; CHECK-NEXT:    sqdmlsl.4s v0, v1, v2
606; CHECK-NEXT:    ret
607  %load1 = load <8 x i16>, ptr %A
608  %load2 = load <8 x i16>, ptr %B
609  %tmp3 = load <4 x i32>, ptr %C
610  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
611  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
612  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
613  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
614  ret <4 x i32> %tmp5
615}
616
617define <2 x i64> @sqdmlsl2_2d(ptr %A, ptr %B, ptr %C) nounwind {
618; CHECK-LABEL: sqdmlsl2_2d:
619; CHECK:       // %bb.0:
620; CHECK-NEXT:    ldr q0, [x2]
621; CHECK-NEXT:    ldr d1, [x0, #8]
622; CHECK-NEXT:    ldr d2, [x1, #8]
623; CHECK-NEXT:    sqdmlsl.2d v0, v1, v2
624; CHECK-NEXT:    ret
625  %load1 = load <4 x i32>, ptr %A
626  %load2 = load <4 x i32>, ptr %B
627  %tmp3 = load <2 x i64>, ptr %C
628  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
629  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
630  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
631  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
632  ret <2 x i64> %tmp5
633}
634
635define <4 x i32> @umlal4s(ptr %A, ptr %B, ptr %C) nounwind {
636; CHECK-LABEL: umlal4s:
637; CHECK:       // %bb.0:
638; CHECK-NEXT:    ldr d1, [x0]
639; CHECK-NEXT:    ldr d2, [x1]
640; CHECK-NEXT:    ldr q0, [x2]
641; CHECK-NEXT:    umlal.4s v0, v1, v2
642; CHECK-NEXT:    ret
643  %tmp1 = load <4 x i16>, ptr %A
644  %tmp2 = load <4 x i16>, ptr %B
645  %tmp3 = load <4 x i32>, ptr %C
646  %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
647  %tmp5 = add <4 x i32> %tmp3, %tmp4
648  ret <4 x i32> %tmp5
649}
650
651define <2 x i64> @umlal2d(ptr %A, ptr %B, ptr %C) nounwind {
652; CHECK-LABEL: umlal2d:
653; CHECK:       // %bb.0:
654; CHECK-NEXT:    ldr d1, [x0]
655; CHECK-NEXT:    ldr d2, [x1]
656; CHECK-NEXT:    ldr q0, [x2]
657; CHECK-NEXT:    umlal.2d v0, v1, v2
658; CHECK-NEXT:    ret
659  %tmp1 = load <2 x i32>, ptr %A
660  %tmp2 = load <2 x i32>, ptr %B
661  %tmp3 = load <2 x i64>, ptr %C
662  %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
663  %tmp5 = add <2 x i64> %tmp3, %tmp4
664  ret <2 x i64> %tmp5
665}
666
667define void @umlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
668; CHECK-LABEL: umlal8h_chain_with_constant:
669; CHECK:       // %bb.0:
670; CHECK-NEXT:    movi.16b v3, #1
671; CHECK-NEXT:    umlal.8h v3, v0, v2
672; CHECK-NEXT:    mvn.8b v0, v2
673; CHECK-NEXT:    umlal.8h v3, v1, v0
674; CHECK-NEXT:    str q3, [x0]
675; CHECK-NEXT:    ret
676  %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
677  %umull.1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
678  %add.1 = add <8 x i16> %umull.1, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
679  %umull.2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
680  %add.2 = add <8 x i16> %add.1, %umull.2
681  store <8 x i16> %add.2, ptr %dst
682  ret void
683}
684
685define void @umlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
686; CHECK-LABEL: umlal2d_chain_with_constant:
687; CHECK:       // %bb.0:
688; CHECK-NEXT:    mov w8, #257 // =0x101
689; CHECK-NEXT:    dup.2d v3, x8
690; CHECK-NEXT:    umlal.2d v3, v0, v2
691; CHECK-NEXT:    mvn.8b v0, v2
692; CHECK-NEXT:    umlal.2d v3, v1, v0
693; CHECK-NEXT:    str q3, [x0]
694; CHECK-NEXT:    ret
695  %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
696  %umull.1 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
697  %add.1 = add <2 x i64> %umull.1, <i64 257, i64 257>
698  %umull.2 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
699  %add.2 = add <2 x i64> %add.1, %umull.2
700  store <2 x i64> %add.2, ptr %dst
701  ret void
702}
703
704define <4 x i32> @umlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
705; CHECK-LABEL: umlsl4s:
706; CHECK:       // %bb.0:
707; CHECK-NEXT:    ldr d1, [x0]
708; CHECK-NEXT:    ldr d2, [x1]
709; CHECK-NEXT:    ldr q0, [x2]
710; CHECK-NEXT:    umlsl.4s v0, v1, v2
711; CHECK-NEXT:    ret
712  %tmp1 = load <4 x i16>, ptr %A
713  %tmp2 = load <4 x i16>, ptr %B
714  %tmp3 = load <4 x i32>, ptr %C
715  %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
716  %tmp5 = sub <4 x i32> %tmp3, %tmp4
717  ret <4 x i32> %tmp5
718}
719
720define <2 x i64> @umlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
721; CHECK-LABEL: umlsl2d:
722; CHECK:       // %bb.0:
723; CHECK-NEXT:    ldr d1, [x0]
724; CHECK-NEXT:    ldr d2, [x1]
725; CHECK-NEXT:    ldr q0, [x2]
726; CHECK-NEXT:    umlsl.2d v0, v1, v2
727; CHECK-NEXT:    ret
728  %tmp1 = load <2 x i32>, ptr %A
729  %tmp2 = load <2 x i32>, ptr %B
730  %tmp3 = load <2 x i64>, ptr %C
731  %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
732  %tmp5 = sub <2 x i64> %tmp3, %tmp4
733  ret <2 x i64> %tmp5
734}
735
736define void @umlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
737; CHECK-LABEL: umlsl8h_chain_with_constant:
738; CHECK:       // %bb.0:
739; CHECK-NEXT:    movi.16b v3, #1
740; CHECK-NEXT:    umlsl.8h v3, v0, v2
741; CHECK-NEXT:    mvn.8b v0, v2
742; CHECK-NEXT:    umlsl.8h v3, v1, v0
743; CHECK-NEXT:    str q3, [x0]
744; CHECK-NEXT:    ret
745  %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
746  %umull.1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
747  %add.1 = sub <8 x i16> <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>, %umull.1
748  %umull.2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
749  %add.2 = sub <8 x i16> %add.1, %umull.2
750  store <8 x i16> %add.2, ptr %dst
751  ret void
752}
753
754define void @umlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
755; CHECK-LABEL: umlsl2d_chain_with_constant:
756; CHECK:       // %bb.0:
757; CHECK-NEXT:    mov w8, #257 // =0x101
758; CHECK-NEXT:    dup.2d v3, x8
759; CHECK-NEXT:    umlsl.2d v3, v0, v2
760; CHECK-NEXT:    mvn.8b v0, v2
761; CHECK-NEXT:    umlsl.2d v3, v1, v0
762; CHECK-NEXT:    str q3, [x0]
763; CHECK-NEXT:    ret
764  %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
765  %umull.1 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
766  %add.1 = sub <2 x i64> <i64 257, i64 257>, %umull.1
767  %umull.2 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
768  %add.2 = sub <2 x i64> %add.1, %umull.2
769  store <2 x i64> %add.2, ptr %dst
770  ret void
771}
772
773define <2 x float> @fmla_2s(ptr %A, ptr %B, ptr %C) nounwind {
774; CHECK-LABEL: fmla_2s:
775; CHECK:       // %bb.0:
776; CHECK-NEXT:    ldr d1, [x0]
777; CHECK-NEXT:    ldr d2, [x1]
778; CHECK-NEXT:    ldr d0, [x2]
779; CHECK-NEXT:    fmla.2s v0, v2, v1
780; CHECK-NEXT:    ret
781  %tmp1 = load <2 x float>, ptr %A
782  %tmp2 = load <2 x float>, ptr %B
783  %tmp3 = load <2 x float>, ptr %C
784  %tmp4 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp2, <2 x float> %tmp3)
785  ret <2 x float> %tmp4
786}
787
788define <4 x float> @fmla_4s(ptr %A, ptr %B, ptr %C) nounwind {
789; CHECK-LABEL: fmla_4s:
790; CHECK:       // %bb.0:
791; CHECK-NEXT:    ldr q1, [x0]
792; CHECK-NEXT:    ldr q2, [x1]
793; CHECK-NEXT:    ldr q0, [x2]
794; CHECK-NEXT:    fmla.4s v0, v2, v1
795; CHECK-NEXT:    ret
796  %tmp1 = load <4 x float>, ptr %A
797  %tmp2 = load <4 x float>, ptr %B
798  %tmp3 = load <4 x float>, ptr %C
799  %tmp4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp2, <4 x float> %tmp3)
800  ret <4 x float> %tmp4
801}
802
803define <2 x double> @fmla_2d(ptr %A, ptr %B, ptr %C) nounwind {
804; CHECK-LABEL: fmla_2d:
805; CHECK:       // %bb.0:
806; CHECK-NEXT:    ldr q1, [x0]
807; CHECK-NEXT:    ldr q2, [x1]
808; CHECK-NEXT:    ldr q0, [x2]
809; CHECK-NEXT:    fmla.2d v0, v2, v1
810; CHECK-NEXT:    ret
811  %tmp1 = load <2 x double>, ptr %A
812  %tmp2 = load <2 x double>, ptr %B
813  %tmp3 = load <2 x double>, ptr %C
814  %tmp4 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp2, <2 x double> %tmp3)
815  ret <2 x double> %tmp4
816}
817
818declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
819declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
820declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
821
822define <2 x float> @fmls_2s(ptr %A, ptr %B, ptr %C) nounwind {
823; CHECK-LABEL: fmls_2s:
824; CHECK:       // %bb.0:
825; CHECK-NEXT:    ldr d1, [x0]
826; CHECK-NEXT:    ldr d2, [x1]
827; CHECK-NEXT:    ldr d0, [x2]
828; CHECK-NEXT:    fmls.2s v0, v1, v2
829; CHECK-NEXT:    ret
830  %tmp1 = load <2 x float>, ptr %A
831  %tmp2 = load <2 x float>, ptr %B
832  %tmp3 = load <2 x float>, ptr %C
833  %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
834  %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp4, <2 x float> %tmp3)
835  ret <2 x float> %tmp5
836}
837
838define <4 x float> @fmls_4s(ptr %A, ptr %B, ptr %C) nounwind {
839; CHECK-LABEL: fmls_4s:
840; CHECK:       // %bb.0:
841; CHECK-NEXT:    ldr q1, [x0]
842; CHECK-NEXT:    ldr q2, [x1]
843; CHECK-NEXT:    ldr q0, [x2]
844; CHECK-NEXT:    fmls.4s v0, v1, v2
845; CHECK-NEXT:    ret
846  %tmp1 = load <4 x float>, ptr %A
847  %tmp2 = load <4 x float>, ptr %B
848  %tmp3 = load <4 x float>, ptr %C
849  %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
850  %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp4, <4 x float> %tmp3)
851  ret <4 x float> %tmp5
852}
853
854define <2 x double> @fmls_2d(ptr %A, ptr %B, ptr %C) nounwind {
855; CHECK-LABEL: fmls_2d:
856; CHECK:       // %bb.0:
857; CHECK-NEXT:    ldr q1, [x0]
858; CHECK-NEXT:    ldr q2, [x1]
859; CHECK-NEXT:    ldr q0, [x2]
860; CHECK-NEXT:    fmls.2d v0, v1, v2
861; CHECK-NEXT:    ret
862  %tmp1 = load <2 x double>, ptr %A
863  %tmp2 = load <2 x double>, ptr %B
864  %tmp3 = load <2 x double>, ptr %C
865  %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
866  %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp4, <2 x double> %tmp3)
867  ret <2 x double> %tmp5
868}
869
870define <2 x float> @fmls_commuted_neg_2s(ptr %A, ptr %B, ptr %C) nounwind {
871; CHECK-LABEL: fmls_commuted_neg_2s:
872; CHECK:       // %bb.0:
873; CHECK-NEXT:    ldr d1, [x0]
874; CHECK-NEXT:    ldr d2, [x1]
875; CHECK-NEXT:    ldr d0, [x2]
876; CHECK-NEXT:    fmls.2s v0, v1, v2
877; CHECK-NEXT:    ret
878  %tmp1 = load <2 x float>, ptr %A
879  %tmp2 = load <2 x float>, ptr %B
880  %tmp3 = load <2 x float>, ptr %C
881  %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
882  %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp4, <2 x float> %tmp1, <2 x float> %tmp3)
883  ret <2 x float> %tmp5
884}
885
886define <4 x float> @fmls_commuted_neg_4s(ptr %A, ptr %B, ptr %C) nounwind {
887; CHECK-LABEL: fmls_commuted_neg_4s:
888; CHECK:       // %bb.0:
889; CHECK-NEXT:    ldr q1, [x0]
890; CHECK-NEXT:    ldr q2, [x1]
891; CHECK-NEXT:    ldr q0, [x2]
892; CHECK-NEXT:    fmls.4s v0, v1, v2
893; CHECK-NEXT:    ret
894  %tmp1 = load <4 x float>, ptr %A
895  %tmp2 = load <4 x float>, ptr %B
896  %tmp3 = load <4 x float>, ptr %C
897  %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
898  %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp4, <4 x float> %tmp1, <4 x float> %tmp3)
899  ret <4 x float> %tmp5
900}
901
902define <2 x double> @fmls_commuted_neg_2d(ptr %A, ptr %B, ptr %C) nounwind {
903; CHECK-LABEL: fmls_commuted_neg_2d:
904; CHECK:       // %bb.0:
905; CHECK-NEXT:    ldr q1, [x0]
906; CHECK-NEXT:    ldr q2, [x1]
907; CHECK-NEXT:    ldr q0, [x2]
908; CHECK-NEXT:    fmls.2d v0, v1, v2
909; CHECK-NEXT:    ret
910  %tmp1 = load <2 x double>, ptr %A
911  %tmp2 = load <2 x double>, ptr %B
912  %tmp3 = load <2 x double>, ptr %C
913  %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
914  %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp4, <2 x double> %tmp1, <2 x double> %tmp3)
915  ret <2 x double> %tmp5
916}
917
918define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
919; CHECK-LABEL: fmls_indexed_2s:
920; CHECK:       // %bb.0: // %entry
921; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
922; CHECK-NEXT:    fmls.2s v0, v2, v1[0]
923; CHECK-NEXT:    ret
924entry:
925  %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
926  %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
927  %fmls1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a)
928  ret <2 x float> %fmls1
929}
930
931define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
932; CHECK-LABEL: fmls_indexed_4s:
933; CHECK:       // %bb.0: // %entry
934; CHECK-NEXT:    fmls.4s v0, v2, v1[0]
935; CHECK-NEXT:    ret
936entry:
937  %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
938  %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
939  %fmls1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a)
940  ret <4 x float> %fmls1
941}
942
943define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
944; CHECK-LABEL: fmls_indexed_2d:
945; CHECK:       // %bb.0: // %entry
946; CHECK-NEXT:    fmls.2d v0, v2, v1[0]
947; CHECK-NEXT:    ret
948entry:
949  %0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
950  %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
951  %fmls1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a)
952  ret <2 x double> %fmls1
953}
954
955define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
956; CHECK-LABEL: fmla_indexed_scalar_2s:
957; CHECK:       // %bb.0: // %entry
958; CHECK-NEXT:    // kill: def $s2 killed $s2 def $d2
959; CHECK-NEXT:    fmla.2s v0, v1, v2
960; CHECK-NEXT:    ret
961entry:
962  %v1 = insertelement <2 x float> undef, float %c, i32 0
963  %v2 = insertelement <2 x float> %v1, float %c, i32 1
964  %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v1, <2 x float> %b, <2 x float> %a) nounwind
965  ret <2 x float> %fmla1
966}
967
968define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
969; CHECK-LABEL: fmla_indexed_scalar_4s:
970; CHECK:       // %bb.0: // %entry
971; CHECK-NEXT:    // kill: def $s2 killed $s2 def $q2
972; CHECK-NEXT:    fmla.4s v0, v1, v2[0]
973; CHECK-NEXT:    ret
974entry:
975  %v1 = insertelement <4 x float> undef, float %c, i32 0
976  %v2 = insertelement <4 x float> %v1, float %c, i32 1
977  %v3 = insertelement <4 x float> %v2, float %c, i32 2
978  %v4 = insertelement <4 x float> %v3, float %c, i32 3
979  %fmla1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a) nounwind
980  ret <4 x float> %fmla1
981}
982
983define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
984; CHECK-LABEL: fmla_indexed_scalar_2d:
985; CHECK:       // %bb.0: // %entry
986; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
987; CHECK-NEXT:    fmla.2d v0, v1, v2[0]
988; CHECK-NEXT:    ret
989entry:
990  %v1 = insertelement <2 x double> undef, double %c, i32 0
991  %v2 = insertelement <2 x double> %v1, double %c, i32 1
992  %fmla1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a) nounwind
993  ret <2 x double> %fmla1
994}
995
996define <2 x float> @fmls_indexed_2s_strict(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp strictfp {
997; CHECK-LABEL: fmls_indexed_2s_strict:
998; CHECK:       // %bb.0: // %entry
999; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1000; CHECK-NEXT:    fmls.2s v0, v2, v1[0]
1001; CHECK-NEXT:    ret
1002entry:
1003  %0 = fneg <2 x float> %c
1004  %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
1005  %fmls1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
1006  ret <2 x float> %fmls1
1007}
1008
1009define <4 x float> @fmls_indexed_4s_strict(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp strictfp {
1010; CHECK-LABEL: fmls_indexed_4s_strict:
1011; CHECK:       // %bb.0: // %entry
1012; CHECK-NEXT:    fmls.4s v0, v2, v1[0]
1013; CHECK-NEXT:    ret
1014entry:
1015  %0 = fneg <4 x float> %c
1016  %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
1017  %fmls1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
1018  ret <4 x float> %fmls1
1019}
1020
1021define <2 x double> @fmls_indexed_2d_strict(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp strictfp {
1022; CHECK-LABEL: fmls_indexed_2d_strict:
1023; CHECK:       // %bb.0: // %entry
1024; CHECK-NEXT:    fmls.2d v0, v2, v1[0]
1025; CHECK-NEXT:    ret
1026entry:
1027  %0 = fneg <2 x double> %c
1028  %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
1029  %fmls1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
1030  ret <2 x double> %fmls1
1031}
1032
1033define <2 x float> @fmla_indexed_scalar_2s_strict(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp strictfp {
1034; CHECK-LABEL: fmla_indexed_scalar_2s_strict:
1035; CHECK:       // %bb.0: // %entry
1036; CHECK-NEXT:    // kill: def $s2 killed $s2 def $q2
1037; CHECK-NEXT:    fmla.2s v0, v1, v2[0]
1038; CHECK-NEXT:    ret
1039entry:
1040  %v1 = insertelement <2 x float> undef, float %c, i32 0
1041  %v2 = insertelement <2 x float> %v1, float %c, i32 1
1042  %fmla1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
1043  ret <2 x float> %fmla1
1044}
1045
1046define <4 x float> @fmla_indexed_scalar_4s_strict(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp strictfp {
1047; CHECK-LABEL: fmla_indexed_scalar_4s_strict:
1048; CHECK:       // %bb.0: // %entry
1049; CHECK-NEXT:    // kill: def $s2 killed $s2 def $q2
1050; CHECK-NEXT:    fmla.4s v0, v1, v2[0]
1051; CHECK-NEXT:    ret
1052entry:
1053  %v1 = insertelement <4 x float> undef, float %c, i32 0
1054  %v2 = insertelement <4 x float> %v1, float %c, i32 1
1055  %v3 = insertelement <4 x float> %v2, float %c, i32 2
1056  %v4 = insertelement <4 x float> %v3, float %c, i32 3
1057  %fmla1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
1058  ret <4 x float> %fmla1
1059}
1060
1061define <2 x double> @fmla_indexed_scalar_2d_strict(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp strictfp {
1062; CHECK-LABEL: fmla_indexed_scalar_2d_strict:
1063; CHECK:       // %bb.0: // %entry
1064; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1065; CHECK-NEXT:    fmla.2d v0, v1, v2[0]
1066; CHECK-NEXT:    ret
1067entry:
1068  %v1 = insertelement <2 x double> undef, double %c, i32 0
1069  %v2 = insertelement <2 x double> %v1, double %c, i32 1
1070  %fmla1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
1071  ret <2 x double> %fmla1
1072}
1073
1074attributes #0 = { strictfp }
1075
1076declare <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float>, <2 x float>, <2 x float>, metadata, metadata)
1077declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
1078declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
1079
1080define <4 x i16> @mul_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
1081; CHECK-LABEL: mul_4h:
1082; CHECK:       // %bb.0:
1083; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1084; CHECK-NEXT:    mul.4h v0, v0, v1[1]
1085; CHECK-NEXT:    ret
1086  %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1087  %tmp4 = mul <4 x i16> %A, %tmp3
1088  ret <4 x i16> %tmp4
1089}
1090
1091define <8 x i16> @mul_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
1092; CHECK-LABEL: mul_8h:
1093; CHECK:       // %bb.0:
1094; CHECK-NEXT:    mul.8h v0, v0, v1[1]
1095; CHECK-NEXT:    ret
1096  %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1097  %tmp4 = mul <8 x i16> %A, %tmp3
1098  ret <8 x i16> %tmp4
1099}
1100
1101define <2 x i32> @mul_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
1102; CHECK-LABEL: mul_2s:
1103; CHECK:       // %bb.0:
1104; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1105; CHECK-NEXT:    mul.2s v0, v0, v1[1]
1106; CHECK-NEXT:    ret
1107  %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1108  %tmp4 = mul <2 x i32> %A, %tmp3
1109  ret <2 x i32> %tmp4
1110}
1111
1112define <4 x i32> @mul_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
1113; CHECK-LABEL: mul_4s:
1114; CHECK:       // %bb.0:
1115; CHECK-NEXT:    mul.4s v0, v0, v1[1]
1116; CHECK-NEXT:    ret
1117  %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1118  %tmp4 = mul <4 x i32> %A, %tmp3
1119  ret <4 x i32> %tmp4
1120}
1121
1122define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
1123; CHECK-LABEL: mul_2d:
1124; CHECK:       // %bb.0:
1125; CHECK-NEXT:    fmov x10, d1
1126; CHECK-NEXT:    fmov x11, d0
1127; CHECK-NEXT:    mov.d x8, v1[1]
1128; CHECK-NEXT:    mov.d x9, v0[1]
1129; CHECK-NEXT:    mul x10, x11, x10
1130; CHECK-NEXT:    mul x8, x9, x8
1131; CHECK-NEXT:    fmov d0, x10
1132; CHECK-NEXT:    mov.d v0[1], x8
1133; CHECK-NEXT:    ret
1134  %tmp1 = mul <2 x i64> %A, %B
1135  ret <2 x i64> %tmp1
1136}
1137
1138define <2 x float> @fmul_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {
1139; CHECK-LABEL: fmul_lane_2s:
1140; CHECK:       // %bb.0:
1141; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1142; CHECK-NEXT:    fmul.2s v0, v0, v1[1]
1143; CHECK-NEXT:    ret
1144  %tmp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1>
1145  %tmp4 = fmul <2 x float> %A, %tmp3
1146  ret <2 x float> %tmp4
1147}
1148
1149define <4 x float> @fmul_lane_4s(<4 x float> %A, <4 x float> %B) nounwind {
1150; CHECK-LABEL: fmul_lane_4s:
1151; CHECK:       // %bb.0:
1152; CHECK-NEXT:    fmul.4s v0, v0, v1[1]
1153; CHECK-NEXT:    ret
1154  %tmp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1155  %tmp4 = fmul <4 x float> %A, %tmp3
1156  ret <4 x float> %tmp4
1157}
1158
1159define <2 x double> @fmul_lane_2d(<2 x double> %A, <2 x double> %B) nounwind {
1160; CHECK-LABEL: fmul_lane_2d:
1161; CHECK:       // %bb.0:
1162; CHECK-NEXT:    fmul.2d v0, v0, v1[1]
1163; CHECK-NEXT:    ret
1164  %tmp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1>
1165  %tmp4 = fmul <2 x double> %A, %tmp3
1166  ret <2 x double> %tmp4
1167}
1168
1169define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind {
1170; CHECK-LABEL: fmul_lane_s:
1171; CHECK:       // %bb.0:
1172; CHECK-NEXT:    fmul.s s0, s0, v1[3]
1173; CHECK-NEXT:    ret
1174  %B = extractelement <4 x float> %vec, i32 3
1175  %res = fmul float %A, %B
1176  ret float %res
1177}
1178
1179define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
1180; CHECK-LABEL: fmul_lane_d:
1181; CHECK:       // %bb.0:
1182; CHECK-NEXT:    fmul.d d0, d0, v1[1]
1183; CHECK-NEXT:    ret
1184  %B = extractelement <2 x double> %vec, i32 1
1185  %res = fmul double %A, %B
1186  ret double %res
1187}
1188
1189
1190
1191define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {
1192; CHECK-LABEL: fmulx_lane_2s:
1193; CHECK:       // %bb.0:
1194; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1195; CHECK-NEXT:    fmulx.2s v0, v0, v1[1]
1196; CHECK-NEXT:    ret
1197  %tmp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1>
1198  %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %A, <2 x float> %tmp3)
1199  ret <2 x float> %tmp4
1200}
1201
1202define <4 x float> @fmulx_lane_4s(<4 x float> %A, <4 x float> %B) nounwind {
1203; CHECK-LABEL: fmulx_lane_4s:
1204; CHECK:       // %bb.0:
1205; CHECK-NEXT:    fmulx.4s v0, v0, v1[1]
1206; CHECK-NEXT:    ret
1207  %tmp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1208  %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %A, <4 x float> %tmp3)
1209  ret <4 x float> %tmp4
1210}
1211
1212define <2 x double> @fmulx_lane_2d(<2 x double> %A, <2 x double> %B) nounwind {
1213; CHECK-LABEL: fmulx_lane_2d:
1214; CHECK:       // %bb.0:
1215; CHECK-NEXT:    fmulx.2d v0, v0, v1[1]
1216; CHECK-NEXT:    ret
1217  %tmp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1>
1218  %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %A, <2 x double> %tmp3)
1219  ret <2 x double> %tmp4
1220}
1221
1222define <4 x i16> @sqdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
1223; CHECK-LABEL: sqdmulh_lane_4h:
1224; CHECK:       // %bb.0:
1225; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1226; CHECK-NEXT:    sqdmulh.4h v0, v0, v1[1]
1227; CHECK-NEXT:    ret
1228  %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1229  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %A, <4 x i16> %tmp3)
1230  ret <4 x i16> %tmp4
1231}
1232
1233define <8 x i16> @sqdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
1234; CHECK-LABEL: sqdmulh_lane_8h:
1235; CHECK:       // %bb.0:
1236; CHECK-NEXT:    sqdmulh.8h v0, v0, v1[1]
1237; CHECK-NEXT:    ret
1238  %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1239  %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %A, <8 x i16> %tmp3)
1240  ret <8 x i16> %tmp4
1241}
1242
1243define <2 x i32> @sqdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
1244; CHECK-LABEL: sqdmulh_lane_2s:
1245; CHECK:       // %bb.0:
1246; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1247; CHECK-NEXT:    sqdmulh.2s v0, v0, v1[1]
1248; CHECK-NEXT:    ret
1249  %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1250  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %A, <2 x i32> %tmp3)
1251  ret <2 x i32> %tmp4
1252}
1253
1254define <4 x i32> @sqdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
1255; CHECK-LABEL: sqdmulh_lane_4s:
1256; CHECK:       // %bb.0:
1257; CHECK-NEXT:    sqdmulh.4s v0, v0, v1[1]
1258; CHECK-NEXT:    ret
1259  %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1260  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %A, <4 x i32> %tmp3)
1261  ret <4 x i32> %tmp4
1262}
1263
1264define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
1265; CHECK-LABEL: sqdmulh_lane_1s:
1266; CHECK:       // %bb.0:
1267; CHECK-NEXT:    fmov s1, w0
1268; CHECK-NEXT:    sqdmulh.s s0, s1, v0[1]
1269; CHECK-NEXT:    fmov w0, s0
1270; CHECK-NEXT:    ret
1271  %tmp1 = extractelement <4 x i32> %B, i32 1
1272  %tmp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %tmp1)
1273  ret i32 %tmp2
1274}
1275
1276define <4 x i16> @sqrdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
1277; CHECK-LABEL: sqrdmulh_lane_4h:
1278; CHECK:       // %bb.0:
1279; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1280; CHECK-NEXT:    sqrdmulh.4h v0, v0, v1[1]
1281; CHECK-NEXT:    ret
1282  %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1283  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %A, <4 x i16> %tmp3)
1284  ret <4 x i16> %tmp4
1285}
1286
1287define <8 x i16> @sqrdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
1288; CHECK-LABEL: sqrdmulh_lane_8h:
1289; CHECK:       // %bb.0:
1290; CHECK-NEXT:    sqrdmulh.8h v0, v0, v1[1]
1291; CHECK-NEXT:    ret
1292  %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1293  %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %A, <8 x i16> %tmp3)
1294  ret <8 x i16> %tmp4
1295}
1296
1297define <2 x i32> @sqrdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
1298; CHECK-LABEL: sqrdmulh_lane_2s:
1299; CHECK:       // %bb.0:
1300; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1301; CHECK-NEXT:    sqrdmulh.2s v0, v0, v1[1]
1302; CHECK-NEXT:    ret
1303  %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1304  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %A, <2 x i32> %tmp3)
1305  ret <2 x i32> %tmp4
1306}
1307
1308define <4 x i32> @sqrdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
1309; CHECK-LABEL: sqrdmulh_lane_4s:
1310; CHECK:       // %bb.0:
1311; CHECK-NEXT:    sqrdmulh.4s v0, v0, v1[1]
1312; CHECK-NEXT:    ret
1313  %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1314  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %A, <4 x i32> %tmp3)
1315  ret <4 x i32> %tmp4
1316}
1317
1318define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
1319; CHECK-LABEL: sqrdmulh_lane_1s:
1320; CHECK:       // %bb.0:
1321; CHECK-NEXT:    fmov s1, w0
1322; CHECK-NEXT:    sqrdmulh.s s0, s1, v0[1]
1323; CHECK-NEXT:    fmov w0, s0
1324; CHECK-NEXT:    ret
1325  %tmp1 = extractelement <4 x i32> %B, i32 1
1326  %tmp2 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1)
1327  ret i32 %tmp2
1328}
1329
1330define <4 x i32> @sqdmull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
1331; CHECK-LABEL: sqdmull_lane_4s:
1332; CHECK:       // %bb.0:
1333; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1334; CHECK-NEXT:    sqdmull.4s v0, v0, v1[1]
1335; CHECK-NEXT:    ret
1336  %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1337  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp3)
1338  ret <4 x i32> %tmp4
1339}
1340
1341define <2 x i64> @sqdmull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
1342; CHECK-LABEL: sqdmull_lane_2d:
1343; CHECK:       // %bb.0:
1344; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1345; CHECK-NEXT:    sqdmull.2d v0, v0, v1[1]
1346; CHECK-NEXT:    ret
1347  %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1348  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp3)
1349  ret <2 x i64> %tmp4
1350}
1351
1352define <4 x i32> @sqdmull2_lane_4s(<8 x i16> %A, <8 x i16> %B) nounwind {
1353; CHECK-LABEL: sqdmull2_lane_4s:
1354; CHECK:       // %bb.0:
1355; CHECK-NEXT:    sqdmull2.4s v0, v0, v1[1]
1356; CHECK-NEXT:    ret
1357  %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1358  %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1359  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
1360  ret <4 x i32> %tmp4
1361}
1362
1363define <2 x i64> @sqdmull2_lane_2d(<4 x i32> %A, <4 x i32> %B) nounwind {
1364; CHECK-LABEL: sqdmull2_lane_2d:
1365; CHECK:       // %bb.0:
1366; CHECK-NEXT:    sqdmull2.2d v0, v0, v1[1]
1367; CHECK-NEXT:    ret
1368  %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1369  %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
1370  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
1371  ret <2 x i64> %tmp4
1372}
1373
1374define <4 x i32> @umull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
1375; CHECK-LABEL: umull_lane_4s:
1376; CHECK:       // %bb.0:
1377; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1378; CHECK-NEXT:    umull.4s v0, v0, v1[1]
1379; CHECK-NEXT:    ret
1380  %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1381  %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp3)
1382  ret <4 x i32> %tmp4
1383}
1384
1385define <2 x i64> @umull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
1386; CHECK-LABEL: umull_lane_2d:
1387; CHECK:       // %bb.0:
1388; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1389; CHECK-NEXT:    umull.2d v0, v0, v1[1]
1390; CHECK-NEXT:    ret
1391  %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1392  %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp3)
1393  ret <2 x i64> %tmp4
1394}
1395
1396define <4 x i32> @smull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
1397; CHECK-LABEL: smull_lane_4s:
1398; CHECK:       // %bb.0:
1399; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1400; CHECK-NEXT:    smull.4s v0, v0, v1[1]
1401; CHECK-NEXT:    ret
1402  %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1403  %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp3)
1404  ret <4 x i32> %tmp4
1405}
1406
1407define <2 x i64> @smull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
1408; CHECK-LABEL: smull_lane_2d:
1409; CHECK:       // %bb.0:
1410; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1411; CHECK-NEXT:    smull.2d v0, v0, v1[1]
1412; CHECK-NEXT:    ret
1413  %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1414  %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp3)
1415  ret <2 x i64> %tmp4
1416}
1417
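; Lane-indexed widening multiply-accumulate: smlal, sqdmlal and sqdmlal2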
1418define <4 x i32> @smlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
1419; CHECK-LABEL: smlal_lane_4s:
1420; CHECK:       // %bb.0:
1421; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1422; CHECK-NEXT:    smlal.4s v2, v0, v1[1]
1423; CHECK-NEXT:    mov.16b v0, v2
1424; CHECK-NEXT:    ret
1425  %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1426  %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
1427  %tmp6 = add <4 x i32> %C, %tmp5
1428  ret <4 x i32> %tmp6
1429}
1430
1431define <2 x i64> @smlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
1432; CHECK-LABEL: smlal_lane_2d:
1433; CHECK:       // %bb.0:
1434; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1435; CHECK-NEXT:    smlal.2d v2, v0, v1[1]
1436; CHECK-NEXT:    mov.16b v0, v2
1437; CHECK-NEXT:    ret
1438  %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1439  %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
1440  %tmp6 = add <2 x i64> %C, %tmp5
1441  ret <2 x i64> %tmp6
1442}
1443
1444define <4 x i32> @sqdmlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
1445; CHECK-LABEL: sqdmlal_lane_4s:
1446; CHECK:       // %bb.0:
1447; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1448; CHECK-NEXT:    sqdmlal.4s v2, v0, v1[1]
1449; CHECK-NEXT:    mov.16b v0, v2
1450; CHECK-NEXT:    ret
1451  %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1452  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
1453  %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
1454  ret <4 x i32> %tmp6
1455}
1456
1457define <2 x i64> @sqdmlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
1458; CHECK-LABEL: sqdmlal_lane_2d:
1459; CHECK:       // %bb.0:
1460; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1461; CHECK-NEXT:    sqdmlal.2d v2, v0, v1[1]
1462; CHECK-NEXT:    mov.16b v0, v2
1463; CHECK-NEXT:    ret
1464  %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1465  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
1466  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
1467  ret <2 x i64> %tmp6
1468}
1469
1470define <4 x i32> @sqdmlal2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind {
1471; CHECK-LABEL: sqdmlal2_lane_4s:
1472; CHECK:       // %bb.0:
1473; CHECK-NEXT:    sqdmlal2.4s v2, v0, v1[1]
1474; CHECK-NEXT:    mov.16b v0, v2
1475; CHECK-NEXT:    ret
1476  %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1477  %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1478  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
1479  %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
1480  ret <4 x i32> %tmp6
1481}
1482
1483define <2 x i64> @sqdmlal2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind {
1484; CHECK-LABEL: sqdmlal2_lane_2d:
1485; CHECK:       // %bb.0:
1486; CHECK-NEXT:    sqdmlal2.2d v2, v0, v1[1]
1487; CHECK-NEXT:    mov.16b v0, v2
1488; CHECK-NEXT:    ret
1489  %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1490  %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
1491  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
1492  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
1493  ret <2 x i64> %tmp6
1494}
1495
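; Scalar lane-indexed sqdmlal/sqdmlsl (i32 and i64), plus sqadd/sqsub of an extracted sqdmull lane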
1496define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
1497; CHECK-LABEL: sqdmlal_lane_1s:
1498; CHECK:       // %bb.0:
1499; CHECK-NEXT:    fmov s1, w1
1500; CHECK-NEXT:    fmov s2, w0
1501; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
1502; CHECK-NEXT:    sqdmlal.h s2, h1, v0[1]
1503; CHECK-NEXT:    fmov w0, s2
1504; CHECK-NEXT:    ret
1505  %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
1506  %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1507  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
1508  %prod = extractelement <4 x i32> %prod.vec, i32 0
1509  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
1510  ret i32 %res
1511}
1512declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
1513
1514define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
1515; CHECK-LABEL: sqdmlsl_lane_1s:
1516; CHECK:       // %bb.0:
1517; CHECK-NEXT:    fmov s1, w1
1518; CHECK-NEXT:    fmov s2, w0
1519; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
1520; CHECK-NEXT:    sqdmlsl.h s2, h1, v0[1]
1521; CHECK-NEXT:    fmov w0, s2
1522; CHECK-NEXT:    ret
1523  %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
1524  %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1525  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
1526  %prod = extractelement <4 x i32> %prod.vec, i32 0
1527  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
1528  ret i32 %res
1529}
1530declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
1531
1532define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
1533; CHECK-LABEL: sqadd_lane1_sqdmull4s:
1534; CHECK:       // %bb.0:
1535; CHECK-NEXT:    sqdmull.4s v0, v0, v1
1536; CHECK-NEXT:    mov.s w8, v0[1]
1537; CHECK-NEXT:    fmov s0, w0
1538; CHECK-NEXT:    fmov s1, w8
1539; CHECK-NEXT:    sqadd s0, s0, s1
1540; CHECK-NEXT:    fmov w0, s0
1541; CHECK-NEXT:    ret
1542  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
1543  %prod = extractelement <4 x i32> %prod.vec, i32 1
1544  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
1545  ret i32 %res
1546}
1547
1548define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
1549; CHECK-LABEL: sqsub_lane1_sqdmull4s:
1550; CHECK:       // %bb.0:
1551; CHECK-NEXT:    sqdmull.4s v0, v0, v1
1552; CHECK-NEXT:    mov.s w8, v0[1]
1553; CHECK-NEXT:    fmov s0, w0
1554; CHECK-NEXT:    fmov s1, w8
1555; CHECK-NEXT:    sqsub s0, s0, s1
1556; CHECK-NEXT:    fmov w0, s0
1557; CHECK-NEXT:    ret
1558  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
1559  %prod = extractelement <4 x i32> %prod.vec, i32 1
1560  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
1561  ret i32 %res
1562}
1563
1564define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
1565; CHECK-LABEL: sqdmlal_lane_1d:
1566; CHECK:       // %bb.0:
1567; CHECK-NEXT:    fmov d1, x0
1568; CHECK-NEXT:    fmov s2, w1
1569; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
1570; CHECK-NEXT:    sqdmlal.s d1, s2, v0[1]
1571; CHECK-NEXT:    fmov x0, d1
1572; CHECK-NEXT:    ret
1573  %rhs = extractelement <2 x i32> %C, i32 1
1574  %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
1575  %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod)
1576  ret i64 %res
1577}
1578declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
1579declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
1580
1581define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
1582; CHECK-LABEL: sqdmlsl_lane_1d:
1583; CHECK:       // %bb.0:
1584; CHECK-NEXT:    fmov d1, x0
1585; CHECK-NEXT:    fmov s2, w1
1586; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
1587; CHECK-NEXT:    sqdmlsl.s d1, s2, v0[1]
1588; CHECK-NEXT:    fmov x0, d1
1589; CHECK-NEXT:    ret
1590  %rhs = extractelement <2 x i32> %C, i32 1
1591  %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
1592  %res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod)
1593  ret i64 %res
1594}
1595declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
1596
1597
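; Lane-indexed umlal, smlsl, sqdmlsl/sqdmlsl2 and umlsl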
1598define <4 x i32> @umlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
1599; CHECK-LABEL: umlal_lane_4s:
1600; CHECK:       // %bb.0:
1601; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1602; CHECK-NEXT:    umlal.4s v2, v0, v1[1]
1603; CHECK-NEXT:    mov.16b v0, v2
1604; CHECK-NEXT:    ret
1605  %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1606  %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
1607  %tmp6 = add <4 x i32> %C, %tmp5
1608  ret <4 x i32> %tmp6
1609}
1610
1611define <2 x i64> @umlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
1612; CHECK-LABEL: umlal_lane_2d:
1613; CHECK:       // %bb.0:
1614; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1615; CHECK-NEXT:    umlal.2d v2, v0, v1[1]
1616; CHECK-NEXT:    mov.16b v0, v2
1617; CHECK-NEXT:    ret
1618  %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1619  %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
1620  %tmp6 = add <2 x i64> %C, %tmp5
1621  ret <2 x i64> %tmp6
1622}
1623
1624
1625define <4 x i32> @smlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
1626; CHECK-LABEL: smlsl_lane_4s:
1627; CHECK:       // %bb.0:
1628; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1629; CHECK-NEXT:    smlsl.4s v2, v0, v1[1]
1630; CHECK-NEXT:    mov.16b v0, v2
1631; CHECK-NEXT:    ret
1632  %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1633  %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
1634  %tmp6 = sub <4 x i32> %C, %tmp5
1635  ret <4 x i32> %tmp6
1636}
1637
1638define <2 x i64> @smlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
1639; CHECK-LABEL: smlsl_lane_2d:
1640; CHECK:       // %bb.0:
1641; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1642; CHECK-NEXT:    smlsl.2d v2, v0, v1[1]
1643; CHECK-NEXT:    mov.16b v0, v2
1644; CHECK-NEXT:    ret
1645  %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1646  %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
1647  %tmp6 = sub <2 x i64> %C, %tmp5
1648  ret <2 x i64> %tmp6
1649}
1650
1651define <4 x i32> @sqdmlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
1652; CHECK-LABEL: sqdmlsl_lane_4s:
1653; CHECK:       // %bb.0:
1654; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1655; CHECK-NEXT:    sqdmlsl.4s v2, v0, v1[1]
1656; CHECK-NEXT:    mov.16b v0, v2
1657; CHECK-NEXT:    ret
1658  %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1659  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
1660  %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
1661  ret <4 x i32> %tmp6
1662}
1663
1664define <2 x i64> @sqdmlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
1665; CHECK-LABEL: sqdmlsl_lane_2d:
1666; CHECK:       // %bb.0:
1667; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1668; CHECK-NEXT:    sqdmlsl.2d v2, v0, v1[1]
1669; CHECK-NEXT:    mov.16b v0, v2
1670; CHECK-NEXT:    ret
1671  %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1672  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
1673  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
1674  ret <2 x i64> %tmp6
1675}
1676
1677define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind {
1678; CHECK-LABEL: sqdmlsl2_lane_4s:
1679; CHECK:       // %bb.0:
1680; CHECK-NEXT:    sqdmlsl2.4s v2, v0, v1[1]
1681; CHECK-NEXT:    mov.16b v0, v2
1682; CHECK-NEXT:    ret
1683  %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1684  %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1685  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
1686  %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
1687  ret <4 x i32> %tmp6
1688}
1689
1690define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind {
1691; CHECK-LABEL: sqdmlsl2_lane_2d:
1692; CHECK:       // %bb.0:
1693; CHECK-NEXT:    sqdmlsl2.2d v2, v0, v1[1]
1694; CHECK-NEXT:    mov.16b v0, v2
1695; CHECK-NEXT:    ret
1696  %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1697  %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
1698  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
1699  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
1700  ret <2 x i64> %tmp6
1701}
1702
1703define <4 x i32> @umlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
1704; CHECK-LABEL: umlsl_lane_4s:
1705; CHECK:       // %bb.0:
1706; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1707; CHECK-NEXT:    umlsl.4s v2, v0, v1[1]
1708; CHECK-NEXT:    mov.16b v0, v2
1709; CHECK-NEXT:    ret
1710  %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1711  %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
1712  %tmp6 = sub <4 x i32> %C, %tmp5
1713  ret <4 x i32> %tmp6
1714}
1715
1716define <2 x i64> @umlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
1717; CHECK-LABEL: umlsl_lane_2d:
1718; CHECK:       // %bb.0:
1719; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
1720; CHECK-NEXT:    umlsl.2d v2, v0, v1[1]
1721; CHECK-NEXT:    mov.16b v0, v2
1722; CHECK-NEXT:    ret
1723  %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
1724  %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
1725  %tmp6 = sub <2 x i64> %C, %tmp5
1726  ret <2 x i64> %tmp6
1727}
1728
1729; Scalar FMULX
1730define float @fmulxs(float %a, float %b) nounwind {
1731; CHECK-LABEL: fmulxs:
1732; CHECK:       // %bb.0:
1733; CHECK-NEXT:    fmulx s0, s0, s1
1734; CHECK-NEXT:    ret
1735  %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
1736  ret float %fmulx.i
1737}
1738
1739define double @fmulxd(double %a, double %b) nounwind {
1740; CHECK-LABEL: fmulxd:
1741; CHECK:       // %bb.0:
1742; CHECK-NEXT:    fmulx d0, d0, d1
1743; CHECK-NEXT:    ret
1744  %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
1745  ret double %fmulx.i
1746}
1747
1748define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
1749; CHECK-LABEL: fmulxs_lane:
1750; CHECK:       // %bb.0:
1751; CHECK-NEXT:    fmulx.s s0, s0, v1[3]
1752; CHECK-NEXT:    ret
1753  %b = extractelement <4 x float> %vec, i32 3
1754  %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
1755  ret float %fmulx.i
1756}
1757
1758define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
1759; CHECK-LABEL: fmulxd_lane:
1760; CHECK:       // %bb.0:
1761; CHECK-NEXT:    fmulx.d d0, d0, v1[1]
1762; CHECK-NEXT:    ret
1763  %b = extractelement <2 x double> %vec, i32 1
1764  %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
1765  ret double %fmulx.i
1766}
1767
1768declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
1769declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone
1770
1771
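; [su]mull2 of high halves expressed via shufflevector or i64 bitcasts
; (the foo*a variants use the low half instead; foo6-foo9a also take a lane-indexed operand)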
1772define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
1773; CHECK-LABEL: smull2_8h_simple:
1774; CHECK:       // %bb.0:
1775; CHECK-NEXT:    smull2.8h v0, v0, v1
1776; CHECK-NEXT:    ret
1777  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1778  %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1779  %3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
1780  ret <8 x i16> %3
1781}
1782
1783define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind {
1784; CHECK-LABEL: foo0:
1785; CHECK:       // %bb.0:
1786; CHECK-NEXT:    smull2.8h v0, v0, v1
1787; CHECK-NEXT:    ret
1788  %tmp = bitcast <16 x i8> %a to <2 x i64>
1789  %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1790  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
1791  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
1792  %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1793  %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
1794  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1795  ret <8 x i16> %vmull.i.i
1796}
1797
1798define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind {
1799; CHECK-LABEL: foo1:
1800; CHECK:       // %bb.0:
1801; CHECK-NEXT:    smull2.4s v0, v0, v1
1802; CHECK-NEXT:    ret
1803  %tmp = bitcast <8 x i16> %a to <2 x i64>
1804  %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1805  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1806  %tmp2 = bitcast <8 x i16> %b to <2 x i64>
1807  %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1808  %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1809  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1810  ret <4 x i32> %vmull2.i.i
1811}
1812
1813define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind {
1814; CHECK-LABEL: foo2:
1815; CHECK:       // %bb.0:
1816; CHECK-NEXT:    smull2.2d v0, v0, v1
1817; CHECK-NEXT:    ret
1818  %tmp = bitcast <4 x i32> %a to <2 x i64>
1819  %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1820  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
1821  %tmp2 = bitcast <4 x i32> %b to <2 x i64>
1822  %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1823  %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
1824  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1825  ret <2 x i64> %vmull2.i.i
1826}
1827
1828define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind {
1829; CHECK-LABEL: foo3:
1830; CHECK:       // %bb.0:
1831; CHECK-NEXT:    umull2.8h v0, v0, v1
1832; CHECK-NEXT:    ret
1833  %tmp = bitcast <16 x i8> %a to <2 x i64>
1834  %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1835  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
1836  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
1837  %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1838  %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
1839  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
1840  ret <8 x i16> %vmull.i.i
1841}
1842
1843define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind {
1844; CHECK-LABEL: foo4:
1845; CHECK:       // %bb.0:
1846; CHECK-NEXT:    umull2.4s v0, v0, v1
1847; CHECK-NEXT:    ret
1848  %tmp = bitcast <8 x i16> %a to <2 x i64>
1849  %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1850  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
1851  %tmp2 = bitcast <8 x i16> %b to <2 x i64>
1852  %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1853  %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
1854  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
1855  ret <4 x i32> %vmull2.i.i
1856}
1857
1858define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind {
1859; CHECK-LABEL: foo5:
1860; CHECK:       // %bb.0:
1861; CHECK-NEXT:    umull2.2d v0, v0, v1
1862; CHECK-NEXT:    ret
1863  %tmp = bitcast <4 x i32> %a to <2 x i64>
1864  %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
1865  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
1866  %tmp2 = bitcast <4 x i32> %b to <2 x i64>
1867  %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
1868  %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
1869  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
1870  ret <2 x i64> %vmull2.i.i
1871}
1872
1873define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1874; CHECK-LABEL: foo6:
1875; CHECK:       // %bb.0: // %entry
1876; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1877; CHECK-NEXT:    smull2.4s v0, v1, v2[1]
1878; CHECK-NEXT:    ret
1879entry:
1880  %0 = bitcast <8 x i16> %b to <2 x i64>
1881  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1882  %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1883  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1884  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1885  ret <4 x i32> %vmull2.i
1886}
1887
1888define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1889; CHECK-LABEL: foo6a:
1890; CHECK:       // %bb.0: // %entry
1891; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1892; CHECK-NEXT:    smull.4s v0, v1, v2[1]
1893; CHECK-NEXT:    ret
1894entry:
1895  %0 = bitcast <8 x i16> %b to <2 x i64>
1896  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1897  %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1898  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1899  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1900  ret <4 x i32> %vmull2.i
1901}
1902
1903define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1904; CHECK-LABEL: foo7:
1905; CHECK:       // %bb.0: // %entry
1906; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1907; CHECK-NEXT:    smull2.2d v0, v1, v2[1]
1908; CHECK-NEXT:    ret
1909entry:
1910  %0 = bitcast <4 x i32> %b to <2 x i64>
1911  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1912  %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1913  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1914  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1915  ret <2 x i64> %vmull2.i
1916}
1917
1918define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1919; CHECK-LABEL: foo7a:
1920; CHECK:       // %bb.0: // %entry
1921; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1922; CHECK-NEXT:    smull.2d v0, v1, v2[1]
1923; CHECK-NEXT:    ret
1924entry:
1925  %0 = bitcast <4 x i32> %b to <2 x i64>
1926  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1927  %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1928  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1929  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1930  ret <2 x i64> %vmull2.i
1931}
1932
1933
1934define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1935; CHECK-LABEL: foo8:
1936; CHECK:       // %bb.0: // %entry
1937; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1938; CHECK-NEXT:    umull2.4s v0, v1, v2[1]
1939; CHECK-NEXT:    ret
1940entry:
1941  %0 = bitcast <8 x i16> %b to <2 x i64>
1942  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1943  %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1944  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1945  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1946  ret <4 x i32> %vmull2.i
1947}
1948
1949define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
1950; CHECK-LABEL: foo8a:
1951; CHECK:       // %bb.0: // %entry
1952; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1953; CHECK-NEXT:    umull.4s v0, v1, v2[1]
1954; CHECK-NEXT:    ret
1955entry:
1956  %0 = bitcast <8 x i16> %b to <2 x i64>
1957  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1958  %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
1959  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1960  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
1961  ret <4 x i32> %vmull2.i
1962}
1963
1964define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1965; CHECK-LABEL: foo9:
1966; CHECK:       // %bb.0: // %entry
1967; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1968; CHECK-NEXT:    umull2.2d v0, v1, v2[1]
1969; CHECK-NEXT:    ret
1970entry:
1971  %0 = bitcast <4 x i32> %b to <2 x i64>
1972  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
1973  %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1974  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1975  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1976  ret <2 x i64> %vmull2.i
1977}
1978
1979define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
1980; CHECK-LABEL: foo9a:
1981; CHECK:       // %bb.0: // %entry
1982; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
1983; CHECK-NEXT:    umull.2d v0, v1, v2[1]
1984; CHECK-NEXT:    ret
1985entry:
1986  %0 = bitcast <4 x i32> %b to <2 x i64>
1987  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
1988  %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
1989  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1990  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
1991  ret <2 x i64> %vmull2.i
1992}
1993
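; smlal2/umlal2 with the high halves taken via i64 bitcasts; the mlal2_* tests add a lane-indexed operand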
1994define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
1995; CHECK-LABEL: bar0:
1996; CHECK:       // %bb.0:
1997; CHECK-NEXT:    smlal2.8h v0, v1, v2
1998; CHECK-NEXT:    ret
1999  %tmp = bitcast <16 x i8> %b to <2 x i64>
2000  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2001  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
2002  %tmp2 = bitcast <16 x i8> %c to <2 x i64>
2003  %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2004  %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
2005  %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
2006  %add.i = add <8 x i16> %vmull.i.i.i, %a
2007  ret <8 x i16> %add.i
2008}
2009
2010define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
2011; CHECK-LABEL: bar1:
2012; CHECK:       // %bb.0:
2013; CHECK-NEXT:    smlal2.4s v0, v1, v2
2014; CHECK-NEXT:    ret
2015  %tmp = bitcast <8 x i16> %b to <2 x i64>
2016  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2017  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
2018  %tmp2 = bitcast <8 x i16> %c to <2 x i64>
2019  %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2020  %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
2021  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
2022  %add.i = add <4 x i32> %vmull2.i.i.i, %a
2023  ret <4 x i32> %add.i
2024}
2025
2026define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
2027; CHECK-LABEL: bar2:
2028; CHECK:       // %bb.0:
2029; CHECK-NEXT:    smlal2.2d v0, v1, v2
2030; CHECK-NEXT:    ret
2031  %tmp = bitcast <4 x i32> %b to <2 x i64>
2032  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2033  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
2034  %tmp2 = bitcast <4 x i32> %c to <2 x i64>
2035  %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2036  %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
2037  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
2038  %add.i = add <2 x i64> %vmull2.i.i.i, %a
2039  ret <2 x i64> %add.i
2040}
2041
2042define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
2043; CHECK-LABEL: bar3:
2044; CHECK:       // %bb.0:
2045; CHECK-NEXT:    umlal2.8h v0, v1, v2
2046; CHECK-NEXT:    ret
2047  %tmp = bitcast <16 x i8> %b to <2 x i64>
2048  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2049  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
2050  %tmp2 = bitcast <16 x i8> %c to <2 x i64>
2051  %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2052  %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
2053  %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
2054  %add.i = add <8 x i16> %vmull.i.i.i, %a
2055  ret <8 x i16> %add.i
2056}
2057
2058define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
2059; CHECK-LABEL: bar4:
2060; CHECK:       // %bb.0:
2061; CHECK-NEXT:    umlal2.4s v0, v1, v2
2062; CHECK-NEXT:    ret
2063  %tmp = bitcast <8 x i16> %b to <2 x i64>
2064  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2065  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
2066  %tmp2 = bitcast <8 x i16> %c to <2 x i64>
2067  %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2068  %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
2069  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
2070  %add.i = add <4 x i32> %vmull2.i.i.i, %a
2071  ret <4 x i32> %add.i
2072}
2073
2074define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
2075; CHECK-LABEL: bar5:
2076; CHECK:       // %bb.0:
2077; CHECK-NEXT:    umlal2.2d v0, v1, v2
2078; CHECK-NEXT:    ret
2079  %tmp = bitcast <4 x i32> %b to <2 x i64>
2080  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2081  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
2082  %tmp2 = bitcast <4 x i32> %c to <2 x i64>
2083  %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2084  %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
2085  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
2086  %add.i = add <2 x i64> %vmull2.i.i.i, %a
2087  ret <2 x i64> %add.i
2088}
2089
2090define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
2091; CHECK-LABEL: mlal2_1:
2092; CHECK:       // %bb.0:
2093; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2094; CHECK-NEXT:    smlal2.4s v0, v1, v2[3]
2095; CHECK-NEXT:    ret
2096  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
2097  %tmp = bitcast <8 x i16> %b to <2 x i64>
2098  %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2099  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
2100  %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
2101  %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2102  %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
2103  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
2104  %add = add <4 x i32> %vmull2.i.i, %a
2105  ret <4 x i32> %add
2106}
2107
2108define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
2109; CHECK-LABEL: mlal2_2:
2110; CHECK:       // %bb.0:
2111; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2112; CHECK-NEXT:    smlal2.2d v0, v1, v2[1]
2113; CHECK-NEXT:    ret
2114  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2115  %tmp = bitcast <4 x i32> %b to <2 x i64>
2116  %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2117  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
2118  %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
2119  %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2120  %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
2121  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
2122  %add = add <2 x i64> %vmull2.i.i, %a
2123  ret <2 x i64> %add
2124}
2125
2126define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
2127; CHECK-LABEL: mlal2_4:
2128; CHECK:       // %bb.0:
2129; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2130; CHECK-NEXT:    umlal2.4s v0, v1, v2[2]
2131; CHECK-NEXT:    ret
2132  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2133  %tmp = bitcast <8 x i16> %b to <2 x i64>
2134  %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2135  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
2136  %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
2137  %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2138  %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
2139  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
2140  %add = add <4 x i32> %vmull2.i.i, %a
2141  ret <4 x i32> %add
2142}
2143
2144define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
2145; CHECK-LABEL: mlal2_5:
2146; CHECK:       // %bb.0:
2147; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
2148; CHECK-NEXT:    umlal2.2d v0, v1, v2[0]
2149; CHECK-NEXT:    ret
2150  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer
2151  %tmp = bitcast <4 x i32> %b to <2 x i64>
2152  %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
2153  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
2154  %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
2155  %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
2156  %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
2157  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
2158  %add = add <2 x i64> %vmull2.i.i, %a
2159  ret <2 x i64> %add
2160}
2161
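; fmul by lane 0 of a scalar splat (vmul[q]_n_f32 / vmulq_n_f64)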
2162; rdar://12328502
2163define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp {
2164; CHECK-LABEL: vmulq_n_f64:
2165; CHECK:       // %bb.0: // %entry
2166; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2167; CHECK-NEXT:    fmul.2d v0, v0, v1[0]
2168; CHECK-NEXT:    ret
2169entry:
2170  %vecinit.i = insertelement <2 x double> undef, double %y, i32 0
2171  %vecinit1.i = insertelement <2 x double> %vecinit.i, double %y, i32 1
2172  %mul.i = fmul <2 x double> %vecinit1.i, %x
2173  ret <2 x double> %mul.i
2174}
2175
2176define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp {
2177; CHECK-LABEL: vmulq_n_f32:
2178; CHECK:       // %bb.0: // %entry
2179; CHECK-NEXT:    // kill: def $s1 killed $s1 def $q1
2180; CHECK-NEXT:    fmul.4s v0, v0, v1[0]
2181; CHECK-NEXT:    ret
2182entry:
2183  %vecinit.i = insertelement <4 x float> undef, float %y, i32 0
2184  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %y, i32 1
2185  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %y, i32 2
2186  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %y, i32 3
2187  %mul.i = fmul <4 x float> %vecinit3.i, %x
2188  ret <4 x float> %mul.i
2189}
2190
2191define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp {
2192; CHECK-LABEL: vmul_n_f32:
2193; CHECK:       // %bb.0: // %entry
2194; CHECK-NEXT:    // kill: def $s1 killed $s1 def $q1
2195; CHECK-NEXT:    fmul.2s v0, v0, v1[0]
2196; CHECK-NEXT:    ret
2197entry:
2198  %vecinit.i = insertelement <2 x float> undef, float %y, i32 0
2199  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %y, i32 1
2200  %mul.i = fmul <2 x float> %vecinit1.i, %x
2201  ret <2 x float> %mul.i
2202}
2203
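; mla and [su]mull with the lane taken from a 128-bit vector (laneq forms)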
2204define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
2205; CHECK-LABEL: vmla_laneq_s16_test:
2206; CHECK:       // %bb.0: // %entry
2207; CHECK-NEXT:    mla.4h v0, v1, v2[6]
2208; CHECK-NEXT:    ret
2209entry:
2210  %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
2211  %mul = mul <4 x i16> %shuffle, %b
2212  %add = add <4 x i16> %mul, %a
2213  ret <4 x i16> %add
2214}
2215
2216define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
2217; CHECK-LABEL: vmla_laneq_s32_test:
2218; CHECK:       // %bb.0: // %entry
2219; CHECK-NEXT:    mla.2s v0, v1, v2[3]
2220; CHECK-NEXT:    ret
2221entry:
2222  %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
2223  %mul = mul <2 x i32> %shuffle, %b
2224  %add = add <2 x i32> %mul, %a
2225  ret <2 x i32> %add
2226}
2227
2228define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
2229; CHECK-LABEL: not_really_vmlaq_laneq_s16_test:
2230; CHECK:       // %bb.0: // %entry
2231; CHECK-NEXT:    mla.8h v0, v1, v2[5]
2232; CHECK-NEXT:    ret
2233entry:
2234  %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2235  %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2236  %mul = mul <8 x i16> %shuffle2, %b
2237  %add = add <8 x i16> %mul, %a
2238  ret <8 x i16> %add
2239}
2240
2241define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
2242; CHECK-LABEL: not_really_vmlaq_laneq_s32_test:
2243; CHECK:       // %bb.0: // %entry
2244; CHECK-NEXT:    mla.4s v0, v1, v2[3]
2245; CHECK-NEXT:    ret
2246entry:
2247  %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2248  %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2249  %mul = mul <4 x i32> %shuffle2, %b
2250  %add = add <4 x i32> %mul, %a
2251  ret <4 x i32> %add
2252}
2253
2254define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
2255; CHECK-LABEL: vmull_laneq_s16_test:
2256; CHECK:       // %bb.0: // %entry
2257; CHECK-NEXT:    smull.4s v0, v0, v1[6]
2258; CHECK-NEXT:    ret
2259entry:
2260  %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
2261  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
2262  ret <4 x i32> %vmull2.i
2263}
2264
2265define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
2266; CHECK-LABEL: vmull_laneq_s32_test:
2267; CHECK:       // %bb.0: // %entry
2268; CHECK-NEXT:    smull.2d v0, v0, v1[2]
2269; CHECK-NEXT:    ret
2270entry:
2271  %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
2272  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
2273  ret <2 x i64> %vmull2.i
2274}

2275define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
2276; CHECK-LABEL: vmull_laneq_u16_test:
2277; CHECK:       // %bb.0: // %entry
2278; CHECK-NEXT:    umull.4s v0, v0, v1[6]
2279; CHECK-NEXT:    ret
2280entry:
2281  %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
2282  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
2283  ret <4 x i32> %vmull2.i
2284}
2285
2286define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
2287; CHECK-LABEL: vmull_laneq_u32_test:
2288; CHECK:       // %bb.0: // %entry
2289; CHECK-NEXT:    umull.2d v0, v0, v1[2]
2290; CHECK-NEXT:    ret
2291entry:
2292  %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
2293  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
2294  ret <2 x i64> %vmull2.i
2295}
2296
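; [su]mull_n / [su]mull_high_n style tests: the scalar multiplier is splatted with dup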
2297define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
2298; CHECK-LABEL: vmull_low_n_s16_test:
2299; CHECK:       // %bb.0: // %entry
2300; CHECK-NEXT:    dup.4h v0, w0
2301; CHECK-NEXT:    smull.4s v0, v1, v0
2302; CHECK-NEXT:    ret
2303entry:
2304  %conv = trunc i32 %d to i16
2305  %0 = bitcast <8 x i16> %b to <2 x i64>
2306  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
2307  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
2308  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
2309  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
2310  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
2311  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
2312  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
2313  ret <4 x i32> %vmull2.i.i
2314}
2315
2316define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
2317; CHECK-LABEL: vmull_high_n_s16_test:
2318; CHECK:       // %bb.0: // %entry
2319; CHECK-NEXT:    dup.8h v0, w0
2320; CHECK-NEXT:    smull2.4s v0, v1, v0
2321; CHECK-NEXT:    ret
2322entry:
2323  %conv = trunc i32 %d to i16
2324  %0 = bitcast <8 x i16> %b to <2 x i64>
2325  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
2326  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
2327  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
2328  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
2329  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
2330  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
2331  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
2332  ret <4 x i32> %vmull2.i.i
2333}
2334
2335define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
2336; CHECK-LABEL: vmull_high_n_s32_test:
2337; CHECK:       // %bb.0: // %entry
2338; CHECK-NEXT:    dup.4s v0, w0
2339; CHECK-NEXT:    smull2.2d v0, v1, v0
2340; CHECK-NEXT:    ret
2341entry:
2342  %0 = bitcast <4 x i32> %b to <2 x i64>
2343  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
2344  %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
2345  %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
2346  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
2347  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
2348  ret <2 x i64> %vmull2.i.i
2349}
2350
2351define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
2352; CHECK-LABEL: vmull_high_n_u16_test:
2353; CHECK:       // %bb.0: // %entry
2354; CHECK-NEXT:    dup.8h v0, w0
2355; CHECK-NEXT:    umull2.4s v0, v1, v0
2356; CHECK-NEXT:    ret
2357entry:
2358  %conv = trunc i32 %d to i16
2359  %0 = bitcast <8 x i16> %b to <2 x i64>
2360  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
2361  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
2362  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
2363  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
2364  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
2365  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
2366  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
2367  ret <4 x i32> %vmull2.i.i
2368}
2369
2370define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
2371; CHECK-LABEL: vmull_high_n_u32_test:
2372; CHECK:       // %bb.0: // %entry
2373; CHECK-NEXT:    dup.4s v0, w0
2374; CHECK-NEXT:    umull2.2d v0, v1, v0
2375; CHECK-NEXT:    ret
2376entry:
2377  %0 = bitcast <4 x i32> %b to <2 x i64>
2378  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
2379  %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
2380  %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
2381  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
2382  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
2383  ret <2 x i64> %vmull2.i.i
2384}
2385
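; mul by lane where the splat is built from insertelement chains of an extracted lane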
2386define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) {
2387; CHECK-LABEL: vmul_built_dup_test:
2388; CHECK:       // %bb.0:
2389; CHECK-NEXT:    mul.4s v0, v0, v1[1]
2390; CHECK-NEXT:    ret
2391  %vget_lane = extractelement <4 x i32> %b, i32 1
2392  %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0
2393  %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1
2394  %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2
2395  %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3
2396  %prod = mul <4 x i32> %a, %vecinit3.i
2397  ret <4 x i32> %prod
2398}
2399
2400define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) {
2401; CHECK-LABEL: vmul_built_dup_fromsmall_test:
2402; CHECK:       // %bb.0:
2403; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2404; CHECK-NEXT:    mul.4h v0, v0, v1[3]
2405; CHECK-NEXT:    ret
2406  %vget_lane = extractelement <4 x i16> %b, i32 3
2407  %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
2408  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
2409  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
2410  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
2411  %prod = mul <4 x i16> %a, %vecinit3.i
2412  ret <4 x i16> %prod
2413}
2414
2415define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) {
2416; CHECK-LABEL: vmulq_built_dup_fromsmall_test:
2417; CHECK:       // %bb.0:
2418; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2419; CHECK-NEXT:    mul.8h v0, v0, v1[0]
2420; CHECK-NEXT:    ret
2421  %vget_lane = extractelement <4 x i16> %b, i32 0
2422  %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
2423  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
2424  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
2425  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
2426  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
2427  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
2428  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
2429  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
2430  %prod = mul <8 x i16> %a, %vecinit7.i
2431  ret <8 x i16> %prod
2432}
2433
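; sqdmull2/sqdmlal2 recognised from two explicit high-half extracts, and sqdmull(2) with a dup'd scalar operand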
2434define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) {
2435; CHECK-LABEL: mull_from_two_extracts:
2436; CHECK:       // %bb.0:
2437; CHECK-NEXT:    sqdmull2.2d v0, v0, v1
2438; CHECK-NEXT:    ret
2439  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2440  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2441
2442  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
2443  ret <2 x i64> %res
2444}
2445
2446define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
2447; CHECK-LABEL: mlal_from_two_extracts:
2448; CHECK:       // %bb.0:
2449; CHECK-NEXT:    sqdmlal2.2d v0, v1, v2
2450; CHECK-NEXT:    ret
2451  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2452  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2453
2454  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
2455  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
2456  ret <2 x i64> %sum
2457}
2458
2459define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) {
2460; CHECK-LABEL: mull_from_extract_dup_low:
2461; CHECK:       // %bb.0:
2462; CHECK-NEXT:    dup.2s v1, w0
2463; CHECK-NEXT:    sqdmull.2d v0, v0, v1
2464; CHECK-NEXT:    ret
2465  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
2466  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
2467
2468  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
2469
2470  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
2471  ret <2 x i64> %res
2472}
2473
2474define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) {
2475; CHECK-LABEL: mull_from_extract_dup_high:
2476; CHECK:       // %bb.0:
2477; CHECK-NEXT:    dup.4s v1, w0
2478; CHECK-NEXT:    sqdmull2.2d v0, v0, v1
2479; CHECK-NEXT:    ret
2480  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
2481  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
2482
2483  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2484
2485  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
2486  ret <2 x i64> %res
2487}
2488
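; pmull/pmull2 with one operand splatted (dup of a GPR or of a vector lane)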
2489define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
2490; CHECK-LABEL: pmull_from_extract_dup_low:
2491; CHECK:       // %bb.0:
2492; CHECK-NEXT:    dup.8b v1, w0
2493; CHECK-NEXT:    pmull.8h v0, v0, v1
2494; CHECK-NEXT:    ret
2495  %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
2496  %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
2497
2498  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2499
2500  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
2501  ret <8 x i16> %res
2502}
2503
2504define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
2505; CHECK-LABEL: pmull_from_extract_dup_high:
2506; CHECK:       // %bb.0:
2507; CHECK-NEXT:    dup.16b v1, w0
2508; CHECK-NEXT:    pmull2.8h v0, v0, v1
2509; CHECK-NEXT:    ret
2510  %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
2511  %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
2512
2513  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2514
2515  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
2516  ret <8 x i16> %res
2517}
2518
2519define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) {
2520; CHECK-LABEL: pmull_from_extract_duplane_low:
2521; CHECK:       // %bb.0:
2522; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2523; CHECK-NEXT:    dup.8b v1, v1[0]
2524; CHECK-NEXT:    pmull.8h v0, v0, v1
2525; CHECK-NEXT:    ret
2526  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2527  %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
2528
2529  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
2530  ret <8 x i16> %res
2531}
2532
2533define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
2534; CHECK-LABEL: pmull_from_extract_duplane_high:
2535; CHECK:       // %bb.0:
2536; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
2537; CHECK-NEXT:    dup.16b v1, v1[0]
2538; CHECK-NEXT:    pmull2.8h v0, v0, v1
2539; CHECK-NEXT:    ret
2540  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2541  %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
2542
2543  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
2544  ret <8 x i16> %res
2545}
2546
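; sqdmull, sqdmlal and umlal where a dup of lane 0 folds into the lane-indexed form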
2547define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) {
2548; CHECK-LABEL: sqdmull_from_extract_duplane_low:
2549; CHECK:       // %bb.0:
2550; CHECK-NEXT:    sqdmull.2d v0, v0, v1[0]
2551; CHECK-NEXT:    ret
2552  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
2553  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
2554
2555  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
2556  ret <2 x i64> %res
2557}
2558
2559define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) {
2560; CHECK-LABEL: sqdmull_from_extract_duplane_high:
2561; CHECK:       // %bb.0:
2562; CHECK-NEXT:    sqdmull2.2d v0, v0, v1[0]
2563; CHECK-NEXT:    ret
2564  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2565  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
2566
2567  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
2568  ret <2 x i64> %res
2569}
2570
define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmlal_from_extract_duplane_low:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sqdmlal.2d v0, v1, v2[0]
; CHECK-NEXT:    ret
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>

  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
  ret <2 x i64> %sum
}

define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmlal_from_extract_duplane_high:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sqdmlal2.2d v0, v1, v2[0]
; CHECK-NEXT:    ret
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>

  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
  ret <2 x i64> %sum
}

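; Unsigned widening multiply plus an ordinary add of the accumulator should
; become the by-element umlal.2d / umlal2.2d.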
define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: umlal_from_extract_duplane_low:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umlal.2d v0, v1, v2[0]
; CHECK-NEXT:    ret
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>

  %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = add <2 x i64> %accum, %res
  ret <2 x i64> %sum
}

define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: umlal_from_extract_duplane_high:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umlal2.2d v0, v1, v2[0]
; CHECK-NEXT:    ret
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>

  %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = add <2 x i64> %accum, %res
  ret <2 x i64> %sum
}

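; Scalar fma/fms where the multiplier is extracted from a vector lane should
; use the indexed fmla/fmls forms instead of extracting the element first.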
define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla.s s0, s1, v2[3]
; CHECK-NEXT:    ret
  %rhs = extractelement <4 x float> %rvec, i32 3
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    fmla.s s0, s1, v2[1]
; CHECK-NEXT:    ret
  %rhs = extractelement <2 x float> %rvec, i32 1
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmls.s s0, s1, v2[3]
; CHECK-NEXT:    ret
  %rhs.scal = extractelement <4 x float> %rvec, i32 3
  %rhs = fsub float -0.0, %rhs.scal
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    fmls.s s0, s1, v2[1]
; CHECK-NEXT:    ret
  %rhs.scal = extractelement <2 x float> %rvec, i32 1
  %rhs = fsub float -0.0, %rhs.scal
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

declare float @llvm.fma.f32(float, float, float)

define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla.d d0, d1, v2[1]
; CHECK-NEXT:    ret
  %rhs = extractelement <2 x double> %rvec, i32 1
  %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
  ret double %res
}

define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmls.d d0, d1, v2[1]
; CHECK-NEXT:    ret
  %rhs.scal = extractelement <2 x double> %rvec, i32 1
  %rhs = fsub double -0.0, %rhs.scal
  %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
  ret double %res
}

declare double @llvm.fma.f64(double, double, double)

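; Here the fneg is applied to the whole vector before the lane is splatted;
; the negation should still fold into an indexed fmls.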
define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmls.2s v0, v1, v2[3]
; CHECK-NEXT:    ret
  %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
  %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <2 x i32> <i32 3, i32 3>
  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
  ret <2 x float> %res
}

define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    fmls.2s v0, v1, v2[1]
; CHECK-NEXT:    ret
  %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
  %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
  ret <2 x float> %res
}

define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmls.4s v0, v1, v2[3]
; CHECK-NEXT:    ret
  %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
  %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
  ret <4 x float> %res
}

define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    fmls.4s v0, v1, v2[1]
; CHECK-NEXT:    ret
  %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
  %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
  ret <4 x float> %res
}

define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmls.2d v0, v1, v2[1]
; CHECK-NEXT:    ret
  %rhs_neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs
  %splat = shufflevector <2 x double> %rhs_neg, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %splat, <2 x double> %accum)
  ret <2 x double> %res
}

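; <1 x double> arithmetic is expected to lower to the plain scalar FP
; instructions.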
define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
; CHECK-LABEL: test_fmul_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmul d0, d0, d1
; CHECK-NEXT:    ret
  %prod = fmul <1 x double> %L, %R
  ret <1 x double> %prod
}

define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
; CHECK-LABEL: test_fdiv_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fdiv d0, d0, d1
; CHECK-NEXT:    ret
  %prod = fdiv <1 x double> %L, %R
  ret <1 x double> %prod
}

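; Scalar saturating doubling multiply-accumulate built from the sqdmull
; intrinsics plus a scalar sqadd/sqsub should map to sqdmlal/sqdmlsl.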
define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind {
; CHECK-LABEL: sqdmlal_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov s0, w0
; CHECK-NEXT:    fmov s1, w1
; CHECK-NEXT:    fmov s2, w2
; CHECK-NEXT:    sqdmlal.h s2, h0, v1[0]
; CHECK-NEXT:    fmov w0, s2
; CHECK-NEXT:    ret
  %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
  %tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
  %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = extractelement <4 x i32> %tmp3, i64 0
  %tmp5 = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %C, i32 %tmp4)
  ret i32 %tmp5
}

define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
; CHECK-LABEL: sqdmlal_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, x2
; CHECK-NEXT:    fmov s1, w0
; CHECK-NEXT:    fmov s2, w1
; CHECK-NEXT:    sqdmlal d0, s1, s2
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
  %tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4)
  ret i64 %tmp5
}

define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind {
; CHECK-LABEL: sqdmlsl_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov s0, w0
; CHECK-NEXT:    fmov s1, w1
; CHECK-NEXT:    fmov s2, w2
; CHECK-NEXT:    sqdmlsl.h s2, h0, v1[0]
; CHECK-NEXT:    fmov w0, s2
; CHECK-NEXT:    ret
  %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
  %tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
  %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = extractelement <4 x i32> %tmp3, i64 0
  %tmp5 = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %C, i32 %tmp4)
  ret i32 %tmp5
}

define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
; CHECK-LABEL: sqdmlsl_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, x2
; CHECK-NEXT:    fmov s1, w0
; CHECK-NEXT:    fmov s2, w1
; CHECK-NEXT:    sqdmlsl d0, s1, s2
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
  %tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4)
  ret i64 %tmp5
}

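; pmull.1q / pmull2.1q multiply 64-bit polynomial elements into a 128-bit
; result; they are part of the crypto (AES/PMULL) extension.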
define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
; CHECK-LABEL: test_pmull_64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, x1
; CHECK-NEXT:    fmov d1, x0
; CHECK-NEXT:    pmull.1q v0, v1, v0
; CHECK-NEXT:    ret
  %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
  ret <16 x i8> %val
}

define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
; CHECK-LABEL: test_pmull_high_64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    pmull2.1q v0, v0, v1
; CHECK-NEXT:    ret
  %l_hi = extractelement <2 x i64> %l, i32 1
  %r_hi = extractelement <2 x i64> %r, i32 1
  %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi)
  ret <16 x i8> %val
}

declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)

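; There is no NEON multiply for 64-bit integer elements, so a <1 x i64> mul
; goes through the general-purpose registers.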
define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
; CHECK-LABEL: test_mul_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x8, d1
; CHECK-NEXT:    fmov x9, d0
; CHECK-NEXT:    mul x8, x9, x8
; CHECK-NEXT:    fmov d0, x8
; CHECK-NEXT:    ret
  %prod = mul <1 x i64> %lhs, %rhs
  ret <1 x i64> %prod
}
