xref: /llvm-project/llvm/test/CodeGen/AArch64/aarch64-smull.ll (revision 712ef7d0baf9b7b6c2a3f01f0c02305a0e0160e9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON
3; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
4; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
5
6; CHECK-GI:       warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1
7; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for pmlsl_pmlsl2_v8i16_uzp1
8
; Signed widening multiply, 8 x i8 -> 8 x i16.
; Both operands are loaded as <8 x i8>, sign-extended to <8 x i16> and
; multiplied. All RUN configurations share one expectation: two 64-bit loads
; followed by a single `smull v0.8h, v0.8b, v1.8b`.
9define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
10; CHECK-LABEL: smull_v8i8_v8i16:
11; CHECK:       // %bb.0:
12; CHECK-NEXT:    ldr d0, [x0]
13; CHECK-NEXT:    ldr d1, [x1]
14; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
15; CHECK-NEXT:    ret
16  %tmp1 = load <8 x i8>, ptr %A
17  %tmp2 = load <8 x i8>, ptr %B
18  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
19  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
20  %tmp5 = mul <8 x i16> %tmp3, %tmp4
21  ret <8 x i16> %tmp5
22}
23
; Signed widening multiply, 4 x i16 -> 4 x i32.
; Both <4 x i16> loads are sign-extended to <4 x i32> before the mul; every
; RUN configuration expects the pair of extends plus mul to become one
; `smull v0.4s, v0.4h, v1.4h`.
24define <4 x i32> @smull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
25; CHECK-LABEL: smull_v4i16_v4i32:
26; CHECK:       // %bb.0:
27; CHECK-NEXT:    ldr d0, [x0]
28; CHECK-NEXT:    ldr d1, [x1]
29; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
30; CHECK-NEXT:    ret
31  %tmp1 = load <4 x i16>, ptr %A
32  %tmp2 = load <4 x i16>, ptr %B
33  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
34  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
35  %tmp5 = mul <4 x i32> %tmp3, %tmp4
36  ret <4 x i32> %tmp5
37}
38
; Signed widening multiply, 2 x i32 -> 2 x i64.
; Both <2 x i32> loads are sign-extended to <2 x i64> before the mul; every
; RUN configuration expects a single `smull v0.2d, v0.2s, v1.2s`.
39define <2 x i64> @smull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
40; CHECK-LABEL: smull_v2i32_v2i64:
41; CHECK:       // %bb.0:
42; CHECK-NEXT:    ldr d0, [x0]
43; CHECK-NEXT:    ldr d1, [x1]
44; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
45; CHECK-NEXT:    ret
46  %tmp1 = load <2 x i32>, ptr %A
47  %tmp2 = load <2 x i32>, ptr %B
48  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
49  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
50  %tmp5 = mul <2 x i64> %tmp3, %tmp4
51  ret <2 x i64> %tmp5
52}
53
; Mixed-extension multiply: zext(<8 x i8>) * sext(<8 x i16>) -> <8 x i32>.
; The zero-extended operand starts two steps narrower than the result type.
;  * NEON and SVE runs: expect the i8 operand to be widened once with ushll
;    to 8h, after which the low/high halves use smull / smull2.
;  * GlobalISel run: expects both operands to be widened to 4s lanes
;    (ushll/ushll2 and sshll/sshll2) followed by two plain `mul`s.
54define <8 x i32> @smull_zext_v8i8_v8i32(ptr %A, ptr %B) nounwind {
55; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32:
56; CHECK-NEON:       // %bb.0:
57; CHECK-NEON-NEXT:    ldr d0, [x0]
58; CHECK-NEON-NEXT:    ldr q2, [x1]
59; CHECK-NEON-NEXT:    ushll v0.8h, v0.8b, #0
60; CHECK-NEON-NEXT:    smull2 v1.4s, v0.8h, v2.8h
61; CHECK-NEON-NEXT:    smull v0.4s, v0.4h, v2.4h
62; CHECK-NEON-NEXT:    ret
63;
64; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32:
65; CHECK-SVE:       // %bb.0:
66; CHECK-SVE-NEXT:    ldr d0, [x0]
67; CHECK-SVE-NEXT:    ldr q2, [x1]
68; CHECK-SVE-NEXT:    ushll v0.8h, v0.8b, #0
69; CHECK-SVE-NEXT:    smull2 v1.4s, v0.8h, v2.8h
70; CHECK-SVE-NEXT:    smull v0.4s, v0.4h, v2.4h
71; CHECK-SVE-NEXT:    ret
72;
73; CHECK-GI-LABEL: smull_zext_v8i8_v8i32:
74; CHECK-GI:       // %bb.0:
75; CHECK-GI-NEXT:    ldr d0, [x0]
76; CHECK-GI-NEXT:    ldr q1, [x1]
77; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
78; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
79; CHECK-GI-NEXT:    ushll2 v3.4s, v0.8h, #0
80; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
81; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
82; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v0.4s
83; CHECK-GI-NEXT:    mul v1.4s, v3.4s, v1.4s
84; CHECK-GI-NEXT:    ret
85  %load.A = load <8 x i8>, ptr %A
86  %load.B = load <8 x i16>, ptr %B
87  %zext.A = zext <8 x i8> %load.A to <8 x i32>
88  %sext.B = sext <8 x i16> %load.B to <8 x i32>
89  %res = mul <8 x i32> %zext.A, %sext.B
90  ret <8 x i32> %res
91}
92
; Same mixed-extension multiply as smull_zext_v8i8_v8i32, but with the
; operand order swapped: sext(<8 x i16> from %A) * zext(<8 x i8> from %B).
;  * NEON and SVE runs: still expect ushll on the i8 operand followed by
;    smull / smull2, with the sign-extended vector as the first source.
;  * GlobalISel run: expects both operands widened to 4s lanes and two
;    plain `mul`s.
93define <8 x i32> @smull_zext_v8i8_v8i32_sext_first_operand(ptr %A, ptr %B) nounwind {
94; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
95; CHECK-NEON:       // %bb.0:
96; CHECK-NEON-NEXT:    ldr d0, [x1]
97; CHECK-NEON-NEXT:    ldr q2, [x0]
98; CHECK-NEON-NEXT:    ushll v0.8h, v0.8b, #0
99; CHECK-NEON-NEXT:    smull2 v1.4s, v2.8h, v0.8h
100; CHECK-NEON-NEXT:    smull v0.4s, v2.4h, v0.4h
101; CHECK-NEON-NEXT:    ret
102;
103; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
104; CHECK-SVE:       // %bb.0:
105; CHECK-SVE-NEXT:    ldr d0, [x1]
106; CHECK-SVE-NEXT:    ldr q2, [x0]
107; CHECK-SVE-NEXT:    ushll v0.8h, v0.8b, #0
108; CHECK-SVE-NEXT:    smull2 v1.4s, v2.8h, v0.8h
109; CHECK-SVE-NEXT:    smull v0.4s, v2.4h, v0.4h
110; CHECK-SVE-NEXT:    ret
111;
112; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
113; CHECK-GI:       // %bb.0:
114; CHECK-GI-NEXT:    ldr d0, [x1]
115; CHECK-GI-NEXT:    ldr q1, [x0]
116; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
117; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
118; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
119; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
120; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
121; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v3.4s
122; CHECK-GI-NEXT:    mul v1.4s, v1.4s, v4.4s
123; CHECK-GI-NEXT:    ret
124  %load.A = load <8 x i16>, ptr %A
125  %load.B = load <8 x i8>, ptr %B
126  %sext.A = sext <8 x i16> %load.A to <8 x i32>
127  %zext.B = zext <8 x i8> %load.B to <8 x i32>
128  %res = mul <8 x i32> %sext.A, %zext.B
129  ret <8 x i32> %res
130}
131
; Negative test: zext(<8 x i16> with bit 15 forced to 1) * sext(<8 x i16>).
; Note that, despite the "v8i8" in the name, both operands are loaded as
; <8 x i16>. The `or` with 0x8000 sets the top bit of every lane before the
; zero-extension. None of the RUN configurations expects smull here: all of
; them widen both operands explicitly (ushll/ushll2 and sshll/sshll2) and
; use plain `mul`.
; NOTE(review): presumably smull must not be formed because a zero-extended
; value with its top bit set differs from its sign-extension - confirm
; against the AArch64 lowering code.
132define <8 x i32> @smull_zext_v8i8_v8i32_top_bit_is_1(ptr %A, ptr %B) nounwind {
133; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
134; CHECK-NEON:       // %bb.0:
135; CHECK-NEON-NEXT:    ldr q0, [x0]
136; CHECK-NEON-NEXT:    ldr q1, [x1]
137; CHECK-NEON-NEXT:    orr v0.8h, #128, lsl #8
138; CHECK-NEON-NEXT:    sshll v3.4s, v1.4h, #0
139; CHECK-NEON-NEXT:    sshll2 v1.4s, v1.8h, #0
140; CHECK-NEON-NEXT:    ushll v2.4s, v0.4h, #0
141; CHECK-NEON-NEXT:    ushll2 v0.4s, v0.8h, #0
142; CHECK-NEON-NEXT:    mul v1.4s, v0.4s, v1.4s
143; CHECK-NEON-NEXT:    mul v0.4s, v2.4s, v3.4s
144; CHECK-NEON-NEXT:    ret
145;
146; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
147; CHECK-SVE:       // %bb.0:
148; CHECK-SVE-NEXT:    ldr q0, [x0]
149; CHECK-SVE-NEXT:    ldr q1, [x1]
150; CHECK-SVE-NEXT:    orr v0.8h, #128, lsl #8
151; CHECK-SVE-NEXT:    sshll v3.4s, v1.4h, #0
152; CHECK-SVE-NEXT:    sshll2 v1.4s, v1.8h, #0
153; CHECK-SVE-NEXT:    ushll v2.4s, v0.4h, #0
154; CHECK-SVE-NEXT:    ushll2 v0.4s, v0.8h, #0
155; CHECK-SVE-NEXT:    mul v1.4s, v0.4s, v1.4s
156; CHECK-SVE-NEXT:    mul v0.4s, v2.4s, v3.4s
157; CHECK-SVE-NEXT:    ret
158;
159; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
160; CHECK-GI:       // %bb.0:
161; CHECK-GI-NEXT:    movi v0.8h, #128, lsl #8
162; CHECK-GI-NEXT:    ldr q1, [x0]
163; CHECK-GI-NEXT:    orr v0.16b, v1.16b, v0.16b
164; CHECK-GI-NEXT:    ldr q1, [x1]
165; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
166; CHECK-GI-NEXT:    ushll2 v3.4s, v0.8h, #0
167; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
168; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
169; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v0.4s
170; CHECK-GI-NEXT:    mul v1.4s, v3.4s, v1.4s
171; CHECK-GI-NEXT:    ret
172  %load.A = load <8 x i16>, ptr %A
173  %or.A = or <8 x i16> %load.A, <i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000>
174  %load.B = load <8 x i16>, ptr %B
175  %zext.A = zext <8 x i16> %or.A  to <8 x i32>
176  %sext.B = sext <8 x i16> %load.B to <8 x i32>
177  %res = mul <8 x i32> %zext.A, %sext.B
178  ret <8 x i32> %res
179}
180
; Mixed-extension multiply: zext(<4 x i8>) * sext(<4 x i16>) -> <4 x i32>.
; Note that the zero-extended operand is loaded as <4 x i8> (a 32-bit load),
; not <4 x i16> as the function name might suggest.
;  * NEON and SVE runs: expect a 32-bit `ldr s0`, one ushll to 8h, then a
;    single `smull v0.4s, v0.4h, v1.4h`.
;  * GlobalISel run: expects a much longer sequence that loads the four
;    bytes into a GPR, extracts and zero-extends each byte individually,
;    rebuilds the vector, and finishes with sshll + plain `mul`.
181define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
182; CHECK-NEON-LABEL: smull_zext_v4i16_v4i32:
183; CHECK-NEON:       // %bb.0:
184; CHECK-NEON-NEXT:    ldr s0, [x0]
185; CHECK-NEON-NEXT:    ldr d1, [x1]
186; CHECK-NEON-NEXT:    ushll v0.8h, v0.8b, #0
187; CHECK-NEON-NEXT:    smull v0.4s, v0.4h, v1.4h
188; CHECK-NEON-NEXT:    ret
189;
190; CHECK-SVE-LABEL: smull_zext_v4i16_v4i32:
191; CHECK-SVE:       // %bb.0:
192; CHECK-SVE-NEXT:    ldr s0, [x0]
193; CHECK-SVE-NEXT:    ldr d1, [x1]
194; CHECK-SVE-NEXT:    ushll v0.8h, v0.8b, #0
195; CHECK-SVE-NEXT:    smull v0.4s, v0.4h, v1.4h
196; CHECK-SVE-NEXT:    ret
197;
198; CHECK-GI-LABEL: smull_zext_v4i16_v4i32:
199; CHECK-GI:       // %bb.0:
200; CHECK-GI-NEXT:    ldr w8, [x0]
201; CHECK-GI-NEXT:    fmov s0, w8
202; CHECK-GI-NEXT:    uxtb w8, w8
203; CHECK-GI-NEXT:    mov b1, v0.b[2]
204; CHECK-GI-NEXT:    mov b2, v0.b[1]
205; CHECK-GI-NEXT:    mov b3, v0.b[3]
206; CHECK-GI-NEXT:    fmov s0, w8
207; CHECK-GI-NEXT:    fmov w9, s1
208; CHECK-GI-NEXT:    fmov w10, s2
209; CHECK-GI-NEXT:    fmov w11, s3
210; CHECK-GI-NEXT:    ldr d2, [x1]
211; CHECK-GI-NEXT:    uxtb w9, w9
212; CHECK-GI-NEXT:    uxtb w10, w10
213; CHECK-GI-NEXT:    uxtb w11, w11
214; CHECK-GI-NEXT:    fmov s1, w9
215; CHECK-GI-NEXT:    mov v0.h[1], w10
216; CHECK-GI-NEXT:    mov v1.h[1], w11
217; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
218; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
219; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
220; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
221; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
222; CHECK-GI-NEXT:    ret
223  %load.A = load <4 x i8>, ptr %A
224  %load.B = load <4 x i16>, ptr %B
225  %zext.A = zext <4 x i8> %load.A to <4 x i32>
226  %sext.B = sext <4 x i16> %load.B to <4 x i32>
227  %res = mul <4 x i32> %zext.A, %sext.B
228  ret <4 x i32> %res
229}
230
; Mixed-extension multiply: zext(<2 x i16>) * sext(<2 x i32>) -> <2 x i64>.
; Note that the zero-extended operand is loaded as <2 x i16>, not <2 x i32>
; as the function name might suggest.
;  * NEON and SVE runs: expect two zero-extending halfword loads (ldrh), a
;    vector built from the two GPRs, a narrowing xtn to 2s, and finally a
;    single `smull v0.2d, v0.2s, v1.2s`.
;  * GlobalISel run: expects the <2 x i64> multiply to be done as two
;    scalar `mul x` instructions whose results are re-inserted into v0.
231define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
232; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
233; CHECK-NEON:       // %bb.0:
234; CHECK-NEON-NEXT:    ldrh w8, [x0]
235; CHECK-NEON-NEXT:    ldrh w9, [x0, #2]
236; CHECK-NEON-NEXT:    ldr d1, [x1]
237; CHECK-NEON-NEXT:    fmov d0, x8
238; CHECK-NEON-NEXT:    mov v0.d[1], x9
239; CHECK-NEON-NEXT:    xtn v0.2s, v0.2d
240; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
241; CHECK-NEON-NEXT:    ret
242;
243; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
244; CHECK-SVE:       // %bb.0:
245; CHECK-SVE-NEXT:    ldrh w8, [x0]
246; CHECK-SVE-NEXT:    ldrh w9, [x0, #2]
247; CHECK-SVE-NEXT:    ldr d1, [x1]
248; CHECK-SVE-NEXT:    fmov d0, x8
249; CHECK-SVE-NEXT:    mov v0.d[1], x9
250; CHECK-SVE-NEXT:    xtn v0.2s, v0.2d
251; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
252; CHECK-SVE-NEXT:    ret
253;
254; CHECK-GI-LABEL: smull_zext_v2i32_v2i64:
255; CHECK-GI:       // %bb.0:
256; CHECK-GI-NEXT:    ld1 { v1.h }[0], [x0]
257; CHECK-GI-NEXT:    ldr h2, [x0, #2]
258; CHECK-GI-NEXT:    movi d0, #0x00ffff0000ffff
259; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
260; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
261; CHECK-GI-NEXT:    ldr d1, [x1]
262; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
263; CHECK-GI-NEXT:    mov w8, v0.s[0]
264; CHECK-GI-NEXT:    mov w9, v0.s[1]
265; CHECK-GI-NEXT:    mov x11, v1.d[1]
266; CHECK-GI-NEXT:    mov v0.d[0], x8
267; CHECK-GI-NEXT:    mov v0.d[1], x9
268; CHECK-GI-NEXT:    fmov x9, d1
269; CHECK-GI-NEXT:    fmov x8, d0
270; CHECK-GI-NEXT:    mov x10, v0.d[1]
271; CHECK-GI-NEXT:    mul x8, x8, x9
272; CHECK-GI-NEXT:    mul x9, x10, x11
273; CHECK-GI-NEXT:    mov v0.d[0], x8
274; CHECK-GI-NEXT:    mov v0.d[1], x9
275; CHECK-GI-NEXT:    ret
276  %load.A = load <2 x i16>, ptr %A
277  %load.B = load <2 x i32>, ptr %B
278  %zext.A = zext <2 x i16> %load.A to <2 x i64>
279  %sext.B = sext <2 x i32> %load.B to <2 x i64>
280  %res = mul <2 x i64> %zext.A, %sext.B
281  ret <2 x i64> %res
282}
283
; Mixed-extension multiply where the zero-extended operand has its top bit
; cleared first: zext(<2 x i32> & 0x7FFFFFFF) * sext(<2 x i32>) -> <2 x i64>.
; With bit 31 masked off, zero- and sign-extension of that operand give the
; same value.
;  * NEON and SVE runs: expect the mask as `bic v0.2s, #128, lsl #24`
;    followed by a single `smull v0.2d, v0.2s, v1.2s`.
;  * GlobalISel run: expects an mvni/and mask, explicit sshll/ushll
;    widening, and two scalar `mul x` instructions.
284define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind {
285; CHECK-NEON-LABEL: smull_zext_and_v2i32_v2i64:
286; CHECK-NEON:       // %bb.0:
287; CHECK-NEON-NEXT:    ldr d0, [x0]
288; CHECK-NEON-NEXT:    ldr d1, [x1]
289; CHECK-NEON-NEXT:    bic v0.2s, #128, lsl #24
290; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
291; CHECK-NEON-NEXT:    ret
292;
293; CHECK-SVE-LABEL: smull_zext_and_v2i32_v2i64:
294; CHECK-SVE:       // %bb.0:
295; CHECK-SVE-NEXT:    ldr d0, [x0]
296; CHECK-SVE-NEXT:    ldr d1, [x1]
297; CHECK-SVE-NEXT:    bic v0.2s, #128, lsl #24
298; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
299; CHECK-SVE-NEXT:    ret
300;
301; CHECK-GI-LABEL: smull_zext_and_v2i32_v2i64:
302; CHECK-GI:       // %bb.0:
303; CHECK-GI-NEXT:    mvni v0.2s, #128, lsl #24
304; CHECK-GI-NEXT:    ldr d1, [x0]
305; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
306; CHECK-GI-NEXT:    ldr d1, [x1]
307; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
308; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
309; CHECK-GI-NEXT:    fmov x9, d1
310; CHECK-GI-NEXT:    mov x11, v1.d[1]
311; CHECK-GI-NEXT:    fmov x8, d0
312; CHECK-GI-NEXT:    mov x10, v0.d[1]
313; CHECK-GI-NEXT:    mul x8, x8, x9
314; CHECK-GI-NEXT:    mul x9, x10, x11
315; CHECK-GI-NEXT:    mov v0.d[0], x8
316; CHECK-GI-NEXT:    mov v0.d[1], x9
317; CHECK-GI-NEXT:    ret
318  %load.A = load <2 x i32>, ptr %A
319  %and.A = and <2 x i32> %load.A, <i32 u0x7FFFFFFF, i32 u0x7FFFFFFF>
320  %load.B = load <2 x i32>, ptr %B
321  %zext.A = zext <2 x i32> %and.A to <2 x i64>
322  %sext.B = sext <2 x i32> %load.B to <2 x i64>
323  %res = mul <2 x i64> %zext.A, %sext.B
324  ret <2 x i64> %res
325}
326
; Unsigned widening multiply, 8 x i8 -> 8 x i16.
; Both <8 x i8> loads are zero-extended to <8 x i16> before the mul; every
; RUN configuration expects a single `umull v0.8h, v0.8b, v1.8b`.
327define <8 x i16> @umull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
328; CHECK-LABEL: umull_v8i8_v8i16:
329; CHECK:       // %bb.0:
330; CHECK-NEXT:    ldr d0, [x0]
331; CHECK-NEXT:    ldr d1, [x1]
332; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
333; CHECK-NEXT:    ret
334  %tmp1 = load <8 x i8>, ptr %A
335  %tmp2 = load <8 x i8>, ptr %B
336  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
337  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
338  %tmp5 = mul <8 x i16> %tmp3, %tmp4
339  ret <8 x i16> %tmp5
340}
341
; Unsigned widening multiply, 4 x i16 -> 4 x i32.
; Both <4 x i16> loads are zero-extended to <4 x i32> before the mul; every
; RUN configuration expects a single `umull v0.4s, v0.4h, v1.4h`.
342define <4 x i32> @umull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
343; CHECK-LABEL: umull_v4i16_v4i32:
344; CHECK:       // %bb.0:
345; CHECK-NEXT:    ldr d0, [x0]
346; CHECK-NEXT:    ldr d1, [x1]
347; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
348; CHECK-NEXT:    ret
349  %tmp1 = load <4 x i16>, ptr %A
350  %tmp2 = load <4 x i16>, ptr %B
351  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
352  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
353  %tmp5 = mul <4 x i32> %tmp3, %tmp4
354  ret <4 x i32> %tmp5
355}
356
; Unsigned widening multiply, 2 x i32 -> 2 x i64.
; Both <2 x i32> loads are zero-extended to <2 x i64> before the mul; every
; RUN configuration expects a single `umull v0.2d, v0.2s, v1.2s`.
357define <2 x i64> @umull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
358; CHECK-LABEL: umull_v2i32_v2i64:
359; CHECK:       // %bb.0:
360; CHECK-NEXT:    ldr d0, [x0]
361; CHECK-NEXT:    ldr d1, [x1]
362; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
363; CHECK-NEXT:    ret
364  %tmp1 = load <2 x i32>, ptr %A
365  %tmp2 = load <2 x i32>, ptr %B
366  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
367  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
368  %tmp5 = mul <2 x i64> %tmp3, %tmp4
369  ret <2 x i64> %tmp5
370}
371
; Widening multiply whose result is masked back to the source width:
; (zext(<8 x i8>) * zext(<8 x i8>)) & 255 per i16 lane.
; Only the low 8 bits of each product survive the mask, and those bits do
; not depend on how the operands were extended.
;  * NEON and SVE runs: expect `smull` followed by
;    `bic v0.8h, #255, lsl #8` to clear the high byte of every lane.
;  * GlobalISel run: expects `umull` followed by an `and` with a movi mask.
372define <8 x i16> @amull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
373; CHECK-NEON-LABEL: amull_v8i8_v8i16:
374; CHECK-NEON:       // %bb.0:
375; CHECK-NEON-NEXT:    ldr d0, [x0]
376; CHECK-NEON-NEXT:    ldr d1, [x1]
377; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v1.8b
378; CHECK-NEON-NEXT:    bic v0.8h, #255, lsl #8
379; CHECK-NEON-NEXT:    ret
380;
381; CHECK-SVE-LABEL: amull_v8i8_v8i16:
382; CHECK-SVE:       // %bb.0:
383; CHECK-SVE-NEXT:    ldr d0, [x0]
384; CHECK-SVE-NEXT:    ldr d1, [x1]
385; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v1.8b
386; CHECK-SVE-NEXT:    bic v0.8h, #255, lsl #8
387; CHECK-SVE-NEXT:    ret
388;
389; CHECK-GI-LABEL: amull_v8i8_v8i16:
390; CHECK-GI:       // %bb.0:
391; CHECK-GI-NEXT:    ldr d1, [x0]
392; CHECK-GI-NEXT:    ldr d2, [x1]
393; CHECK-GI-NEXT:    movi v0.2d, #0xff00ff00ff00ff
394; CHECK-GI-NEXT:    umull v1.8h, v1.8b, v2.8b
395; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
396; CHECK-GI-NEXT:    ret
397  %tmp1 = load <8 x i8>, ptr %A
398  %tmp2 = load <8 x i8>, ptr %B
399  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
400  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
401  %tmp5 = mul <8 x i16> %tmp3, %tmp4
402  %and = and <8 x i16> %tmp5, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
403  ret <8 x i16> %and
404}
405
; Widening multiply whose result is masked back to the source width:
; (zext(<4 x i16>) * zext(<4 x i16>)) & 65535 per i32 lane.
; Only the low 16 bits of each product survive the mask.
;  * NEON and SVE runs: expect `smull` followed by an `and` with a
;    movi-materialised 0x0000ffff lane mask.
;  * GlobalISel run: expects the same shape but with `umull`.
406define <4 x i32> @amull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
407; CHECK-NEON-LABEL: amull_v4i16_v4i32:
408; CHECK-NEON:       // %bb.0:
409; CHECK-NEON-NEXT:    ldr d1, [x0]
410; CHECK-NEON-NEXT:    ldr d2, [x1]
411; CHECK-NEON-NEXT:    movi v0.2d, #0x00ffff0000ffff
412; CHECK-NEON-NEXT:    smull v1.4s, v1.4h, v2.4h
413; CHECK-NEON-NEXT:    and v0.16b, v1.16b, v0.16b
414; CHECK-NEON-NEXT:    ret
415;
416; CHECK-SVE-LABEL: amull_v4i16_v4i32:
417; CHECK-SVE:       // %bb.0:
418; CHECK-SVE-NEXT:    ldr d1, [x0]
419; CHECK-SVE-NEXT:    ldr d2, [x1]
420; CHECK-SVE-NEXT:    movi v0.2d, #0x00ffff0000ffff
421; CHECK-SVE-NEXT:    smull v1.4s, v1.4h, v2.4h
422; CHECK-SVE-NEXT:    and v0.16b, v1.16b, v0.16b
423; CHECK-SVE-NEXT:    ret
424;
425; CHECK-GI-LABEL: amull_v4i16_v4i32:
426; CHECK-GI:       // %bb.0:
427; CHECK-GI-NEXT:    ldr d1, [x0]
428; CHECK-GI-NEXT:    ldr d2, [x1]
429; CHECK-GI-NEXT:    movi v0.2d, #0x00ffff0000ffff
430; CHECK-GI-NEXT:    umull v1.4s, v1.4h, v2.4h
431; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
432; CHECK-GI-NEXT:    ret
433  %tmp1 = load <4 x i16>, ptr %A
434  %tmp2 = load <4 x i16>, ptr %B
435  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
436  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
437  %tmp5 = mul <4 x i32> %tmp3, %tmp4
438  %and = and <4 x i32> %tmp5, <i32 65535, i32 65535, i32 65535, i32 65535>
439  ret <4 x i32> %and
440}
441
; Widening multiply whose result is masked back to the source width:
; (zext(<2 x i32>) * zext(<2 x i32>)) & 4294967295 per i64 lane.
; Only the low 32 bits of each product survive the mask.
;  * NEON and SVE runs: expect `smull` followed by an `and` with a
;    movi-materialised low-32-bit lane mask.
;  * GlobalISel run: expects the same shape but with `umull`.
442define <2 x i64> @amull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
443; CHECK-NEON-LABEL: amull_v2i32_v2i64:
444; CHECK-NEON:       // %bb.0:
445; CHECK-NEON-NEXT:    ldr d1, [x0]
446; CHECK-NEON-NEXT:    ldr d2, [x1]
447; CHECK-NEON-NEXT:    movi v0.2d, #0x000000ffffffff
448; CHECK-NEON-NEXT:    smull v1.2d, v1.2s, v2.2s
449; CHECK-NEON-NEXT:    and v0.16b, v1.16b, v0.16b
450; CHECK-NEON-NEXT:    ret
451;
452; CHECK-SVE-LABEL: amull_v2i32_v2i64:
453; CHECK-SVE:       // %bb.0:
454; CHECK-SVE-NEXT:    ldr d1, [x0]
455; CHECK-SVE-NEXT:    ldr d2, [x1]
456; CHECK-SVE-NEXT:    movi v0.2d, #0x000000ffffffff
457; CHECK-SVE-NEXT:    smull v1.2d, v1.2s, v2.2s
458; CHECK-SVE-NEXT:    and v0.16b, v1.16b, v0.16b
459; CHECK-SVE-NEXT:    ret
460;
461; CHECK-GI-LABEL: amull_v2i32_v2i64:
462; CHECK-GI:       // %bb.0:
463; CHECK-GI-NEXT:    ldr d1, [x0]
464; CHECK-GI-NEXT:    ldr d2, [x1]
465; CHECK-GI-NEXT:    movi v0.2d, #0x000000ffffffff
466; CHECK-GI-NEXT:    umull v1.2d, v1.2s, v2.2s
467; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
468; CHECK-GI-NEXT:    ret
469  %tmp1 = load <2 x i32>, ptr %A
470  %tmp2 = load <2 x i32>, ptr %B
471  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
472  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
473  %tmp5 = mul <2 x i64> %tmp3, %tmp4
474  %and = and <2 x i64> %tmp5, <i64 4294967295, i64 4294967295>
475  ret <2 x i64> %and
476}
477
; Signed widening multiply-accumulate, 8 x i8 -> 8 x i16.
; Computes %A + sext(%B) * sext(%C) with a <8 x i16> accumulator loaded from
; %A. Every RUN configuration expects the extends, mul and add to become a
; single `smlal v0.8h, v1.8b, v2.8b`.
478define <8 x i16> @smlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
479; CHECK-LABEL: smlal_v8i8_v8i16:
480; CHECK:       // %bb.0:
481; CHECK-NEXT:    ldr q0, [x0]
482; CHECK-NEXT:    ldr d1, [x1]
483; CHECK-NEXT:    ldr d2, [x2]
484; CHECK-NEXT:    smlal v0.8h, v1.8b, v2.8b
485; CHECK-NEXT:    ret
486  %tmp1 = load <8 x i16>, ptr %A
487  %tmp2 = load <8 x i8>, ptr %B
488  %tmp3 = load <8 x i8>, ptr %C
489  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
490  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
491  %tmp6 = mul <8 x i16> %tmp4, %tmp5
492  %tmp7 = add <8 x i16> %tmp1, %tmp6
493  ret <8 x i16> %tmp7
494}
495
; Signed widening multiply-accumulate, 4 x i16 -> 4 x i32.
; Computes %A + sext(%B) * sext(%C) with a <4 x i32> accumulator loaded from
; %A. Every RUN configuration expects a single
; `smlal v0.4s, v1.4h, v2.4h`.
496define <4 x i32> @smlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
497; CHECK-LABEL: smlal_v4i16_v4i32:
498; CHECK:       // %bb.0:
499; CHECK-NEXT:    ldr q0, [x0]
500; CHECK-NEXT:    ldr d1, [x1]
501; CHECK-NEXT:    ldr d2, [x2]
502; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
503; CHECK-NEXT:    ret
504  %tmp1 = load <4 x i32>, ptr %A
505  %tmp2 = load <4 x i16>, ptr %B
506  %tmp3 = load <4 x i16>, ptr %C
507  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
508  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
509  %tmp6 = mul <4 x i32> %tmp4, %tmp5
510  %tmp7 = add <4 x i32> %tmp1, %tmp6
511  ret <4 x i32> %tmp7
512}
513
; Signed widening multiply-accumulate, 2 x i32 -> 2 x i64.
; Computes %A + sext(%B) * sext(%C) with a <2 x i64> accumulator loaded from
; %A. Every RUN configuration expects a single
; `smlal v0.2d, v1.2s, v2.2s`.
514define <2 x i64> @smlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
515; CHECK-LABEL: smlal_v2i32_v2i64:
516; CHECK:       // %bb.0:
517; CHECK-NEXT:    ldr q0, [x0]
518; CHECK-NEXT:    ldr d1, [x1]
519; CHECK-NEXT:    ldr d2, [x2]
520; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
521; CHECK-NEXT:    ret
522  %tmp1 = load <2 x i64>, ptr %A
523  %tmp2 = load <2 x i32>, ptr %B
524  %tmp3 = load <2 x i32>, ptr %C
525  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
526  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
527  %tmp6 = mul <2 x i64> %tmp4, %tmp5
528  %tmp7 = add <2 x i64> %tmp1, %tmp6
529  ret <2 x i64> %tmp7
530}
531
; Unsigned widening multiply-accumulate, 8 x i8 -> 8 x i16.
; Computes %A + zext(%B) * zext(%C) with a <8 x i16> accumulator loaded from
; %A. Every RUN configuration expects a single
; `umlal v0.8h, v1.8b, v2.8b`.
532define <8 x i16> @umlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
533; CHECK-LABEL: umlal_v8i8_v8i16:
534; CHECK:       // %bb.0:
535; CHECK-NEXT:    ldr q0, [x0]
536; CHECK-NEXT:    ldr d1, [x1]
537; CHECK-NEXT:    ldr d2, [x2]
538; CHECK-NEXT:    umlal v0.8h, v1.8b, v2.8b
539; CHECK-NEXT:    ret
540  %tmp1 = load <8 x i16>, ptr %A
541  %tmp2 = load <8 x i8>, ptr %B
542  %tmp3 = load <8 x i8>, ptr %C
543  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
544  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
545  %tmp6 = mul <8 x i16> %tmp4, %tmp5
546  %tmp7 = add <8 x i16> %tmp1, %tmp6
547  ret <8 x i16> %tmp7
548}
549
; Unsigned widening multiply-accumulate, 4 x i16 -> 4 x i32.
; Computes %A + zext(%B) * zext(%C) with a <4 x i32> accumulator loaded from
; %A. Every RUN configuration expects a single
; `umlal v0.4s, v1.4h, v2.4h`.
550define <4 x i32> @umlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
551; CHECK-LABEL: umlal_v4i16_v4i32:
552; CHECK:       // %bb.0:
553; CHECK-NEXT:    ldr q0, [x0]
554; CHECK-NEXT:    ldr d1, [x1]
555; CHECK-NEXT:    ldr d2, [x2]
556; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.4h
557; CHECK-NEXT:    ret
558  %tmp1 = load <4 x i32>, ptr %A
559  %tmp2 = load <4 x i16>, ptr %B
560  %tmp3 = load <4 x i16>, ptr %C
561  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
562  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
563  %tmp6 = mul <4 x i32> %tmp4, %tmp5
564  %tmp7 = add <4 x i32> %tmp1, %tmp6
565  ret <4 x i32> %tmp7
566}
567
; Unsigned widening multiply-accumulate, 2 x i32 -> 2 x i64.
; Computes %A + zext(%B) * zext(%C) with a <2 x i64> accumulator loaded from
; %A. Every RUN configuration expects a single
; `umlal v0.2d, v1.2s, v2.2s`.
568define <2 x i64> @umlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
569; CHECK-LABEL: umlal_v2i32_v2i64:
570; CHECK:       // %bb.0:
571; CHECK-NEXT:    ldr q0, [x0]
572; CHECK-NEXT:    ldr d1, [x1]
573; CHECK-NEXT:    ldr d2, [x2]
574; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.2s
575; CHECK-NEXT:    ret
576  %tmp1 = load <2 x i64>, ptr %A
577  %tmp2 = load <2 x i32>, ptr %B
578  %tmp3 = load <2 x i32>, ptr %C
579  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
580  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
581  %tmp6 = mul <2 x i64> %tmp4, %tmp5
582  %tmp7 = add <2 x i64> %tmp1, %tmp6
583  ret <2 x i64> %tmp7
584}
585
; Widening multiply-accumulate whose result is masked back to 8 bits:
; (%A + zext(%B) * zext(%C)) & 255 per i16 lane.
; Only the low 8 bits of each lane survive the mask.
;  * NEON and SVE runs: expect `smlal` followed by
;    `bic v0.8h, #255, lsl #8`.
;  * GlobalISel run: expects `umlal` followed by an `and` with a movi mask.
586define <8 x i16> @amlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
587; CHECK-NEON-LABEL: amlal_v8i8_v8i16:
588; CHECK-NEON:       // %bb.0:
589; CHECK-NEON-NEXT:    ldr q0, [x0]
590; CHECK-NEON-NEXT:    ldr d1, [x1]
591; CHECK-NEON-NEXT:    ldr d2, [x2]
592; CHECK-NEON-NEXT:    smlal v0.8h, v1.8b, v2.8b
593; CHECK-NEON-NEXT:    bic v0.8h, #255, lsl #8
594; CHECK-NEON-NEXT:    ret
595;
596; CHECK-SVE-LABEL: amlal_v8i8_v8i16:
597; CHECK-SVE:       // %bb.0:
598; CHECK-SVE-NEXT:    ldr q0, [x0]
599; CHECK-SVE-NEXT:    ldr d1, [x1]
600; CHECK-SVE-NEXT:    ldr d2, [x2]
601; CHECK-SVE-NEXT:    smlal v0.8h, v1.8b, v2.8b
602; CHECK-SVE-NEXT:    bic v0.8h, #255, lsl #8
603; CHECK-SVE-NEXT:    ret
604;
605; CHECK-GI-LABEL: amlal_v8i8_v8i16:
606; CHECK-GI:       // %bb.0:
607; CHECK-GI-NEXT:    ldr q0, [x0]
608; CHECK-GI-NEXT:    ldr d1, [x1]
609; CHECK-GI-NEXT:    movi v3.2d, #0xff00ff00ff00ff
610; CHECK-GI-NEXT:    ldr d2, [x2]
611; CHECK-GI-NEXT:    umlal v0.8h, v1.8b, v2.8b
612; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
613; CHECK-GI-NEXT:    ret
614  %tmp1 = load <8 x i16>, ptr %A
615  %tmp2 = load <8 x i8>, ptr %B
616  %tmp3 = load <8 x i8>, ptr %C
617  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
618  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
619  %tmp6 = mul <8 x i16> %tmp4, %tmp5
620  %tmp7 = add <8 x i16> %tmp1, %tmp6
621  %and = and <8 x i16> %tmp7, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
622  ret <8 x i16> %and
623}
624
; Widening multiply-accumulate whose result is masked back to 16 bits:
; (%A + zext(%B) * zext(%C)) & 65535 per i32 lane.
; Only the low 16 bits of each lane survive the mask.
;  * NEON and SVE runs: expect `smlal`, then a movi-materialised mask and
;    an `and`.
;  * GlobalISel run: expects `umlal` with the mask materialised earlier in
;    v3.
625define <4 x i32> @amlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
626; CHECK-NEON-LABEL: amlal_v4i16_v4i32:
627; CHECK-NEON:       // %bb.0:
628; CHECK-NEON-NEXT:    ldr q0, [x0]
629; CHECK-NEON-NEXT:    ldr d1, [x1]
630; CHECK-NEON-NEXT:    ldr d2, [x2]
631; CHECK-NEON-NEXT:    smlal v0.4s, v1.4h, v2.4h
632; CHECK-NEON-NEXT:    movi v1.2d, #0x00ffff0000ffff
633; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
634; CHECK-NEON-NEXT:    ret
635;
636; CHECK-SVE-LABEL: amlal_v4i16_v4i32:
637; CHECK-SVE:       // %bb.0:
638; CHECK-SVE-NEXT:    ldr q0, [x0]
639; CHECK-SVE-NEXT:    ldr d1, [x1]
640; CHECK-SVE-NEXT:    ldr d2, [x2]
641; CHECK-SVE-NEXT:    smlal v0.4s, v1.4h, v2.4h
642; CHECK-SVE-NEXT:    movi v1.2d, #0x00ffff0000ffff
643; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
644; CHECK-SVE-NEXT:    ret
645;
646; CHECK-GI-LABEL: amlal_v4i16_v4i32:
647; CHECK-GI:       // %bb.0:
648; CHECK-GI-NEXT:    ldr q0, [x0]
649; CHECK-GI-NEXT:    ldr d1, [x1]
650; CHECK-GI-NEXT:    movi v3.2d, #0x00ffff0000ffff
651; CHECK-GI-NEXT:    ldr d2, [x2]
652; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.4h
653; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
654; CHECK-GI-NEXT:    ret
655  %tmp1 = load <4 x i32>, ptr %A
656  %tmp2 = load <4 x i16>, ptr %B
657  %tmp3 = load <4 x i16>, ptr %C
658  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
659  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
660  %tmp6 = mul <4 x i32> %tmp4, %tmp5
661  %tmp7 = add <4 x i32> %tmp1, %tmp6
662  %and = and <4 x i32> %tmp7, <i32 65535, i32 65535, i32 65535, i32 65535>
663  ret <4 x i32> %and
664}
665
; Widening multiply-accumulate whose result is masked back to 32 bits:
; (%A + zext(%B) * zext(%C)) & 4294967295 per i64 lane.
; Only the low 32 bits of each lane survive the mask.
;  * NEON and SVE runs: expect `smlal`, then a movi-materialised mask and
;    an `and`.
;  * GlobalISel run: expects `umlal` with the mask materialised earlier in
;    v3.
666define <2 x i64> @amlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
667; CHECK-NEON-LABEL: amlal_v2i32_v2i64:
668; CHECK-NEON:       // %bb.0:
669; CHECK-NEON-NEXT:    ldr q0, [x0]
670; CHECK-NEON-NEXT:    ldr d1, [x1]
671; CHECK-NEON-NEXT:    ldr d2, [x2]
672; CHECK-NEON-NEXT:    smlal v0.2d, v1.2s, v2.2s
673; CHECK-NEON-NEXT:    movi v1.2d, #0x000000ffffffff
674; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
675; CHECK-NEON-NEXT:    ret
676;
677; CHECK-SVE-LABEL: amlal_v2i32_v2i64:
678; CHECK-SVE:       // %bb.0:
679; CHECK-SVE-NEXT:    ldr q0, [x0]
680; CHECK-SVE-NEXT:    ldr d1, [x1]
681; CHECK-SVE-NEXT:    ldr d2, [x2]
682; CHECK-SVE-NEXT:    smlal v0.2d, v1.2s, v2.2s
683; CHECK-SVE-NEXT:    movi v1.2d, #0x000000ffffffff
684; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
685; CHECK-SVE-NEXT:    ret
686;
687; CHECK-GI-LABEL: amlal_v2i32_v2i64:
688; CHECK-GI:       // %bb.0:
689; CHECK-GI-NEXT:    ldr q0, [x0]
690; CHECK-GI-NEXT:    ldr d1, [x1]
691; CHECK-GI-NEXT:    movi v3.2d, #0x000000ffffffff
692; CHECK-GI-NEXT:    ldr d2, [x2]
693; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.2s
694; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
695; CHECK-GI-NEXT:    ret
696  %tmp1 = load <2 x i64>, ptr %A
697  %tmp2 = load <2 x i32>, ptr %B
698  %tmp3 = load <2 x i32>, ptr %C
699  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
700  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
701  %tmp6 = mul <2 x i64> %tmp4, %tmp5
702  %tmp7 = add <2 x i64> %tmp1, %tmp6
703  %and = and <2 x i64> %tmp7, <i64 4294967295, i64 4294967295>
704  ret <2 x i64> %and
705}
706
; Signed widening multiply-subtract, 8 x i8 -> 8 x i16.
; Computes %A - sext(%B) * sext(%C) with a <8 x i16> accumulator loaded from
; %A. Every RUN configuration expects a single
; `smlsl v0.8h, v1.8b, v2.8b`.
707define <8 x i16> @smlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
708; CHECK-LABEL: smlsl_v8i8_v8i16:
709; CHECK:       // %bb.0:
710; CHECK-NEXT:    ldr q0, [x0]
711; CHECK-NEXT:    ldr d1, [x1]
712; CHECK-NEXT:    ldr d2, [x2]
713; CHECK-NEXT:    smlsl v0.8h, v1.8b, v2.8b
714; CHECK-NEXT:    ret
715  %tmp1 = load <8 x i16>, ptr %A
716  %tmp2 = load <8 x i8>, ptr %B
717  %tmp3 = load <8 x i8>, ptr %C
718  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
719  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
720  %tmp6 = mul <8 x i16> %tmp4, %tmp5
721  %tmp7 = sub <8 x i16> %tmp1, %tmp6
722  ret <8 x i16> %tmp7
723}
724
; Signed widening multiply-subtract, 4 x i16 -> 4 x i32.
; Computes %A - sext(%B) * sext(%C) with a <4 x i32> accumulator loaded from
; %A. Every RUN configuration expects a single
; `smlsl v0.4s, v1.4h, v2.4h`.
725define <4 x i32> @smlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
726; CHECK-LABEL: smlsl_v4i16_v4i32:
727; CHECK:       // %bb.0:
728; CHECK-NEXT:    ldr q0, [x0]
729; CHECK-NEXT:    ldr d1, [x1]
730; CHECK-NEXT:    ldr d2, [x2]
731; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.4h
732; CHECK-NEXT:    ret
733  %tmp1 = load <4 x i32>, ptr %A
734  %tmp2 = load <4 x i16>, ptr %B
735  %tmp3 = load <4 x i16>, ptr %C
736  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
737  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
738  %tmp6 = mul <4 x i32> %tmp4, %tmp5
739  %tmp7 = sub <4 x i32> %tmp1, %tmp6
740  ret <4 x i32> %tmp7
741}
742
; Signed widening multiply-subtract, 2 x i32 -> 2 x i64.
; Computes %A - sext(%B) * sext(%C) with a <2 x i64> accumulator loaded from
; %A. Every RUN configuration expects a single
; `smlsl v0.2d, v1.2s, v2.2s`.
743define <2 x i64> @smlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
744; CHECK-LABEL: smlsl_v2i32_v2i64:
745; CHECK:       // %bb.0:
746; CHECK-NEXT:    ldr q0, [x0]
747; CHECK-NEXT:    ldr d1, [x1]
748; CHECK-NEXT:    ldr d2, [x2]
749; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.2s
750; CHECK-NEXT:    ret
751  %tmp1 = load <2 x i64>, ptr %A
752  %tmp2 = load <2 x i32>, ptr %B
753  %tmp3 = load <2 x i32>, ptr %C
754  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
755  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
756  %tmp6 = mul <2 x i64> %tmp4, %tmp5
757  %tmp7 = sub <2 x i64> %tmp1, %tmp6
758  ret <2 x i64> %tmp7
759}
760
; Unsigned widening multiply-subtract, 8 x i8 -> 8 x i16.
; Computes %A - zext(%B) * zext(%C) with a <8 x i16> accumulator loaded from
; %A. Every RUN configuration expects a single
; `umlsl v0.8h, v1.8b, v2.8b`.
761define <8 x i16> @umlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
762; CHECK-LABEL: umlsl_v8i8_v8i16:
763; CHECK:       // %bb.0:
764; CHECK-NEXT:    ldr q0, [x0]
765; CHECK-NEXT:    ldr d1, [x1]
766; CHECK-NEXT:    ldr d2, [x2]
767; CHECK-NEXT:    umlsl v0.8h, v1.8b, v2.8b
768; CHECK-NEXT:    ret
769  %tmp1 = load <8 x i16>, ptr %A
770  %tmp2 = load <8 x i8>, ptr %B
771  %tmp3 = load <8 x i8>, ptr %C
772  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
773  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
774  %tmp6 = mul <8 x i16> %tmp4, %tmp5
775  %tmp7 = sub <8 x i16> %tmp1, %tmp6
776  ret <8 x i16> %tmp7
777}
778
; Unsigned widening multiply-subtract, 4 x i16 -> 4 x i32.
; Computes %A - zext(%B) * zext(%C) with a <4 x i32> accumulator loaded from
; %A. Every RUN configuration expects a single
; `umlsl v0.4s, v1.4h, v2.4h`.
779define <4 x i32> @umlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
780; CHECK-LABEL: umlsl_v4i16_v4i32:
781; CHECK:       // %bb.0:
782; CHECK-NEXT:    ldr q0, [x0]
783; CHECK-NEXT:    ldr d1, [x1]
784; CHECK-NEXT:    ldr d2, [x2]
785; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.4h
786; CHECK-NEXT:    ret
787  %tmp1 = load <4 x i32>, ptr %A
788  %tmp2 = load <4 x i16>, ptr %B
789  %tmp3 = load <4 x i16>, ptr %C
790  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
791  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
792  %tmp6 = mul <4 x i32> %tmp4, %tmp5
793  %tmp7 = sub <4 x i32> %tmp1, %tmp6
794  ret <4 x i32> %tmp7
795}
796
; Unsigned widening multiply-subtract, 2 x i32 -> 2 x i64.
; Computes %A - zext(%B) * zext(%C) with a <2 x i64> accumulator loaded from
; %A. Every RUN configuration expects a single
; `umlsl v0.2d, v1.2s, v2.2s`.
797define <2 x i64> @umlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
798; CHECK-LABEL: umlsl_v2i32_v2i64:
799; CHECK:       // %bb.0:
800; CHECK-NEXT:    ldr q0, [x0]
801; CHECK-NEXT:    ldr d1, [x1]
802; CHECK-NEXT:    ldr d2, [x2]
803; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.2s
804; CHECK-NEXT:    ret
805  %tmp1 = load <2 x i64>, ptr %A
806  %tmp2 = load <2 x i32>, ptr %B
807  %tmp3 = load <2 x i32>, ptr %C
808  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
809  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
810  %tmp6 = mul <2 x i64> %tmp4, %tmp5
811  %tmp7 = sub <2 x i64> %tmp1, %tmp6
812  ret <2 x i64> %tmp7
813}
814
; Widening multiply-subtract whose result is masked back to 8 bits:
; (%A - zext(%B) * zext(%C)) & 255 per i16 lane.
; Only the low 8 bits of each lane survive the mask.
;  * NEON and SVE runs: expect `smlsl` followed by
;    `bic v0.8h, #255, lsl #8`.
;  * GlobalISel run: expects `umlsl` followed by an `and` with a movi mask.
815define <8 x i16> @amlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
816; CHECK-NEON-LABEL: amlsl_v8i8_v8i16:
817; CHECK-NEON:       // %bb.0:
818; CHECK-NEON-NEXT:    ldr q0, [x0]
819; CHECK-NEON-NEXT:    ldr d1, [x1]
820; CHECK-NEON-NEXT:    ldr d2, [x2]
821; CHECK-NEON-NEXT:    smlsl v0.8h, v1.8b, v2.8b
822; CHECK-NEON-NEXT:    bic v0.8h, #255, lsl #8
823; CHECK-NEON-NEXT:    ret
824;
825; CHECK-SVE-LABEL: amlsl_v8i8_v8i16:
826; CHECK-SVE:       // %bb.0:
827; CHECK-SVE-NEXT:    ldr q0, [x0]
828; CHECK-SVE-NEXT:    ldr d1, [x1]
829; CHECK-SVE-NEXT:    ldr d2, [x2]
830; CHECK-SVE-NEXT:    smlsl v0.8h, v1.8b, v2.8b
831; CHECK-SVE-NEXT:    bic v0.8h, #255, lsl #8
832; CHECK-SVE-NEXT:    ret
833;
834; CHECK-GI-LABEL: amlsl_v8i8_v8i16:
835; CHECK-GI:       // %bb.0:
836; CHECK-GI-NEXT:    ldr q0, [x0]
837; CHECK-GI-NEXT:    ldr d1, [x1]
838; CHECK-GI-NEXT:    movi v3.2d, #0xff00ff00ff00ff
839; CHECK-GI-NEXT:    ldr d2, [x2]
840; CHECK-GI-NEXT:    umlsl v0.8h, v1.8b, v2.8b
841; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
842; CHECK-GI-NEXT:    ret
843  %tmp1 = load <8 x i16>, ptr %A
844  %tmp2 = load <8 x i8>, ptr %B
845  %tmp3 = load <8 x i8>, ptr %C
846  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
847  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
848  %tmp6 = mul <8 x i16> %tmp4, %tmp5
849  %tmp7 = sub <8 x i16> %tmp1, %tmp6
850  %and = and <8 x i16> %tmp7, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
851  ret <8 x i16> %and
852}
853
; Widening multiply-subtract whose result is masked back to 16 bits:
; (%A - zext(%B) * zext(%C)) & 65535 per i32 lane.
; Only the low 16 bits of each lane survive the mask.
;  * NEON and SVE runs: expect `smlsl`, then a movi-materialised mask and
;    an `and`.
;  * GlobalISel run: expects `umlsl` with the mask materialised earlier in
;    v3.
854define <4 x i32> @amlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
855; CHECK-NEON-LABEL: amlsl_v4i16_v4i32:
856; CHECK-NEON:       // %bb.0:
857; CHECK-NEON-NEXT:    ldr q0, [x0]
858; CHECK-NEON-NEXT:    ldr d1, [x1]
859; CHECK-NEON-NEXT:    ldr d2, [x2]
860; CHECK-NEON-NEXT:    smlsl v0.4s, v1.4h, v2.4h
861; CHECK-NEON-NEXT:    movi v1.2d, #0x00ffff0000ffff
862; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
863; CHECK-NEON-NEXT:    ret
864;
865; CHECK-SVE-LABEL: amlsl_v4i16_v4i32:
866; CHECK-SVE:       // %bb.0:
867; CHECK-SVE-NEXT:    ldr q0, [x0]
868; CHECK-SVE-NEXT:    ldr d1, [x1]
869; CHECK-SVE-NEXT:    ldr d2, [x2]
870; CHECK-SVE-NEXT:    smlsl v0.4s, v1.4h, v2.4h
871; CHECK-SVE-NEXT:    movi v1.2d, #0x00ffff0000ffff
872; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
873; CHECK-SVE-NEXT:    ret
874;
875; CHECK-GI-LABEL: amlsl_v4i16_v4i32:
876; CHECK-GI:       // %bb.0:
877; CHECK-GI-NEXT:    ldr q0, [x0]
878; CHECK-GI-NEXT:    ldr d1, [x1]
879; CHECK-GI-NEXT:    movi v3.2d, #0x00ffff0000ffff
880; CHECK-GI-NEXT:    ldr d2, [x2]
881; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.4h
882; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
883; CHECK-GI-NEXT:    ret
884  %tmp1 = load <4 x i32>, ptr %A
885  %tmp2 = load <4 x i16>, ptr %B
886  %tmp3 = load <4 x i16>, ptr %C
887  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
888  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
889  %tmp6 = mul <4 x i32> %tmp4, %tmp5
890  %tmp7 = sub <4 x i32> %tmp1, %tmp6
891  %and = and <4 x i32> %tmp7, <i32 65535, i32 65535, i32 65535, i32 65535>
892  ret <4 x i32> %and
893}
894
895define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; Multiply-subtract whose result is masked to the low 32 bits of each lane:
; load(%A) - zext(load(%B)) * zext(load(%C)), then and with 4294967295.
; The NEON and SVE runs expect smlsl plus a movi/and mask; the GlobalISel run
; expects umlsl plus the same mask.
896; CHECK-NEON-LABEL: amlsl_v2i32_v2i64:
897; CHECK-NEON:       // %bb.0:
898; CHECK-NEON-NEXT:    ldr q0, [x0]
899; CHECK-NEON-NEXT:    ldr d1, [x1]
900; CHECK-NEON-NEXT:    ldr d2, [x2]
901; CHECK-NEON-NEXT:    smlsl v0.2d, v1.2s, v2.2s
902; CHECK-NEON-NEXT:    movi v1.2d, #0x000000ffffffff
903; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
904; CHECK-NEON-NEXT:    ret
905;
906; CHECK-SVE-LABEL: amlsl_v2i32_v2i64:
907; CHECK-SVE:       // %bb.0:
908; CHECK-SVE-NEXT:    ldr q0, [x0]
909; CHECK-SVE-NEXT:    ldr d1, [x1]
910; CHECK-SVE-NEXT:    ldr d2, [x2]
911; CHECK-SVE-NEXT:    smlsl v0.2d, v1.2s, v2.2s
912; CHECK-SVE-NEXT:    movi v1.2d, #0x000000ffffffff
913; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
914; CHECK-SVE-NEXT:    ret
915;
916; CHECK-GI-LABEL: amlsl_v2i32_v2i64:
917; CHECK-GI:       // %bb.0:
918; CHECK-GI-NEXT:    ldr q0, [x0]
919; CHECK-GI-NEXT:    ldr d1, [x1]
920; CHECK-GI-NEXT:    movi v3.2d, #0x000000ffffffff
921; CHECK-GI-NEXT:    ldr d2, [x2]
922; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.2s
923; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
924; CHECK-GI-NEXT:    ret
925  %tmp1 = load <2 x i64>, ptr %A
926  %tmp2 = load <2 x i32>, ptr %B
927  %tmp3 = load <2 x i32>, ptr %C
928  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
929  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
930  %tmp6 = mul <2 x i64> %tmp4, %tmp5
931  %tmp7 = sub <2 x i64> %tmp1, %tmp6
932  %and = and <2 x i64> %tmp7, <i64 4294967295, i64 4294967295>
933  ret <2 x i64> %and
934}
935
936; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
937define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; sext(%arg) multiplied by a splat of -12. The NEON and SVE runs expect the
; constant narrowed to an 8-bit splat (movi #244, i.e. -12 as a byte) feeding
; smull; the GlobalISel run expects sshll + a full-width mul.
938; CHECK-NEON-LABEL: smull_extvec_v8i8_v8i16:
939; CHECK-NEON:       // %bb.0:
940; CHECK-NEON-NEXT:    movi v1.8b, #244
941; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v1.8b
942; CHECK-NEON-NEXT:    ret
943;
944; CHECK-SVE-LABEL: smull_extvec_v8i8_v8i16:
945; CHECK-SVE:       // %bb.0:
946; CHECK-SVE-NEXT:    movi v1.8b, #244
947; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v1.8b
948; CHECK-SVE-NEXT:    ret
949;
950; CHECK-GI-LABEL: smull_extvec_v8i8_v8i16:
951; CHECK-GI:       // %bb.0:
952; CHECK-GI-NEXT:    mvni v1.8h, #11
953; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
954; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
955; CHECK-GI-NEXT:    ret
956  %tmp3 = sext <8 x i8> %arg to <8 x i16>
957  %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
958  ret <8 x i16> %tmp4
959}
960
961define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
962; Do not use SMULL if the BUILD_VECTOR element values are too big.
; Here the splat value -999 does not fit in a signed 8-bit element, so every
; run expects sshll followed by a full-width mul.
963; CHECK-NEON-LABEL: smull_noextvec_v8i8_v8i16:
964; CHECK-NEON:       // %bb.0:
965; CHECK-NEON-NEXT:    mov w8, #64537 // =0xfc19
966; CHECK-NEON-NEXT:    sshll v0.8h, v0.8b, #0
967; CHECK-NEON-NEXT:    dup v1.8h, w8
968; CHECK-NEON-NEXT:    mul v0.8h, v0.8h, v1.8h
969; CHECK-NEON-NEXT:    ret
970;
971; CHECK-SVE-LABEL: smull_noextvec_v8i8_v8i16:
972; CHECK-SVE:       // %bb.0:
973; CHECK-SVE-NEXT:    mov w8, #64537 // =0xfc19
974; CHECK-SVE-NEXT:    sshll v0.8h, v0.8b, #0
975; CHECK-SVE-NEXT:    dup v1.8h, w8
976; CHECK-SVE-NEXT:    mul v0.8h, v0.8h, v1.8h
977; CHECK-SVE-NEXT:    ret
978;
979; CHECK-GI-LABEL: smull_noextvec_v8i8_v8i16:
980; CHECK-GI:       // %bb.0:
981; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
982; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
983; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI34_0]
984; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
985; CHECK-GI-NEXT:    ret
986  %tmp3 = sext <8 x i8> %arg to <8 x i16>
987  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
988  ret <8 x i16> %tmp4
989}
990
991define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; sext(%arg) multiplied by a splat of -12. The NEON and SVE runs expect the
; constant built as a 16-bit splat (mvni #11) feeding smull; the GlobalISel
; run expects sshll + a full-width mul.
992; CHECK-NEON-LABEL: smull_extvec_v4i16_v4i32:
993; CHECK-NEON:       // %bb.0:
994; CHECK-NEON-NEXT:    mvni v1.4h, #11
995; CHECK-NEON-NEXT:    smull v0.4s, v0.4h, v1.4h
996; CHECK-NEON-NEXT:    ret
997;
998; CHECK-SVE-LABEL: smull_extvec_v4i16_v4i32:
999; CHECK-SVE:       // %bb.0:
1000; CHECK-SVE-NEXT:    mvni v1.4h, #11
1001; CHECK-SVE-NEXT:    smull v0.4s, v0.4h, v1.4h
1002; CHECK-SVE-NEXT:    ret
1003;
1004; CHECK-GI-LABEL: smull_extvec_v4i16_v4i32:
1005; CHECK-GI:       // %bb.0:
1006; CHECK-GI-NEXT:    mvni v1.4s, #11
1007; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
1008; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
1009; CHECK-GI-NEXT:    ret
1010  %tmp3 = sext <4 x i16> %arg to <4 x i32>
1011  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
1012  ret <4 x i32> %tmp4
1013}
1014
1015define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; sext(%arg) multiplied by a splat of -1234. The NEON and SVE runs expect a
; 32-bit splat (mov + dup) feeding smull. The GlobalISel run expects the
; constant loaded from a constant pool and each 64-bit lane multiplied in
; general-purpose registers, then reinserted.
1016; CHECK-NEON-LABEL: smull_extvec_v2i32_v2i64:
1017; CHECK-NEON:       // %bb.0:
1018; CHECK-NEON-NEXT:    mov w8, #-1234 // =0xfffffb2e
1019; CHECK-NEON-NEXT:    dup v1.2s, w8
1020; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
1021; CHECK-NEON-NEXT:    ret
1022;
1023; CHECK-SVE-LABEL: smull_extvec_v2i32_v2i64:
1024; CHECK-SVE:       // %bb.0:
1025; CHECK-SVE-NEXT:    mov w8, #-1234 // =0xfffffb2e
1026; CHECK-SVE-NEXT:    dup v1.2s, w8
1027; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
1028; CHECK-SVE-NEXT:    ret
1029;
1030; CHECK-GI-LABEL: smull_extvec_v2i32_v2i64:
1031; CHECK-GI:       // %bb.0:
1032; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
1033; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
1034; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI36_0]
1035; CHECK-GI-NEXT:    fmov x8, d0
1036; CHECK-GI-NEXT:    fmov x9, d1
1037; CHECK-GI-NEXT:    mov x10, v0.d[1]
1038; CHECK-GI-NEXT:    mov x11, v1.d[1]
1039; CHECK-GI-NEXT:    mul x8, x8, x9
1040; CHECK-GI-NEXT:    mul x9, x10, x11
1041; CHECK-GI-NEXT:    mov v0.d[0], x8
1042; CHECK-GI-NEXT:    mov v0.d[1], x9
1043; CHECK-GI-NEXT:    ret
1044  %tmp3 = sext <2 x i32> %arg to <2 x i64>
1045  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
1046  ret <2 x i64> %tmp4
1047}
1048
1049define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; zext(%arg) multiplied by a splat of 12. The NEON and SVE runs expect the
; constant narrowed to an 8-bit splat (movi #12) feeding umull; the GlobalISel
; run expects ushll + a full-width mul.
1050; CHECK-NEON-LABEL: umull_extvec_v8i8_v8i16:
1051; CHECK-NEON:       // %bb.0:
1052; CHECK-NEON-NEXT:    movi v1.8b, #12
1053; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
1054; CHECK-NEON-NEXT:    ret
1055;
1056; CHECK-SVE-LABEL: umull_extvec_v8i8_v8i16:
1057; CHECK-SVE:       // %bb.0:
1058; CHECK-SVE-NEXT:    movi v1.8b, #12
1059; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v1.8b
1060; CHECK-SVE-NEXT:    ret
1061;
1062; CHECK-GI-LABEL: umull_extvec_v8i8_v8i16:
1063; CHECK-GI:       // %bb.0:
1064; CHECK-GI-NEXT:    movi v1.8h, #12
1065; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
1066; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
1067; CHECK-GI-NEXT:    ret
1068  %tmp3 = zext <8 x i8> %arg to <8 x i16>
1069  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
1070  ret <8 x i16> %tmp4
1071}
1072
1073define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
1074; Do not use UMULL if the BUILD_VECTOR element values are too big.
; Here the splat value 999 does not fit in an unsigned 8-bit element, so every
; run expects ushll followed by a full-width mul.
1075; CHECK-NEON-LABEL: umull_noextvec_v8i8_v8i16:
1076; CHECK-NEON:       // %bb.0:
1077; CHECK-NEON-NEXT:    mov w8, #999 // =0x3e7
1078; CHECK-NEON-NEXT:    ushll v0.8h, v0.8b, #0
1079; CHECK-NEON-NEXT:    dup v1.8h, w8
1080; CHECK-NEON-NEXT:    mul v0.8h, v0.8h, v1.8h
1081; CHECK-NEON-NEXT:    ret
1082;
1083; CHECK-SVE-LABEL: umull_noextvec_v8i8_v8i16:
1084; CHECK-SVE:       // %bb.0:
1085; CHECK-SVE-NEXT:    mov w8, #999 // =0x3e7
1086; CHECK-SVE-NEXT:    ushll v0.8h, v0.8b, #0
1087; CHECK-SVE-NEXT:    dup v1.8h, w8
1088; CHECK-SVE-NEXT:    mul v0.8h, v0.8h, v1.8h
1089; CHECK-SVE-NEXT:    ret
1090;
1091; CHECK-GI-LABEL: umull_noextvec_v8i8_v8i16:
1092; CHECK-GI:       // %bb.0:
1093; CHECK-GI-NEXT:    adrp x8, .LCPI38_0
1094; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
1095; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI38_0]
1096; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
1097; CHECK-GI-NEXT:    ret
1098  %tmp3 = zext <8 x i8> %arg to <8 x i16>
1099  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
1100  ret <8 x i16> %tmp4
1101}
1102
1103define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; zext(%arg) multiplied by a splat of 1234. The NEON and SVE runs expect a
; 16-bit splat (mov + dup) feeding umull; the GlobalISel run expects the
; constant loaded from a constant pool, then ushll + a full-width mul.
1104; CHECK-NEON-LABEL: umull_extvec_v4i16_v4i32:
1105; CHECK-NEON:       // %bb.0:
1106; CHECK-NEON-NEXT:    mov w8, #1234 // =0x4d2
1107; CHECK-NEON-NEXT:    dup v1.4h, w8
1108; CHECK-NEON-NEXT:    umull v0.4s, v0.4h, v1.4h
1109; CHECK-NEON-NEXT:    ret
1110;
1111; CHECK-SVE-LABEL: umull_extvec_v4i16_v4i32:
1112; CHECK-SVE:       // %bb.0:
1113; CHECK-SVE-NEXT:    mov w8, #1234 // =0x4d2
1114; CHECK-SVE-NEXT:    dup v1.4h, w8
1115; CHECK-SVE-NEXT:    umull v0.4s, v0.4h, v1.4h
1116; CHECK-SVE-NEXT:    ret
1117;
1118; CHECK-GI-LABEL: umull_extvec_v4i16_v4i32:
1119; CHECK-GI:       // %bb.0:
1120; CHECK-GI-NEXT:    adrp x8, .LCPI39_0
1121; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
1122; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI39_0]
1123; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
1124; CHECK-GI-NEXT:    ret
1125  %tmp3 = zext <4 x i16> %arg to <4 x i32>
1126  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
1127  ret <4 x i32> %tmp4
1128}
1129
1130define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; zext(%arg) multiplied by a splat of 1234. The NEON and SVE runs expect a
; 32-bit splat (mov + dup) feeding umull. The GlobalISel run expects the
; constant loaded from a constant pool and each 64-bit lane multiplied in
; general-purpose registers, then reinserted.
1131; CHECK-NEON-LABEL: umull_extvec_v2i32_v2i64:
1132; CHECK-NEON:       // %bb.0:
1133; CHECK-NEON-NEXT:    mov w8, #1234 // =0x4d2
1134; CHECK-NEON-NEXT:    dup v1.2s, w8
1135; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
1136; CHECK-NEON-NEXT:    ret
1137;
1138; CHECK-SVE-LABEL: umull_extvec_v2i32_v2i64:
1139; CHECK-SVE:       // %bb.0:
1140; CHECK-SVE-NEXT:    mov w8, #1234 // =0x4d2
1141; CHECK-SVE-NEXT:    dup v1.2s, w8
1142; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
1143; CHECK-SVE-NEXT:    ret
1144;
1145; CHECK-GI-LABEL: umull_extvec_v2i32_v2i64:
1146; CHECK-GI:       // %bb.0:
1147; CHECK-GI-NEXT:    adrp x8, .LCPI40_0
1148; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
1149; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI40_0]
1150; CHECK-GI-NEXT:    fmov x8, d0
1151; CHECK-GI-NEXT:    fmov x9, d1
1152; CHECK-GI-NEXT:    mov x10, v0.d[1]
1153; CHECK-GI-NEXT:    mov x11, v1.d[1]
1154; CHECK-GI-NEXT:    mul x8, x8, x9
1155; CHECK-GI-NEXT:    mul x9, x10, x11
1156; CHECK-GI-NEXT:    mov v0.d[0], x8
1157; CHECK-GI-NEXT:    mov v0.d[1], x9
1158; CHECK-GI-NEXT:    ret
1159  %tmp3 = zext <2 x i32> %arg to <2 x i64>
1160  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
1161  ret <2 x i64> %tmp4
1162}
1163
1164define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; zext(%arg) multiplied by a splat of 12, with the product masked to the low
; 8 bits of each lane. The NEON and SVE runs expect smull on an 8-bit splat
; followed by a bic of the high byte; the GlobalISel run expects ushll + mul
; followed by an and with a materialised mask.
1165; CHECK-NEON-LABEL: amull_extvec_v8i8_v8i16:
1166; CHECK-NEON:       // %bb.0:
1167; CHECK-NEON-NEXT:    movi v1.8b, #12
1168; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v1.8b
1169; CHECK-NEON-NEXT:    bic v0.8h, #255, lsl #8
1170; CHECK-NEON-NEXT:    ret
1171;
1172; CHECK-SVE-LABEL: amull_extvec_v8i8_v8i16:
1173; CHECK-SVE:       // %bb.0:
1174; CHECK-SVE-NEXT:    movi v1.8b, #12
1175; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v1.8b
1176; CHECK-SVE-NEXT:    bic v0.8h, #255, lsl #8
1177; CHECK-SVE-NEXT:    ret
1178;
1179; CHECK-GI-LABEL: amull_extvec_v8i8_v8i16:
1180; CHECK-GI:       // %bb.0:
1181; CHECK-GI-NEXT:    movi v1.8h, #12
1182; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
1183; CHECK-GI-NEXT:    movi v2.2d, #0xff00ff00ff00ff
1184; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
1185; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
1186; CHECK-GI-NEXT:    ret
1187  %tmp3 = zext <8 x i8> %arg to <8 x i16>
1188  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
1189  %and = and <8 x i16> %tmp4, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
1190  ret <8 x i16> %and
1191}
1192
1193define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; zext(%arg) multiplied by a splat of 1234, with the product masked to the low
; 16 bits of each lane. The NEON and SVE runs expect smull on a 16-bit splat
; plus a movi/and mask; the GlobalISel run expects a constant-pool load,
; ushll + mul, and the same mask.
1194; CHECK-NEON-LABEL: amull_extvec_v4i16_v4i32:
1195; CHECK-NEON:       // %bb.0:
1196; CHECK-NEON-NEXT:    mov w8, #1234 // =0x4d2
1197; CHECK-NEON-NEXT:    dup v1.4h, w8
1198; CHECK-NEON-NEXT:    smull v0.4s, v0.4h, v1.4h
1199; CHECK-NEON-NEXT:    movi v1.2d, #0x00ffff0000ffff
1200; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
1201; CHECK-NEON-NEXT:    ret
1202;
1203; CHECK-SVE-LABEL: amull_extvec_v4i16_v4i32:
1204; CHECK-SVE:       // %bb.0:
1205; CHECK-SVE-NEXT:    mov w8, #1234 // =0x4d2
1206; CHECK-SVE-NEXT:    dup v1.4h, w8
1207; CHECK-SVE-NEXT:    smull v0.4s, v0.4h, v1.4h
1208; CHECK-SVE-NEXT:    movi v1.2d, #0x00ffff0000ffff
1209; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
1210; CHECK-SVE-NEXT:    ret
1211;
1212; CHECK-GI-LABEL: amull_extvec_v4i16_v4i32:
1213; CHECK-GI:       // %bb.0:
1214; CHECK-GI-NEXT:    adrp x8, .LCPI42_0
1215; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
1216; CHECK-GI-NEXT:    movi v2.2d, #0x00ffff0000ffff
1217; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI42_0]
1218; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
1219; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
1220; CHECK-GI-NEXT:    ret
1221  %tmp3 = zext <4 x i16> %arg to <4 x i32>
1222  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
1223  %and = and <4 x i32> %tmp4, <i32 65535, i32 65535, i32 65535, i32 65535>
1224  ret <4 x i32> %and
1225}
1226
1227define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; zext(%arg) multiplied by a splat of 1234, with the product masked to the low
; 32 bits of each lane. The NEON and SVE runs expect smull on a 32-bit splat
; plus a movi/and mask. The GlobalISel run expects each 64-bit lane multiplied
; in general-purpose registers, reinserted, and then masked.
1228; CHECK-NEON-LABEL: amull_extvec_v2i32_v2i64:
1229; CHECK-NEON:       // %bb.0:
1230; CHECK-NEON-NEXT:    mov w8, #1234 // =0x4d2
1231; CHECK-NEON-NEXT:    dup v1.2s, w8
1232; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
1233; CHECK-NEON-NEXT:    movi v1.2d, #0x000000ffffffff
1234; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
1235; CHECK-NEON-NEXT:    ret
1236;
1237; CHECK-SVE-LABEL: amull_extvec_v2i32_v2i64:
1238; CHECK-SVE:       // %bb.0:
1239; CHECK-SVE-NEXT:    mov w8, #1234 // =0x4d2
1240; CHECK-SVE-NEXT:    dup v1.2s, w8
1241; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
1242; CHECK-SVE-NEXT:    movi v1.2d, #0x000000ffffffff
1243; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
1244; CHECK-SVE-NEXT:    ret
1245;
1246; CHECK-GI-LABEL: amull_extvec_v2i32_v2i64:
1247; CHECK-GI:       // %bb.0:
1248; CHECK-GI-NEXT:    adrp x8, .LCPI43_0
1249; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
1250; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI43_0]
1251; CHECK-GI-NEXT:    fmov x8, d0
1252; CHECK-GI-NEXT:    fmov x9, d1
1253; CHECK-GI-NEXT:    mov x10, v0.d[1]
1254; CHECK-GI-NEXT:    mov x11, v1.d[1]
1255; CHECK-GI-NEXT:    movi v1.2d, #0x000000ffffffff
1256; CHECK-GI-NEXT:    mul x8, x8, x9
1257; CHECK-GI-NEXT:    mul x9, x10, x11
1258; CHECK-GI-NEXT:    mov v0.d[0], x8
1259; CHECK-GI-NEXT:    mov v0.d[1], x9
1260; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
1261; CHECK-GI-NEXT:    ret
1262  %tmp3 = zext <2 x i32> %arg to <2 x i64>
1263  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
1264  %and = and <2 x i64> %tmp4, <i64 4294967295, i64 4294967295>
1265  ret <2 x i64> %and
1266}
1267
1268define i16 @smullWithInconsistentExtensions(<8 x i8> %x, <8 x i8> %y) {
1269; If one operand has a zero-extend and the other a sign-extend, smull
1270; cannot be used.
; All runs share one expectation: sshll on %x, ushll on %y, a full-width mul,
; and lane 0 of the product returned via umov.
1271; CHECK-LABEL: smullWithInconsistentExtensions:
1272; CHECK:       // %bb.0:
1273; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
1274; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
1275; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
1276; CHECK-NEXT:    umov w0, v0.h[0]
1277; CHECK-NEXT:    ret
1278  %s = sext <8 x i8> %x to <8 x i16>
1279  %z = zext <8 x i8> %y to <8 x i16>
1280  %m = mul <8 x i16> %s, %z
1281  %r = extractelement <8 x i16> %m, i32 0
1282  ret i16 %r
1283}
1284
1285define <8 x i16> @smull_extended_vector_operand(<8 x i16> %v) {
; sext(%v) to <8 x i32>, multiplied by a splat of 35584 (139 << 8), shifted
; right by one and truncated back to <8 x i16>. 35584 is above the signed
; 16-bit maximum, and all runs expect sshll/sshll2 + full-width mul followed
; by shrn/shrn2 rather than smull.
1286; CHECK-LABEL: smull_extended_vector_operand:
1287; CHECK:       // %bb.0: // %entry
1288; CHECK-NEXT:    movi v1.4s, #139, lsl #8
1289; CHECK-NEXT:    sshll v2.4s, v0.4h, #0
1290; CHECK-NEXT:    sshll2 v0.4s, v0.8h, #0
1291; CHECK-NEXT:    mul v2.4s, v2.4s, v1.4s
1292; CHECK-NEXT:    mul v1.4s, v0.4s, v1.4s
1293; CHECK-NEXT:    shrn v0.4h, v2.4s, #1
1294; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
1295; CHECK-NEXT:    ret
1296entry:
1297%0 = sext <8 x i16> %v to <8 x i32>
1298%1 = mul <8 x i32> %0, <i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584>
1299%2 = lshr <8 x i32> %1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1300%3 = trunc <8 x i32> %2 to <8 x i16>
1301ret <8 x i16> %3
1302
1303}
1304
1305define void @distribute(ptr %dst, ptr %src, i32 %mul) nounwind {
; Loads <16 x i8> from %src, views it as <2 x double>, zero-extends element 1
; and element 0 (each as <8 x i8>) to <8 x i16>, adds them, multiplies the sum
; by a zero-extended splat of trunc(%mul), and stores the result to %dst.
; The NEON and SVE runs expect the multiply distributed over the add as
; umull + umlal; the GlobalISel run expects ushll + uaddw2 + a single mul.
1306; CHECK-NEON-LABEL: distribute:
1307; CHECK-NEON:       // %bb.0: // %entry
1308; CHECK-NEON-NEXT:    ldr q0, [x1]
1309; CHECK-NEON-NEXT:    dup v1.8b, w2
1310; CHECK-NEON-NEXT:    mov d2, v0.d[1]
1311; CHECK-NEON-NEXT:    umull v2.8h, v2.8b, v1.8b
1312; CHECK-NEON-NEXT:    umlal v2.8h, v0.8b, v1.8b
1313; CHECK-NEON-NEXT:    str q2, [x0]
1314; CHECK-NEON-NEXT:    ret
1315;
1316; CHECK-SVE-LABEL: distribute:
1317; CHECK-SVE:       // %bb.0: // %entry
1318; CHECK-SVE-NEXT:    ldr q0, [x1]
1319; CHECK-SVE-NEXT:    dup v1.8b, w2
1320; CHECK-SVE-NEXT:    mov d2, v0.d[1]
1321; CHECK-SVE-NEXT:    umull v2.8h, v2.8b, v1.8b
1322; CHECK-SVE-NEXT:    umlal v2.8h, v0.8b, v1.8b
1323; CHECK-SVE-NEXT:    str q2, [x0]
1324; CHECK-SVE-NEXT:    ret
1325;
1326; CHECK-GI-LABEL: distribute:
1327; CHECK-GI:       // %bb.0: // %entry
1328; CHECK-GI-NEXT:    ldr q0, [x1]
1329; CHECK-GI-NEXT:    dup v1.8b, w2
1330; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
1331; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
1332; CHECK-GI-NEXT:    uaddw2 v0.8h, v2.8h, v0.16b
1333; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
1334; CHECK-GI-NEXT:    str q0, [x0]
1335; CHECK-GI-NEXT:    ret
1336entry:
1337  %0 = trunc i32 %mul to i8
1338  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
1339  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
1340  %3 = load <16 x i8>, ptr %src, align 1
1341  %4 = bitcast <16 x i8> %3 to <2 x double>
1342  %5 = extractelement <2 x double> %4, i32 1
1343  %6 = bitcast double %5 to <8 x i8>
1344  %7 = zext <8 x i8> %6 to <8 x i16>
1345  %8 = zext <8 x i8> %2 to <8 x i16>
1346  %9 = extractelement <2 x double> %4, i32 0
1347  %10 = bitcast double %9 to <8 x i8>
1348  %11 = zext <8 x i8> %10 to <8 x i16>
1349  %12 = add <8 x i16> %7, %11
1350  %13 = mul <8 x i16> %12, %8
1351  store <8 x i16> %13, ptr %dst, align 2
1352  ret void
1353}
1354
1355define <16 x i16> @umull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; Zero-extends both <16 x i8> operands to <16 x i16> and multiplies them.
; Every run expects the product split into a umull / umull2 pair; the runs
; differ only in instruction order and register assignment.
1356; CHECK-NEON-LABEL: umull2_i8:
1357; CHECK-NEON:       // %bb.0:
1358; CHECK-NEON-NEXT:    umull2 v2.8h, v0.16b, v1.16b
1359; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
1360; CHECK-NEON-NEXT:    mov v1.16b, v2.16b
1361; CHECK-NEON-NEXT:    ret
1362;
1363; CHECK-SVE-LABEL: umull2_i8:
1364; CHECK-SVE:       // %bb.0:
1365; CHECK-SVE-NEXT:    umull2 v2.8h, v0.16b, v1.16b
1366; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v1.8b
1367; CHECK-SVE-NEXT:    mov v1.16b, v2.16b
1368; CHECK-SVE-NEXT:    ret
1369;
1370; CHECK-GI-LABEL: umull2_i8:
1371; CHECK-GI:       // %bb.0:
1372; CHECK-GI-NEXT:    umull v2.8h, v0.8b, v1.8b
1373; CHECK-GI-NEXT:    umull2 v1.8h, v0.16b, v1.16b
1374; CHECK-GI-NEXT:    mov v0.16b, v2.16b
1375; CHECK-GI-NEXT:    ret
1376  %arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
1377  %arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
1378  %mul = mul <16 x i16> %arg1_ext, %arg2_ext
1379  ret <16 x i16> %mul
1380}
1381
1382define <16 x i16> @smull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; Sign-extends both <16 x i8> operands to <16 x i16> and multiplies them.
; Every run expects the product split into an smull / smull2 pair; the runs
; differ only in instruction order and register assignment.
1383; CHECK-NEON-LABEL: smull2_i8:
1384; CHECK-NEON:       // %bb.0:
1385; CHECK-NEON-NEXT:    smull2 v2.8h, v0.16b, v1.16b
1386; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v1.8b
1387; CHECK-NEON-NEXT:    mov v1.16b, v2.16b
1388; CHECK-NEON-NEXT:    ret
1389;
1390; CHECK-SVE-LABEL: smull2_i8:
1391; CHECK-SVE:       // %bb.0:
1392; CHECK-SVE-NEXT:    smull2 v2.8h, v0.16b, v1.16b
1393; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v1.8b
1394; CHECK-SVE-NEXT:    mov v1.16b, v2.16b
1395; CHECK-SVE-NEXT:    ret
1396;
1397; CHECK-GI-LABEL: smull2_i8:
1398; CHECK-GI:       // %bb.0:
1399; CHECK-GI-NEXT:    smull v2.8h, v0.8b, v1.8b
1400; CHECK-GI-NEXT:    smull2 v1.8h, v0.16b, v1.16b
1401; CHECK-GI-NEXT:    mov v0.16b, v2.16b
1402; CHECK-GI-NEXT:    ret
1403  %arg1_ext = sext <16 x i8> %arg1 to <16 x i16>
1404  %arg2_ext = sext <16 x i8> %arg2 to <16 x i16>
1405  %mul = mul <16 x i16> %arg1_ext, %arg2_ext
1406  ret <16 x i16> %mul
1407}
1408
1409define <8 x i32> @umull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; Zero-extends both <8 x i16> operands to <8 x i32> and multiplies them.
; Every run expects the product split into a umull / umull2 pair; the runs
; differ only in instruction order and register assignment.
1410; CHECK-NEON-LABEL: umull2_i16:
1411; CHECK-NEON:       // %bb.0:
1412; CHECK-NEON-NEXT:    umull2 v2.4s, v0.8h, v1.8h
1413; CHECK-NEON-NEXT:    umull v0.4s, v0.4h, v1.4h
1414; CHECK-NEON-NEXT:    mov v1.16b, v2.16b
1415; CHECK-NEON-NEXT:    ret
1416;
1417; CHECK-SVE-LABEL: umull2_i16:
1418; CHECK-SVE:       // %bb.0:
1419; CHECK-SVE-NEXT:    umull2 v2.4s, v0.8h, v1.8h
1420; CHECK-SVE-NEXT:    umull v0.4s, v0.4h, v1.4h
1421; CHECK-SVE-NEXT:    mov v1.16b, v2.16b
1422; CHECK-SVE-NEXT:    ret
1423;
1424; CHECK-GI-LABEL: umull2_i16:
1425; CHECK-GI:       // %bb.0:
1426; CHECK-GI-NEXT:    umull v2.4s, v0.4h, v1.4h
1427; CHECK-GI-NEXT:    umull2 v1.4s, v0.8h, v1.8h
1428; CHECK-GI-NEXT:    mov v0.16b, v2.16b
1429; CHECK-GI-NEXT:    ret
1430  %arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
1431  %arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
1432  %mul = mul <8 x i32> %arg1_ext, %arg2_ext
1433  ret <8 x i32> %mul
1434}
1435
1436define <8 x i32> @smull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; Sign-extends both <8 x i16> operands to <8 x i32> and multiplies them.
; Every run expects the product split into an smull / smull2 pair; the runs
; differ only in instruction order and register assignment.
1437; CHECK-NEON-LABEL: smull2_i16:
1438; CHECK-NEON:       // %bb.0:
1439; CHECK-NEON-NEXT:    smull2 v2.4s, v0.8h, v1.8h
1440; CHECK-NEON-NEXT:    smull v0.4s, v0.4h, v1.4h
1441; CHECK-NEON-NEXT:    mov v1.16b, v2.16b
1442; CHECK-NEON-NEXT:    ret
1443;
1444; CHECK-SVE-LABEL: smull2_i16:
1445; CHECK-SVE:       // %bb.0:
1446; CHECK-SVE-NEXT:    smull2 v2.4s, v0.8h, v1.8h
1447; CHECK-SVE-NEXT:    smull v0.4s, v0.4h, v1.4h
1448; CHECK-SVE-NEXT:    mov v1.16b, v2.16b
1449; CHECK-SVE-NEXT:    ret
1450;
1451; CHECK-GI-LABEL: smull2_i16:
1452; CHECK-GI:       // %bb.0:
1453; CHECK-GI-NEXT:    smull v2.4s, v0.4h, v1.4h
1454; CHECK-GI-NEXT:    smull2 v1.4s, v0.8h, v1.8h
1455; CHECK-GI-NEXT:    mov v0.16b, v2.16b
1456; CHECK-GI-NEXT:    ret
1457  %arg1_ext = sext <8 x i16> %arg1 to <8 x i32>
1458  %arg2_ext = sext <8 x i16> %arg2 to <8 x i32>
1459  %mul = mul <8 x i32> %arg1_ext, %arg2_ext
1460  ret <8 x i32> %mul
1461}
1462
1463define <4 x i64> @umull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; Zero-extends both <4 x i32> operands to <4 x i64> and multiplies them.
; Every run expects the product split into a umull / umull2 pair; the runs
; differ only in instruction order and register assignment.
1464; CHECK-NEON-LABEL: umull2_i32:
1465; CHECK-NEON:       // %bb.0:
1466; CHECK-NEON-NEXT:    umull2 v2.2d, v0.4s, v1.4s
1467; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
1468; CHECK-NEON-NEXT:    mov v1.16b, v2.16b
1469; CHECK-NEON-NEXT:    ret
1470;
1471; CHECK-SVE-LABEL: umull2_i32:
1472; CHECK-SVE:       // %bb.0:
1473; CHECK-SVE-NEXT:    umull2 v2.2d, v0.4s, v1.4s
1474; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
1475; CHECK-SVE-NEXT:    mov v1.16b, v2.16b
1476; CHECK-SVE-NEXT:    ret
1477;
1478; CHECK-GI-LABEL: umull2_i32:
1479; CHECK-GI:       // %bb.0:
1480; CHECK-GI-NEXT:    umull v2.2d, v0.2s, v1.2s
1481; CHECK-GI-NEXT:    umull2 v1.2d, v0.4s, v1.4s
1482; CHECK-GI-NEXT:    mov v0.16b, v2.16b
1483; CHECK-GI-NEXT:    ret
1484  %arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
1485  %arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
1486  %mul = mul <4 x i64> %arg1_ext, %arg2_ext
1487  ret <4 x i64> %mul
1488}
1489
1490define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; Sign-extends both <4 x i32> operands to <4 x i64> and multiplies them.
; Every run expects the product split into an smull / smull2 pair; the runs
; differ only in instruction order and register assignment.
1491; CHECK-NEON-LABEL: smull2_i32:
1492; CHECK-NEON:       // %bb.0:
1493; CHECK-NEON-NEXT:    smull2 v2.2d, v0.4s, v1.4s
1494; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
1495; CHECK-NEON-NEXT:    mov v1.16b, v2.16b
1496; CHECK-NEON-NEXT:    ret
1497;
1498; CHECK-SVE-LABEL: smull2_i32:
1499; CHECK-SVE:       // %bb.0:
1500; CHECK-SVE-NEXT:    smull2 v2.2d, v0.4s, v1.4s
1501; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
1502; CHECK-SVE-NEXT:    mov v1.16b, v2.16b
1503; CHECK-SVE-NEXT:    ret
1504;
1505; CHECK-GI-LABEL: smull2_i32:
1506; CHECK-GI:       // %bb.0:
1507; CHECK-GI-NEXT:    smull v2.2d, v0.2s, v1.2s
1508; CHECK-GI-NEXT:    smull2 v1.2d, v0.4s, v1.4s
1509; CHECK-GI-NEXT:    mov v0.16b, v2.16b
1510; CHECK-GI-NEXT:    ret
1511  %arg1_ext = sext <4 x i32> %arg1 to <4 x i64>
1512  %arg2_ext = sext <4 x i32> %arg2 to <4 x i64>
1513  %mul = mul <4 x i64> %arg1_ext, %arg2_ext
1514  ret <4 x i64> %mul
1515}
1516
1517define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; Zero-extends both <16 x i8> operands to <16 x i16>, multiplies them, and
; masks each product lane to its low 8 bits. The NEON and SVE runs expect an
; smull / smull2 pair with a bic of the high byte on each half; the GlobalISel
; run expects a umull / umull2 pair with an and against a materialised mask.
1518; CHECK-NEON-LABEL: amull2_i8:
1519; CHECK-NEON:       // %bb.0:
1520; CHECK-NEON-NEXT:    smull v2.8h, v0.8b, v1.8b
1521; CHECK-NEON-NEXT:    smull2 v1.8h, v0.16b, v1.16b
1522; CHECK-NEON-NEXT:    bic v2.8h, #255, lsl #8
1523; CHECK-NEON-NEXT:    bic v1.8h, #255, lsl #8
1524; CHECK-NEON-NEXT:    mov v0.16b, v2.16b
1525; CHECK-NEON-NEXT:    ret
1526;
1527; CHECK-SVE-LABEL: amull2_i8:
1528; CHECK-SVE:       // %bb.0:
1529; CHECK-SVE-NEXT:    smull v2.8h, v0.8b, v1.8b
1530; CHECK-SVE-NEXT:    smull2 v1.8h, v0.16b, v1.16b
1531; CHECK-SVE-NEXT:    bic v2.8h, #255, lsl #8
1532; CHECK-SVE-NEXT:    bic v1.8h, #255, lsl #8
1533; CHECK-SVE-NEXT:    mov v0.16b, v2.16b
1534; CHECK-SVE-NEXT:    ret
1535;
1536; CHECK-GI-LABEL: amull2_i8:
1537; CHECK-GI:       // %bb.0:
1538; CHECK-GI-NEXT:    movi v2.2d, #0xff00ff00ff00ff
1539; CHECK-GI-NEXT:    umull v3.8h, v0.8b, v1.8b
1540; CHECK-GI-NEXT:    umull2 v1.8h, v0.16b, v1.16b
1541; CHECK-GI-NEXT:    and v0.16b, v3.16b, v2.16b
1542; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
1543; CHECK-GI-NEXT:    ret
1544  %arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
1545  %arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
1546  %mul = mul <16 x i16> %arg1_ext, %arg2_ext
1547  %and = and <16 x i16> %mul, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
1548  ret <16 x i16> %and
1549}
1550
1551define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; Zero-extends both <8 x i16> operands to <8 x i32>, multiplies them, and
; masks each product lane to its low 16 bits. The NEON and SVE runs expect an
; smull / smull2 pair plus a movi/and mask; the GlobalISel run expects a
; umull / umull2 pair plus the same mask.
1552; CHECK-NEON-LABEL: amull2_i16:
1553; CHECK-NEON:       // %bb.0:
1554; CHECK-NEON-NEXT:    movi v2.2d, #0x00ffff0000ffff
1555; CHECK-NEON-NEXT:    smull v3.4s, v0.4h, v1.4h
1556; CHECK-NEON-NEXT:    smull2 v0.4s, v0.8h, v1.8h
1557; CHECK-NEON-NEXT:    and v1.16b, v0.16b, v2.16b
1558; CHECK-NEON-NEXT:    and v0.16b, v3.16b, v2.16b
1559; CHECK-NEON-NEXT:    ret
1560;
1561; CHECK-SVE-LABEL: amull2_i16:
1562; CHECK-SVE:       // %bb.0:
1563; CHECK-SVE-NEXT:    movi v2.2d, #0x00ffff0000ffff
1564; CHECK-SVE-NEXT:    smull v3.4s, v0.4h, v1.4h
1565; CHECK-SVE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
1566; CHECK-SVE-NEXT:    and v1.16b, v0.16b, v2.16b
1567; CHECK-SVE-NEXT:    and v0.16b, v3.16b, v2.16b
1568; CHECK-SVE-NEXT:    ret
1569;
1570; CHECK-GI-LABEL: amull2_i16:
1571; CHECK-GI:       // %bb.0:
1572; CHECK-GI-NEXT:    movi v2.2d, #0x00ffff0000ffff
1573; CHECK-GI-NEXT:    umull v3.4s, v0.4h, v1.4h
1574; CHECK-GI-NEXT:    umull2 v1.4s, v0.8h, v1.8h
1575; CHECK-GI-NEXT:    and v0.16b, v3.16b, v2.16b
1576; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
1577; CHECK-GI-NEXT:    ret
1578  %arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
1579  %arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
1580  %mul = mul <8 x i32> %arg1_ext, %arg2_ext
1581  %and = and <8 x i32> %mul, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
1582  ret <8 x i32> %and
1583}
1584
1585define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; Zero-extends both <4 x i32> operands to <4 x i64>, multiplies them, and
; masks each product lane to its low 32 bits. The NEON and SVE runs expect an
; smull / smull2 pair plus a movi/and mask; the GlobalISel run expects a
; umull / umull2 pair plus the same mask.
1586; CHECK-NEON-LABEL: amull2_i32:
1587; CHECK-NEON:       // %bb.0:
1588; CHECK-NEON-NEXT:    movi v2.2d, #0x000000ffffffff
1589; CHECK-NEON-NEXT:    smull v3.2d, v0.2s, v1.2s
1590; CHECK-NEON-NEXT:    smull2 v0.2d, v0.4s, v1.4s
1591; CHECK-NEON-NEXT:    and v1.16b, v0.16b, v2.16b
1592; CHECK-NEON-NEXT:    and v0.16b, v3.16b, v2.16b
1593; CHECK-NEON-NEXT:    ret
1594;
1595; CHECK-SVE-LABEL: amull2_i32:
1596; CHECK-SVE:       // %bb.0:
1597; CHECK-SVE-NEXT:    movi v2.2d, #0x000000ffffffff
1598; CHECK-SVE-NEXT:    smull v3.2d, v0.2s, v1.2s
1599; CHECK-SVE-NEXT:    smull2 v0.2d, v0.4s, v1.4s
1600; CHECK-SVE-NEXT:    and v1.16b, v0.16b, v2.16b
1601; CHECK-SVE-NEXT:    and v0.16b, v3.16b, v2.16b
1602; CHECK-SVE-NEXT:    ret
1603;
1604; CHECK-GI-LABEL: amull2_i32:
1605; CHECK-GI:       // %bb.0:
1606; CHECK-GI-NEXT:    movi v2.2d, #0x000000ffffffff
1607; CHECK-GI-NEXT:    umull v3.2d, v0.2s, v1.2s
1608; CHECK-GI-NEXT:    umull2 v1.2d, v0.4s, v1.4s
1609; CHECK-GI-NEXT:    and v0.16b, v3.16b, v2.16b
1610; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
1611; CHECK-GI-NEXT:    ret
1612  %arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
1613  %arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
1614  %mul = mul <4 x i64> %arg1_ext, %arg2_ext
1615  %and = and <4 x i64> %mul, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1616  ret <4 x i64> %and
1617}
1618
1619
1620define <8 x i16> @umull_and_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
; Multiplies zext(%src1) by %src2 masked with 255 per lane, so the second
; operand also fits in 8 bits. The NEON and SVE runs expect the masked operand
; narrowed with xtn and fed to umull; the GlobalISel run expects ushll + and +
; a full-width mul.
1621; CHECK-NEON-LABEL: umull_and_v8i16:
1622; CHECK-NEON:       // %bb.0: // %entry
1623; CHECK-NEON-NEXT:    bic v1.8h, #255, lsl #8
1624; CHECK-NEON-NEXT:    xtn v1.8b, v1.8h
1625; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
1626; CHECK-NEON-NEXT:    ret
1627;
1628; CHECK-SVE-LABEL: umull_and_v8i16:
1629; CHECK-SVE:       // %bb.0: // %entry
1630; CHECK-SVE-NEXT:    bic v1.8h, #255, lsl #8
1631; CHECK-SVE-NEXT:    xtn v1.8b, v1.8h
1632; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v1.8b
1633; CHECK-SVE-NEXT:    ret
1634;
1635; CHECK-GI-LABEL: umull_and_v8i16:
1636; CHECK-GI:       // %bb.0: // %entry
1637; CHECK-GI-NEXT:    movi v2.2d, #0xff00ff00ff00ff
1638; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
1639; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
1640; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
1641; CHECK-GI-NEXT:    ret
1642entry:
1643  %in1 = zext <8 x i8> %src1 to <8 x i16>
1644  %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
1645  %out = mul nsw <8 x i16> %in1, %in2
1646  ret <8 x i16> %out
1647}
1648
1649define <8 x i16> @umull_and_v8i16_c(<8 x i8> %src1, <8 x i16> %src2) {
; Same inputs as umull_and_v8i16 but with the mul operands commuted (masked
; %src2 first, zext(%src1) second). The NEON and SVE runs still expect
; bic + xtn + umull, with the umull sources swapped to match; the GlobalISel
; run expects ushll + and + a full-width mul.
1650; CHECK-NEON-LABEL: umull_and_v8i16_c:
1651; CHECK-NEON:       // %bb.0: // %entry
1652; CHECK-NEON-NEXT:    bic v1.8h, #255, lsl #8
1653; CHECK-NEON-NEXT:    xtn v1.8b, v1.8h
1654; CHECK-NEON-NEXT:    umull v0.8h, v1.8b, v0.8b
1655; CHECK-NEON-NEXT:    ret
1656;
1657; CHECK-SVE-LABEL: umull_and_v8i16_c:
1658; CHECK-SVE:       // %bb.0: // %entry
1659; CHECK-SVE-NEXT:    bic v1.8h, #255, lsl #8
1660; CHECK-SVE-NEXT:    xtn v1.8b, v1.8h
1661; CHECK-SVE-NEXT:    umull v0.8h, v1.8b, v0.8b
1662; CHECK-SVE-NEXT:    ret
1663;
1664; CHECK-GI-LABEL: umull_and_v8i16_c:
1665; CHECK-GI:       // %bb.0: // %entry
1666; CHECK-GI-NEXT:    movi v2.2d, #0xff00ff00ff00ff
1667; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
1668; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
1669; CHECK-GI-NEXT:    mul v0.8h, v1.8h, v0.8h
1670; CHECK-GI-NEXT:    ret
1671entry:
1672  %in1 = zext <8 x i8> %src1 to <8 x i16>
1673  %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
1674  %out = mul nsw <8 x i16> %in2, %in1
1675  ret <8 x i16> %out
1676}
1677
; Same shape as umull_and_v8i16 but the mask is 256 (bit 8), so the masked
; operand is not confined to its low 8 bits. All three runs share one expected
; sequence: ushll + and + a full-width mul, with no umull.
1678define <8 x i16> @umull_and256_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
1679; CHECK-LABEL: umull_and256_v8i16:
1680; CHECK:       // %bb.0: // %entry
1681; CHECK-NEXT:    movi v2.8h, #1, lsl #8
1682; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
1683; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
1684; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
1685; CHECK-NEXT:    ret
1686entry:
1687  %in1 = zext <8 x i8> %src1 to <8 x i16>
1688  %in2 = and <8 x i16> %src2, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
1689  %out = mul nsw <8 x i16> %in1, %in2
1690  ret <8 x i16> %out
1691}
1692
; Multiplies a zero-extended <8 x i8> by a constant splat of 255; %src2 is not
; used by the body. The NEON and SVE runs expect an all-ones vector (movi) used
; as the 8-bit umull operand; the GlobalISel run expects ushll followed by a
; 16-bit mul against a 0x00ff splat.
1693define <8 x i16> @umull_andconst_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
1694; CHECK-NEON-LABEL: umull_andconst_v8i16:
1695; CHECK-NEON:       // %bb.0: // %entry
1696; CHECK-NEON-NEXT:    movi v1.2d, #0xffffffffffffffff
1697; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
1698; CHECK-NEON-NEXT:    ret
1699;
1700; CHECK-SVE-LABEL: umull_andconst_v8i16:
1701; CHECK-SVE:       // %bb.0: // %entry
1702; CHECK-SVE-NEXT:    movi v1.2d, #0xffffffffffffffff
1703; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v1.8b
1704; CHECK-SVE-NEXT:    ret
1705;
1706; CHECK-GI-LABEL: umull_andconst_v8i16:
1707; CHECK-GI:       // %bb.0: // %entry
1708; CHECK-GI-NEXT:    movi v1.2d, #0xff00ff00ff00ff
1709; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
1710; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
1711; CHECK-GI-NEXT:    ret
1712entry:
1713  %in1 = zext <8 x i8> %src1 to <8 x i16>
1714  %out = mul nsw <8 x i16> %in1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
1715  ret <8 x i16> %out
1716}
1717
; Variant where the extended operand is a sub-byte vector: <8 x i4> is
; zero-extended to <8 x i16> and multiplied by a <8 x i16> masked with 255.
; The NEON and SVE runs expect an extra and with #15 on the narrow operand
; before the umull; the GlobalISel run expects ushll, two ands and a mul.
1718define <8 x i16> @umull_smaller_v8i16(<8 x i4> %src1, <8 x i16> %src2) {
1719; CHECK-NEON-LABEL: umull_smaller_v8i16:
1720; CHECK-NEON:       // %bb.0: // %entry
1721; CHECK-NEON-NEXT:    movi v2.8b, #15
1722; CHECK-NEON-NEXT:    bic v1.8h, #255, lsl #8
1723; CHECK-NEON-NEXT:    xtn v1.8b, v1.8h
1724; CHECK-NEON-NEXT:    and v0.8b, v0.8b, v2.8b
1725; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
1726; CHECK-NEON-NEXT:    ret
1727;
1728; CHECK-SVE-LABEL: umull_smaller_v8i16:
1729; CHECK-SVE:       // %bb.0: // %entry
1730; CHECK-SVE-NEXT:    movi v2.8b, #15
1731; CHECK-SVE-NEXT:    bic v1.8h, #255, lsl #8
1732; CHECK-SVE-NEXT:    xtn v1.8b, v1.8h
1733; CHECK-SVE-NEXT:    and v0.8b, v0.8b, v2.8b
1734; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v1.8b
1735; CHECK-SVE-NEXT:    ret
1736;
1737; CHECK-GI-LABEL: umull_smaller_v8i16:
1738; CHECK-GI:       // %bb.0: // %entry
1739; CHECK-GI-NEXT:    movi v2.2d, #0xff00ff00ff00ff
1740; CHECK-GI-NEXT:    movi v3.8h, #15
1741; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
1742; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
1743; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
1744; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
1745; CHECK-GI-NEXT:    ret
1746entry:
1747  %in1 = zext <8 x i4> %src1 to <8 x i16>
1748  %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
1749  %out = mul nsw <8 x i16> %in1, %in2
1750  ret <8 x i16> %out
1751}
1752
; 32-bit result version of umull_and_v8i16: a zero-extended <4 x i16> times a
; <4 x i32> masked with 255. The NEON and SVE runs expect and + xtn + umull;
; the GlobalISel run expects ushll + and + a full-width mul.
1753define <4 x i32> @umull_and_v4i32(<4 x i16> %src1, <4 x i32> %src2) {
1754; CHECK-NEON-LABEL: umull_and_v4i32:
1755; CHECK-NEON:       // %bb.0: // %entry
1756; CHECK-NEON-NEXT:    movi v2.2d, #0x0000ff000000ff
1757; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v2.16b
1758; CHECK-NEON-NEXT:    xtn v1.4h, v1.4s
1759; CHECK-NEON-NEXT:    umull v0.4s, v0.4h, v1.4h
1760; CHECK-NEON-NEXT:    ret
1761;
1762; CHECK-SVE-LABEL: umull_and_v4i32:
1763; CHECK-SVE:       // %bb.0: // %entry
1764; CHECK-SVE-NEXT:    movi v2.2d, #0x0000ff000000ff
1765; CHECK-SVE-NEXT:    and v1.16b, v1.16b, v2.16b
1766; CHECK-SVE-NEXT:    xtn v1.4h, v1.4s
1767; CHECK-SVE-NEXT:    umull v0.4s, v0.4h, v1.4h
1768; CHECK-SVE-NEXT:    ret
1769;
1770; CHECK-GI-LABEL: umull_and_v4i32:
1771; CHECK-GI:       // %bb.0: // %entry
1772; CHECK-GI-NEXT:    movi v2.2d, #0x0000ff000000ff
1773; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
1774; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
1775; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
1776; CHECK-GI-NEXT:    ret
1777entry:
1778  %in1 = zext <4 x i16> %src1 to <4 x i32>
1779  %in2 = and <4 x i32> %src2, <i32 255, i32 255, i32 255, i32 255>
1780  %out = mul nsw <4 x i32> %in1, %in2
1781  ret <4 x i32> %out
1782}
1783
; <8 x i32> version whose result occupies two vector registers. The NEON and
; SVE runs expect both masked halves to be packed with uzp1 and then multiplied
; with a umull2/umull pair; the GlobalISel run expects ushll/ushll2, two ands
; and two full-width muls.
1784define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) {
1785; CHECK-NEON-LABEL: umull_and_v8i32:
1786; CHECK-NEON:       // %bb.0: // %entry
1787; CHECK-NEON-NEXT:    movi v3.2d, #0x0000ff000000ff
1788; CHECK-NEON-NEXT:    and v2.16b, v2.16b, v3.16b
1789; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v3.16b
1790; CHECK-NEON-NEXT:    uzp1 v2.8h, v1.8h, v2.8h
1791; CHECK-NEON-NEXT:    umull2 v1.4s, v0.8h, v2.8h
1792; CHECK-NEON-NEXT:    umull v0.4s, v0.4h, v2.4h
1793; CHECK-NEON-NEXT:    ret
1794;
1795; CHECK-SVE-LABEL: umull_and_v8i32:
1796; CHECK-SVE:       // %bb.0: // %entry
1797; CHECK-SVE-NEXT:    movi v3.2d, #0x0000ff000000ff
1798; CHECK-SVE-NEXT:    and v2.16b, v2.16b, v3.16b
1799; CHECK-SVE-NEXT:    and v1.16b, v1.16b, v3.16b
1800; CHECK-SVE-NEXT:    uzp1 v2.8h, v1.8h, v2.8h
1801; CHECK-SVE-NEXT:    umull2 v1.4s, v0.8h, v2.8h
1802; CHECK-SVE-NEXT:    umull v0.4s, v0.4h, v2.4h
1803; CHECK-SVE-NEXT:    ret
1804;
1805; CHECK-GI-LABEL: umull_and_v8i32:
1806; CHECK-GI:       // %bb.0: // %entry
1807; CHECK-GI-NEXT:    movi v3.2d, #0x0000ff000000ff
1808; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
1809; CHECK-GI-NEXT:    ushll2 v5.4s, v0.8h, #0
1810; CHECK-GI-NEXT:    and v0.16b, v1.16b, v3.16b
1811; CHECK-GI-NEXT:    and v1.16b, v2.16b, v3.16b
1812; CHECK-GI-NEXT:    mul v0.4s, v4.4s, v0.4s
1813; CHECK-GI-NEXT:    mul v1.4s, v5.4s, v1.4s
1814; CHECK-GI-NEXT:    ret
1815entry:
1816  %in1 = zext <8 x i16> %src1 to <8 x i32>
1817  %in2 = and <8 x i32> %src2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
1818  %out = mul nsw <8 x i32> %in1, %in2
1819  ret <8 x i32> %out
1820}
1821
; Splat variant of umull_and_v8i32: the scalar %src2 is masked with 255 and
; broadcast to all eight lanes before the multiply. The NEON and SVE runs
; expect the masked scalar to be dup'ed as 16-bit lanes and consumed by a
; umull2/umull pair; the GlobalISel run expects a 32-bit dup and two muls.
; The splat is built from poison placeholders (insertelement into poison,
; shufflevector with a poison second operand) rather than the deprecated undef.
1822define <8 x i32> @umull_and_v8i32_dup(<8 x i16> %src1, i32 %src2) {
1823; CHECK-NEON-LABEL: umull_and_v8i32_dup:
1824; CHECK-NEON:       // %bb.0: // %entry
1825; CHECK-NEON-NEXT:    and w8, w0, #0xff
1826; CHECK-NEON-NEXT:    dup v2.8h, w8
1827; CHECK-NEON-NEXT:    umull2 v1.4s, v0.8h, v2.8h
1828; CHECK-NEON-NEXT:    umull v0.4s, v0.4h, v2.4h
1829; CHECK-NEON-NEXT:    ret
1830;
1831; CHECK-SVE-LABEL: umull_and_v8i32_dup:
1832; CHECK-SVE:       // %bb.0: // %entry
1833; CHECK-SVE-NEXT:    and w8, w0, #0xff
1834; CHECK-SVE-NEXT:    dup v2.8h, w8
1835; CHECK-SVE-NEXT:    umull2 v1.4s, v0.8h, v2.8h
1836; CHECK-SVE-NEXT:    umull v0.4s, v0.4h, v2.4h
1837; CHECK-SVE-NEXT:    ret
1838;
1839; CHECK-GI-LABEL: umull_and_v8i32_dup:
1840; CHECK-GI:       // %bb.0: // %entry
1841; CHECK-GI-NEXT:    and w8, w0, #0xff
1842; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
1843; CHECK-GI-NEXT:    ushll2 v2.4s, v0.8h, #0
1844; CHECK-GI-NEXT:    dup v3.4s, w8
1845; CHECK-GI-NEXT:    mul v0.4s, v1.4s, v3.4s
1846; CHECK-GI-NEXT:    mul v1.4s, v2.4s, v3.4s
1847; CHECK-GI-NEXT:    ret
1848entry:
1849  %in1 = zext <8 x i16> %src1 to <8 x i32>
1850  %in2 = and i32 %src2, 255
1851  %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %in2, i64 0
1852  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
1853  %out = mul nsw <8 x i32> %in1, %broadcast.splat
1854  ret <8 x i32> %out
1855}
1856
; 64-bit result version: a zero-extended <2 x i32> times a <2 x i64> masked
; with 255. The NEON and SVE runs expect and + xtn + umull; the GlobalISel run
; expects the multiply to be done per lane in general-purpose registers
; (fmov/mov out, scalar mul, mov back in).
1857define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) {
1858; CHECK-NEON-LABEL: umull_and_v2i64:
1859; CHECK-NEON:       // %bb.0: // %entry
1860; CHECK-NEON-NEXT:    movi v2.2d, #0x000000000000ff
1861; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v2.16b
1862; CHECK-NEON-NEXT:    xtn v1.2s, v1.2d
1863; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
1864; CHECK-NEON-NEXT:    ret
1865;
1866; CHECK-SVE-LABEL: umull_and_v2i64:
1867; CHECK-SVE:       // %bb.0: // %entry
1868; CHECK-SVE-NEXT:    movi v2.2d, #0x000000000000ff
1869; CHECK-SVE-NEXT:    and v1.16b, v1.16b, v2.16b
1870; CHECK-SVE-NEXT:    xtn v1.2s, v1.2d
1871; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
1872; CHECK-SVE-NEXT:    ret
1873;
1874; CHECK-GI-LABEL: umull_and_v2i64:
1875; CHECK-GI:       // %bb.0: // %entry
1876; CHECK-GI-NEXT:    movi v2.2d, #0x000000000000ff
1877; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
1878; CHECK-GI-NEXT:    fmov x8, d0
1879; CHECK-GI-NEXT:    mov x10, v0.d[1]
1880; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
1881; CHECK-GI-NEXT:    fmov x9, d1
1882; CHECK-GI-NEXT:    mov x11, v1.d[1]
1883; CHECK-GI-NEXT:    mul x8, x8, x9
1884; CHECK-GI-NEXT:    mul x9, x10, x11
1885; CHECK-GI-NEXT:    mov v0.d[0], x8
1886; CHECK-GI-NEXT:    mov v0.d[1], x9
1887; CHECK-GI-NEXT:    ret
1888entry:
1889  %in1 = zext <2 x i32> %src1 to <2 x i64>
1890  %in2 = and <2 x i64> %src2, <i64 255, i64 255>
1891  %out = mul nsw <2 x i64> %in1, %in2
1892  ret <2 x i64> %out
1893}
1894
; <4 x i64> version whose result occupies two vector registers. The NEON and
; SVE runs expect both masked halves to be packed with uzp1 and multiplied
; with a umull2/umull pair; the GlobalISel run expects all four lanes to be
; multiplied individually in general-purpose registers.
1895define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) {
1896; CHECK-NEON-LABEL: umull_and_v4i64:
1897; CHECK-NEON:       // %bb.0: // %entry
1898; CHECK-NEON-NEXT:    movi v3.2d, #0x000000000000ff
1899; CHECK-NEON-NEXT:    and v2.16b, v2.16b, v3.16b
1900; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v3.16b
1901; CHECK-NEON-NEXT:    uzp1 v2.4s, v1.4s, v2.4s
1902; CHECK-NEON-NEXT:    umull2 v1.2d, v0.4s, v2.4s
1903; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v2.2s
1904; CHECK-NEON-NEXT:    ret
1905;
1906; CHECK-SVE-LABEL: umull_and_v4i64:
1907; CHECK-SVE:       // %bb.0: // %entry
1908; CHECK-SVE-NEXT:    movi v3.2d, #0x000000000000ff
1909; CHECK-SVE-NEXT:    and v2.16b, v2.16b, v3.16b
1910; CHECK-SVE-NEXT:    and v1.16b, v1.16b, v3.16b
1911; CHECK-SVE-NEXT:    uzp1 v2.4s, v1.4s, v2.4s
1912; CHECK-SVE-NEXT:    umull2 v1.2d, v0.4s, v2.4s
1913; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v2.2s
1914; CHECK-SVE-NEXT:    ret
1915;
1916; CHECK-GI-LABEL: umull_and_v4i64:
1917; CHECK-GI:       // %bb.0: // %entry
1918; CHECK-GI-NEXT:    movi v3.2d, #0x000000000000ff
1919; CHECK-GI-NEXT:    ushll v4.2d, v0.2s, #0
1920; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
1921; CHECK-GI-NEXT:    fmov x8, d4
1922; CHECK-GI-NEXT:    mov x10, v4.d[1]
1923; CHECK-GI-NEXT:    mov x13, v0.d[1]
1924; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
1925; CHECK-GI-NEXT:    and v2.16b, v2.16b, v3.16b
1926; CHECK-GI-NEXT:    fmov x9, d1
1927; CHECK-GI-NEXT:    fmov x12, d2
1928; CHECK-GI-NEXT:    mov x11, v1.d[1]
1929; CHECK-GI-NEXT:    mov x14, v2.d[1]
1930; CHECK-GI-NEXT:    mul x8, x8, x9
1931; CHECK-GI-NEXT:    fmov x9, d0
1932; CHECK-GI-NEXT:    mul x10, x10, x11
1933; CHECK-GI-NEXT:    mul x9, x9, x12
1934; CHECK-GI-NEXT:    mov v0.d[0], x8
1935; CHECK-GI-NEXT:    mul x11, x13, x14
1936; CHECK-GI-NEXT:    mov v1.d[0], x9
1937; CHECK-GI-NEXT:    mov v0.d[1], x10
1938; CHECK-GI-NEXT:    mov v1.d[1], x11
1939; CHECK-GI-NEXT:    ret
1940entry:
1941  %in1 = zext <4 x i32> %src1 to <4 x i64>
1942  %in2 = and <4 x i64> %src2, <i64 255, i64 255, i64 255, i64 255>
1943  %out = mul nsw <4 x i64> %in1, %in2
1944  ret <4 x i64> %out
1945}
1946
; Splat variant of umull_and_v4i64: the scalar %src2 is masked with 255 and
; broadcast to all four lanes before the multiply. The NEON and SVE runs expect
; the masked scalar to be dup'ed as 32-bit lanes and consumed by a umull2/umull
; pair; the GlobalISel run expects a 64-bit dup followed by per-lane scalar
; multiplies in general-purpose registers.
; The splat is built from poison placeholders (insertelement into poison,
; shufflevector with a poison second operand) rather than the deprecated undef.
1947define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) {
1948; CHECK-NEON-LABEL: umull_and_v4i64_dup:
1949; CHECK-NEON:       // %bb.0: // %entry
1950; CHECK-NEON-NEXT:    and w8, w0, #0xff
1951; CHECK-NEON-NEXT:    dup v2.4s, w8
1952; CHECK-NEON-NEXT:    umull2 v1.2d, v0.4s, v2.4s
1953; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v2.2s
1954; CHECK-NEON-NEXT:    ret
1955;
1956; CHECK-SVE-LABEL: umull_and_v4i64_dup:
1957; CHECK-SVE:       // %bb.0: // %entry
1958; CHECK-SVE-NEXT:    and w8, w0, #0xff
1959; CHECK-SVE-NEXT:    dup v2.4s, w8
1960; CHECK-SVE-NEXT:    umull2 v1.2d, v0.4s, v2.4s
1961; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v2.2s
1962; CHECK-SVE-NEXT:    ret
1963;
1964; CHECK-GI-LABEL: umull_and_v4i64_dup:
1965; CHECK-GI:       // %bb.0: // %entry
1966; CHECK-GI-NEXT:    and x8, x0, #0xff
1967; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
1968; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
1969; CHECK-GI-NEXT:    dup v2.2d, x8
1970; CHECK-GI-NEXT:    fmov x8, d1
1971; CHECK-GI-NEXT:    fmov x12, d0
1972; CHECK-GI-NEXT:    mov x10, v1.d[1]
1973; CHECK-GI-NEXT:    fmov x9, d2
1974; CHECK-GI-NEXT:    mov x11, v2.d[1]
1975; CHECK-GI-NEXT:    mov x13, v0.d[1]
1976; CHECK-GI-NEXT:    mul x8, x8, x9
1977; CHECK-GI-NEXT:    mul x9, x12, x9
1978; CHECK-GI-NEXT:    mul x10, x10, x11
1979; CHECK-GI-NEXT:    mov v0.d[0], x8
1980; CHECK-GI-NEXT:    mul x11, x13, x11
1981; CHECK-GI-NEXT:    mov v1.d[0], x9
1982; CHECK-GI-NEXT:    mov v0.d[1], x10
1983; CHECK-GI-NEXT:    mov v1.d[1], x11
1984; CHECK-GI-NEXT:    ret
1985entry:
1986  %in1 = zext <4 x i32> %src1 to <4 x i64>
1987  %in2 = and i64 %src2, 255
1988  %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %in2, i64 0
1989  %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer
1990  %out = mul nsw <4 x i64> %in1, %broadcast.splat
1991  ret <4 x i64> %out
1992}
1993
; Polynomial multiply of the high half of %0 by a truncated <8 x i16> loaded
; from 16 bytes past %3, subtracted from %1 and stored to %2. All runs share
; one expected sequence in which the truncate is done with uzp1 so the product
; can use pmull2, followed by a separate sub. The RUN header of this file
; expects a GlobalISel fallback warning for this function.
1994define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
1995; CHECK-LABEL: pmlsl2_v8i16_uzp1:
1996; CHECK:       // %bb.0:
1997; CHECK-NEXT:    ldr q2, [x1, #16]
1998; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
1999; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
2000; CHECK-NEXT:    sub v0.8h, v1.8h, v0.8h
2001; CHECK-NEXT:    str q0, [x0]
2002; CHECK-NEXT:    ret
2003  %5 = getelementptr inbounds i32, ptr %3, i64 4
2004  %6 = load <8 x i16>, ptr %5, align 4
2005  %7 = trunc <8 x i16> %6 to <8 x i8>
2006  %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2007  %9 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %8, <8 x i8> %7)
2008  %10 = sub <8 x i16> %1, %9
2009  store <8 x i16> %10, ptr %2, align 16
2010  ret void
2011}
2012
; Signed widening multiply of the high half of %0 by a truncated <8 x i16>
; loaded from 16 bytes past %3, subtracted from %1 and stored to %2. The NEON
; and SVE runs expect uzp1 feeding a single smlsl2; the GlobalISel run expects
; the high half to be extracted (mov d0, v0.d[1]), an xtn, and smlsl.
2013define void @smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
2014; CHECK-NEON-LABEL: smlsl2_v8i16_uzp1:
2015; CHECK-NEON:       // %bb.0:
2016; CHECK-NEON-NEXT:    ldr q2, [x1, #16]
2017; CHECK-NEON-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
2018; CHECK-NEON-NEXT:    smlsl2 v1.8h, v0.16b, v2.16b
2019; CHECK-NEON-NEXT:    str q1, [x0]
2020; CHECK-NEON-NEXT:    ret
2021;
2022; CHECK-SVE-LABEL: smlsl2_v8i16_uzp1:
2023; CHECK-SVE:       // %bb.0:
2024; CHECK-SVE-NEXT:    ldr q2, [x1, #16]
2025; CHECK-SVE-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
2026; CHECK-SVE-NEXT:    smlsl2 v1.8h, v0.16b, v2.16b
2027; CHECK-SVE-NEXT:    str q1, [x0]
2028; CHECK-SVE-NEXT:    ret
2029;
2030; CHECK-GI-LABEL: smlsl2_v8i16_uzp1:
2031; CHECK-GI:       // %bb.0:
2032; CHECK-GI-NEXT:    ldr q2, [x1, #16]
2033; CHECK-GI-NEXT:    mov d0, v0.d[1]
2034; CHECK-GI-NEXT:    xtn v2.8b, v2.8h
2035; CHECK-GI-NEXT:    smlsl v1.8h, v0.8b, v2.8b
2036; CHECK-GI-NEXT:    str q1, [x0]
2037; CHECK-GI-NEXT:    ret
2038  %5 = getelementptr inbounds i32, ptr %3, i64 4
2039  %6 = load <8 x i16>, ptr %5, align 4
2040  %7 = trunc <8 x i16> %6 to <8 x i8>
2041  %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2042  %9 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %8, <8 x i8> %7)
2043  %10 = sub <8 x i16> %1, %9
2044  store <8 x i16> %10, ptr %2, align 16
2045  ret void
2046}
2047
; Unsigned counterpart of smlsl2_v8i16_uzp1: umull of the high half of %0 by a
; truncated <8 x i16> load, subtracted from %1 and stored to %2. The NEON and
; SVE runs expect uzp1 + umlsl2; the GlobalISel run expects high-half extract,
; xtn and umlsl.
2048define void @umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
2049; CHECK-NEON-LABEL: umlsl2_v8i16_uzp1:
2050; CHECK-NEON:       // %bb.0:
2051; CHECK-NEON-NEXT:    ldr q2, [x1, #16]
2052; CHECK-NEON-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
2053; CHECK-NEON-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
2054; CHECK-NEON-NEXT:    str q1, [x0]
2055; CHECK-NEON-NEXT:    ret
2056;
2057; CHECK-SVE-LABEL: umlsl2_v8i16_uzp1:
2058; CHECK-SVE:       // %bb.0:
2059; CHECK-SVE-NEXT:    ldr q2, [x1, #16]
2060; CHECK-SVE-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
2061; CHECK-SVE-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
2062; CHECK-SVE-NEXT:    str q1, [x0]
2063; CHECK-SVE-NEXT:    ret
2064;
2065; CHECK-GI-LABEL: umlsl2_v8i16_uzp1:
2066; CHECK-GI:       // %bb.0:
2067; CHECK-GI-NEXT:    ldr q2, [x1, #16]
2068; CHECK-GI-NEXT:    mov d0, v0.d[1]
2069; CHECK-GI-NEXT:    xtn v2.8b, v2.8h
2070; CHECK-GI-NEXT:    umlsl v1.8h, v0.8b, v2.8b
2071; CHECK-GI-NEXT:    str q1, [x0]
2072; CHECK-GI-NEXT:    ret
2073  %5 = getelementptr inbounds i32, ptr %3, i64 4
2074  %6 = load <8 x i16>, ptr %5, align 4
2075  %7 = trunc <8 x i16> %6 to <8 x i8>
2076  %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2077  %9 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %8, <8 x i8> %7)
2078  %10 = sub <8 x i16> %1, %9
2079  store <8 x i16> %10, ptr %2, align 16
2080  ret void
2081}
2082
; 16-to-32-bit version of smlsl2_v8i16_uzp1: smull of the high half of a
; <8 x i16> by a truncated <4 x i32> load, subtracted from %1 and stored to %2.
; The NEON and SVE runs expect uzp1 + smlsl2; the GlobalISel run expects
; high-half extract, xtn and smlsl.
2083define void @smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
2084; CHECK-NEON-LABEL: smlsl2_v4i32_uzp1:
2085; CHECK-NEON:       // %bb.0:
2086; CHECK-NEON-NEXT:    ldr q2, [x1, #16]
2087; CHECK-NEON-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
2088; CHECK-NEON-NEXT:    smlsl2 v1.4s, v0.8h, v2.8h
2089; CHECK-NEON-NEXT:    str q1, [x0]
2090; CHECK-NEON-NEXT:    ret
2091;
2092; CHECK-SVE-LABEL: smlsl2_v4i32_uzp1:
2093; CHECK-SVE:       // %bb.0:
2094; CHECK-SVE-NEXT:    ldr q2, [x1, #16]
2095; CHECK-SVE-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
2096; CHECK-SVE-NEXT:    smlsl2 v1.4s, v0.8h, v2.8h
2097; CHECK-SVE-NEXT:    str q1, [x0]
2098; CHECK-SVE-NEXT:    ret
2099;
2100; CHECK-GI-LABEL: smlsl2_v4i32_uzp1:
2101; CHECK-GI:       // %bb.0:
2102; CHECK-GI-NEXT:    ldr q2, [x1, #16]
2103; CHECK-GI-NEXT:    mov d0, v0.d[1]
2104; CHECK-GI-NEXT:    xtn v2.4h, v2.4s
2105; CHECK-GI-NEXT:    smlsl v1.4s, v0.4h, v2.4h
2106; CHECK-GI-NEXT:    str q1, [x0]
2107; CHECK-GI-NEXT:    ret
2108  %5 = getelementptr inbounds i32, ptr %3, i64 4
2109  %6 = load <4 x i32>, ptr %5, align 4
2110  %7 = trunc <4 x i32> %6 to <4 x i16>
2111  %8 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2112  %9 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %8, <4 x i16> %7)
2113  %10 = sub <4 x i32> %1, %9
2114  store <4 x i32> %10, ptr %2, align 16
2115  ret void
2116}
2117
; Unsigned counterpart of smlsl2_v4i32_uzp1: umull of the high half of a
; <8 x i16> by a truncated <4 x i32> load, subtracted from %1 and stored to %2.
; The NEON and SVE runs expect uzp1 + umlsl2; the GlobalISel run expects
; high-half extract, xtn and umlsl.
2118define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
2119; CHECK-NEON-LABEL: umlsl2_v4i32_uzp1:
2120; CHECK-NEON:       // %bb.0:
2121; CHECK-NEON-NEXT:    ldr q2, [x1, #16]
2122; CHECK-NEON-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
2123; CHECK-NEON-NEXT:    umlsl2 v1.4s, v0.8h, v2.8h
2124; CHECK-NEON-NEXT:    str q1, [x0]
2125; CHECK-NEON-NEXT:    ret
2126;
2127; CHECK-SVE-LABEL: umlsl2_v4i32_uzp1:
2128; CHECK-SVE:       // %bb.0:
2129; CHECK-SVE-NEXT:    ldr q2, [x1, #16]
2130; CHECK-SVE-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
2131; CHECK-SVE-NEXT:    umlsl2 v1.4s, v0.8h, v2.8h
2132; CHECK-SVE-NEXT:    str q1, [x0]
2133; CHECK-SVE-NEXT:    ret
2134;
2135; CHECK-GI-LABEL: umlsl2_v4i32_uzp1:
2136; CHECK-GI:       // %bb.0:
2137; CHECK-GI-NEXT:    ldr q2, [x1, #16]
2138; CHECK-GI-NEXT:    mov d0, v0.d[1]
2139; CHECK-GI-NEXT:    xtn v2.4h, v2.4s
2140; CHECK-GI-NEXT:    umlsl v1.4s, v0.4h, v2.4h
2141; CHECK-GI-NEXT:    str q1, [x0]
2142; CHECK-GI-NEXT:    ret
2143  %5 = getelementptr inbounds i32, ptr %3, i64 4
2144  %6 = load <4 x i32>, ptr %5, align 4
2145  %7 = trunc <4 x i32> %6 to <4 x i16>
2146  %8 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2147  %9 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %8, <4 x i16> %7)
2148  %10 = sub <4 x i32> %1, %9
2149  store <4 x i32> %10, ptr %2, align 16
2150  ret void
2151}
2152
; Two polynomial multiplies: the low and high halves of %0 are each multiplied
; by a truncated <8 x i16> load (from %3 and from 16 bytes past %3), the
; products are added, and the sum is subtracted from %1 and stored to %2. All
; runs share one expected sequence: a paired load, one uzp1 producing both
; truncated halves, pmull + pmull2, then add and sub. The i32 %4 argument is
; not used by the body. The RUN header of this file expects a GlobalISel
; fallback warning for this function.
2153define void @pmlsl_pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
2154; CHECK-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
2155; CHECK:       // %bb.0: // %entry
2156; CHECK-NEXT:    ldp q2, q3, [x1]
2157; CHECK-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
2158; CHECK-NEXT:    pmull v3.8h, v0.8b, v2.8b
2159; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
2160; CHECK-NEXT:    add v0.8h, v3.8h, v0.8h
2161; CHECK-NEXT:    sub v0.8h, v1.8h, v0.8h
2162; CHECK-NEXT:    str q0, [x0]
2163; CHECK-NEXT:    ret
2164entry:
2165  %5 = load <8 x i16>, ptr %3, align 4
2166  %6 = trunc <8 x i16> %5 to <8 x i8>
2167  %7 = getelementptr inbounds i32, ptr %3, i64 4
2168  %8 = load <8 x i16>, ptr %7, align 4
2169  %9 = trunc <8 x i16> %8 to <8 x i8>
2170  %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2171  %11 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %10, <8 x i8> %6)
2172  %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2173  %13 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %12, <8 x i8> %9)
2174  %14 = add <8 x i16> %11, %13
2175  %15 = sub <8 x i16> %1, %14
2176  store <8 x i16> %15, ptr %2, align 16
2177  ret void
2178}
2179
; Signed version of the low+high pattern: smull on the low and high halves of
; %0 against two truncated <8 x i16> loads, the products added and the sum
; subtracted from %1, then stored to %2. The NEON and SVE runs expect one uzp1
; followed by an smlsl/smlsl2 pair accumulating directly into %1; the
; GlobalISel run expects two xtn, smull + smlal, and a separate sub.
; The i32 %4 argument is not used by the body.
2180define void @smlsl_smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
2181; CHECK-NEON-LABEL: smlsl_smlsl2_v8i16_uzp1:
2182; CHECK-NEON:       // %bb.0: // %entry
2183; CHECK-NEON-NEXT:    ldp q2, q3, [x1]
2184; CHECK-NEON-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
2185; CHECK-NEON-NEXT:    smlsl v1.8h, v0.8b, v2.8b
2186; CHECK-NEON-NEXT:    smlsl2 v1.8h, v0.16b, v2.16b
2187; CHECK-NEON-NEXT:    str q1, [x0]
2188; CHECK-NEON-NEXT:    ret
2189;
2190; CHECK-SVE-LABEL: smlsl_smlsl2_v8i16_uzp1:
2191; CHECK-SVE:       // %bb.0: // %entry
2192; CHECK-SVE-NEXT:    ldp q2, q3, [x1]
2193; CHECK-SVE-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
2194; CHECK-SVE-NEXT:    smlsl v1.8h, v0.8b, v2.8b
2195; CHECK-SVE-NEXT:    smlsl2 v1.8h, v0.16b, v2.16b
2196; CHECK-SVE-NEXT:    str q1, [x0]
2197; CHECK-SVE-NEXT:    ret
2198;
2199; CHECK-GI-LABEL: smlsl_smlsl2_v8i16_uzp1:
2200; CHECK-GI:       // %bb.0: // %entry
2201; CHECK-GI-NEXT:    ldp q4, q2, [x1]
2202; CHECK-GI-NEXT:    mov d3, v0.d[1]
2203; CHECK-GI-NEXT:    xtn v2.8b, v2.8h
2204; CHECK-GI-NEXT:    xtn v4.8b, v4.8h
2205; CHECK-GI-NEXT:    smull v2.8h, v3.8b, v2.8b
2206; CHECK-GI-NEXT:    smlal v2.8h, v0.8b, v4.8b
2207; CHECK-GI-NEXT:    sub v0.8h, v1.8h, v2.8h
2208; CHECK-GI-NEXT:    str q0, [x0]
2209; CHECK-GI-NEXT:    ret
2210entry:
2211  %5 = load <8 x i16>, ptr %3, align 4
2212  %6 = trunc <8 x i16> %5 to <8 x i8>
2213  %7 = getelementptr inbounds i32, ptr %3, i64 4
2214  %8 = load <8 x i16>, ptr %7, align 4
2215  %9 = trunc <8 x i16> %8 to <8 x i8>
2216  %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2217  %11 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %10, <8 x i8> %6)
2218  %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2219  %13 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %12, <8 x i8> %9)
2220  %14 = add <8 x i16> %11, %13
2221  %15 = sub <8 x i16> %1, %14
2222  store <8 x i16> %15, ptr %2, align 16
2223  ret void
2224}
2225
; Unsigned counterpart of smlsl_smlsl2_v8i16_uzp1 using the umull intrinsic.
; The NEON and SVE runs expect one uzp1 followed by a umlsl/umlsl2 pair; the
; GlobalISel run expects two xtn, umull + umlal, and a separate sub.
; The i32 %4 argument is not used by the body.
2226define void @umlsl_umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
2227; CHECK-NEON-LABEL: umlsl_umlsl2_v8i16_uzp1:
2228; CHECK-NEON:       // %bb.0: // %entry
2229; CHECK-NEON-NEXT:    ldp q2, q3, [x1]
2230; CHECK-NEON-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
2231; CHECK-NEON-NEXT:    umlsl v1.8h, v0.8b, v2.8b
2232; CHECK-NEON-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
2233; CHECK-NEON-NEXT:    str q1, [x0]
2234; CHECK-NEON-NEXT:    ret
2235;
2236; CHECK-SVE-LABEL: umlsl_umlsl2_v8i16_uzp1:
2237; CHECK-SVE:       // %bb.0: // %entry
2238; CHECK-SVE-NEXT:    ldp q2, q3, [x1]
2239; CHECK-SVE-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
2240; CHECK-SVE-NEXT:    umlsl v1.8h, v0.8b, v2.8b
2241; CHECK-SVE-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
2242; CHECK-SVE-NEXT:    str q1, [x0]
2243; CHECK-SVE-NEXT:    ret
2244;
2245; CHECK-GI-LABEL: umlsl_umlsl2_v8i16_uzp1:
2246; CHECK-GI:       // %bb.0: // %entry
2247; CHECK-GI-NEXT:    ldp q4, q2, [x1]
2248; CHECK-GI-NEXT:    mov d3, v0.d[1]
2249; CHECK-GI-NEXT:    xtn v2.8b, v2.8h
2250; CHECK-GI-NEXT:    xtn v4.8b, v4.8h
2251; CHECK-GI-NEXT:    umull v2.8h, v3.8b, v2.8b
2252; CHECK-GI-NEXT:    umlal v2.8h, v0.8b, v4.8b
2253; CHECK-GI-NEXT:    sub v0.8h, v1.8h, v2.8h
2254; CHECK-GI-NEXT:    str q0, [x0]
2255; CHECK-GI-NEXT:    ret
2256entry:
2257  %5 = load <8 x i16>, ptr %3, align 4
2258  %6 = trunc <8 x i16> %5 to <8 x i8>
2259  %7 = getelementptr inbounds i32, ptr %3, i64 4
2260  %8 = load <8 x i16>, ptr %7, align 4
2261  %9 = trunc <8 x i16> %8 to <8 x i8>
2262  %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2263  %11 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %10, <8 x i8> %6)
2264  %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2265  %13 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %12, <8 x i8> %9)
2266  %14 = add <8 x i16> %11, %13
2267  %15 = sub <8 x i16> %1, %14
2268  store <8 x i16> %15, ptr %2, align 16
2269  ret void
2270}
2271
; 16-to-32-bit version of smlsl_smlsl2_v8i16_uzp1: smull on the low and high
; halves of a <8 x i16> against two truncated <4 x i32> loads. The NEON and
; SVE runs expect one uzp1 followed by an smlsl/smlsl2 pair; the GlobalISel
; run expects two xtn, smull + smlal, and a separate sub.
; The i32 %4 argument is not used by the body.
2272define void @smlsl_smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) {
2273; CHECK-NEON-LABEL: smlsl_smlsl2_v4i32_uzp1:
2274; CHECK-NEON:       // %bb.0: // %entry
2275; CHECK-NEON-NEXT:    ldp q2, q3, [x1]
2276; CHECK-NEON-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
2277; CHECK-NEON-NEXT:    smlsl v1.4s, v0.4h, v2.4h
2278; CHECK-NEON-NEXT:    smlsl2 v1.4s, v0.8h, v2.8h
2279; CHECK-NEON-NEXT:    str q1, [x0]
2280; CHECK-NEON-NEXT:    ret
2281;
2282; CHECK-SVE-LABEL: smlsl_smlsl2_v4i32_uzp1:
2283; CHECK-SVE:       // %bb.0: // %entry
2284; CHECK-SVE-NEXT:    ldp q2, q3, [x1]
2285; CHECK-SVE-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
2286; CHECK-SVE-NEXT:    smlsl v1.4s, v0.4h, v2.4h
2287; CHECK-SVE-NEXT:    smlsl2 v1.4s, v0.8h, v2.8h
2288; CHECK-SVE-NEXT:    str q1, [x0]
2289; CHECK-SVE-NEXT:    ret
2290;
2291; CHECK-GI-LABEL: smlsl_smlsl2_v4i32_uzp1:
2292; CHECK-GI:       // %bb.0: // %entry
2293; CHECK-GI-NEXT:    ldp q4, q2, [x1]
2294; CHECK-GI-NEXT:    mov d3, v0.d[1]
2295; CHECK-GI-NEXT:    xtn v2.4h, v2.4s
2296; CHECK-GI-NEXT:    xtn v4.4h, v4.4s
2297; CHECK-GI-NEXT:    smull v2.4s, v3.4h, v2.4h
2298; CHECK-GI-NEXT:    smlal v2.4s, v0.4h, v4.4h
2299; CHECK-GI-NEXT:    sub v0.4s, v1.4s, v2.4s
2300; CHECK-GI-NEXT:    str q0, [x0]
2301; CHECK-GI-NEXT:    ret
2302entry:
2303  %5 = load <4 x i32>, ptr %3, align 4
2304  %6 = trunc <4 x i32> %5 to <4 x i16>
2305  %7 = getelementptr inbounds i32, ptr %3, i64 4
2306  %8 = load <4 x i32>, ptr %7, align 4
2307  %9 = trunc <4 x i32> %8 to <4 x i16>
2308  %10 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2309  %11 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %10, <4 x i16> %6)
2310  %12 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2311  %13 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %12, <4 x i16> %9)
2312  %14 = add <4 x i32> %11, %13
2313  %15 = sub <4 x i32> %1, %14
2314  store <4 x i32> %15, ptr %2, align 16
2315  ret void
2316}
2317
; Unsigned counterpart of smlsl_smlsl2_v4i32_uzp1 using the umull intrinsic.
; The NEON and SVE runs expect one uzp1 followed by a umlsl/umlsl2 pair; the
; GlobalISel run expects two xtn, umull + umlal, and a separate sub.
; The i32 %4 argument is not used by the body.
2318define void @umlsl_umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) {
2319; CHECK-NEON-LABEL: umlsl_umlsl2_v4i32_uzp1:
2320; CHECK-NEON:       // %bb.0: // %entry
2321; CHECK-NEON-NEXT:    ldp q2, q3, [x1]
2322; CHECK-NEON-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
2323; CHECK-NEON-NEXT:    umlsl v1.4s, v0.4h, v2.4h
2324; CHECK-NEON-NEXT:    umlsl2 v1.4s, v0.8h, v2.8h
2325; CHECK-NEON-NEXT:    str q1, [x0]
2326; CHECK-NEON-NEXT:    ret
2327;
2328; CHECK-SVE-LABEL: umlsl_umlsl2_v4i32_uzp1:
2329; CHECK-SVE:       // %bb.0: // %entry
2330; CHECK-SVE-NEXT:    ldp q2, q3, [x1]
2331; CHECK-SVE-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
2332; CHECK-SVE-NEXT:    umlsl v1.4s, v0.4h, v2.4h
2333; CHECK-SVE-NEXT:    umlsl2 v1.4s, v0.8h, v2.8h
2334; CHECK-SVE-NEXT:    str q1, [x0]
2335; CHECK-SVE-NEXT:    ret
2336;
2337; CHECK-GI-LABEL: umlsl_umlsl2_v4i32_uzp1:
2338; CHECK-GI:       // %bb.0: // %entry
2339; CHECK-GI-NEXT:    ldp q4, q2, [x1]
2340; CHECK-GI-NEXT:    mov d3, v0.d[1]
2341; CHECK-GI-NEXT:    xtn v2.4h, v2.4s
2342; CHECK-GI-NEXT:    xtn v4.4h, v4.4s
2343; CHECK-GI-NEXT:    umull v2.4s, v3.4h, v2.4h
2344; CHECK-GI-NEXT:    umlal v2.4s, v0.4h, v4.4h
2345; CHECK-GI-NEXT:    sub v0.4s, v1.4s, v2.4s
2346; CHECK-GI-NEXT:    str q0, [x0]
2347; CHECK-GI-NEXT:    ret
2348entry:
2349  %5 = load <4 x i32>, ptr %3, align 4
2350  %6 = trunc <4 x i32> %5 to <4 x i16>
2351  %7 = getelementptr inbounds i32, ptr %3, i64 4
2352  %8 = load <4 x i32>, ptr %7, align 4
2353  %9 = trunc <4 x i32> %8 to <4 x i16>
2354  %10 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2355  %11 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %10, <4 x i16> %6)
2356  %12 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2357  %13 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %12, <4 x i16> %9)
2358  %14 = add <4 x i32> %11, %13
2359  %15 = sub <4 x i32> %1, %14
2360  store <4 x i32> %15, ptr %2, align 16
2361  ret void
2362}
2363
; %1 is bitcast to <4 x i32>; its high half is smull'ed with %0 truncated to
; <2 x i32>, the product is truncated back to <2 x i32> and added to the low
; half of the bitcast. The NEON and SVE runs expect the truncate of %0 to be a
; uzp1 feeding smull2; the GlobalISel run expects xtn, a high-half extract
; (mov d2, v1.d[1]) and a plain smull.
2364define <2 x i32> @do_stuff(<2 x i64> %0, <2 x i64> %1) {
2365; CHECK-NEON-LABEL: do_stuff:
2366; CHECK-NEON:       // %bb.0:
2367; CHECK-NEON-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
2368; CHECK-NEON-NEXT:    smull2 v0.2d, v1.4s, v0.4s
2369; CHECK-NEON-NEXT:    xtn v0.2s, v0.2d
2370; CHECK-NEON-NEXT:    add v0.2s, v0.2s, v1.2s
2371; CHECK-NEON-NEXT:    ret
2372;
2373; CHECK-SVE-LABEL: do_stuff:
2374; CHECK-SVE:       // %bb.0:
2375; CHECK-SVE-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
2376; CHECK-SVE-NEXT:    smull2 v0.2d, v1.4s, v0.4s
2377; CHECK-SVE-NEXT:    xtn v0.2s, v0.2d
2378; CHECK-SVE-NEXT:    add v0.2s, v0.2s, v1.2s
2379; CHECK-SVE-NEXT:    ret
2380;
2381; CHECK-GI-LABEL: do_stuff:
2382; CHECK-GI:       // %bb.0:
2383; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
2384; CHECK-GI-NEXT:    mov d2, v1.d[1]
2385; CHECK-GI-NEXT:    smull v0.2d, v2.2s, v0.2s
2386; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
2387; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
2388; CHECK-GI-NEXT:    ret
2389  %bc.1 = bitcast <2 x i64> %1 to <4 x i32>
2390  %trunc.0 = trunc <2 x i64> %0 to <2 x i32>
2391  %shuff.hi = shufflevector <4 x i32> %bc.1, <4 x i32> zeroinitializer, <2 x i32> <i32 2, i32 3>
2392  %shuff.lo = shufflevector <4 x i32> %bc.1, <4 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
2393  %smull = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuff.hi, <2 x i32> %trunc.0)
2394  %trunc.smull = trunc <2 x i64> %smull to <2 x i32>
2395  %final = add <2 x i32> %trunc.smull, %shuff.lo
2396  ret <2 x i32> %final
2397}
2398
; Both <2 x i64> operands are logically shifted right by 32 and then
; multiplied. The NEON and SVE runs expect each shift to become a narrowing
; shrn feeding a single umull; the GlobalISel run expects full-width ushr
; followed by per-lane scalar multiplies in general-purpose registers.
2399define <2 x i64> @lsr(<2 x i64> %a, <2 x i64> %b) {
2400; CHECK-NEON-LABEL: lsr:
2401; CHECK-NEON:       // %bb.0:
2402; CHECK-NEON-NEXT:    shrn v0.2s, v0.2d, #32
2403; CHECK-NEON-NEXT:    shrn v1.2s, v1.2d, #32
2404; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
2405; CHECK-NEON-NEXT:    ret
2406;
2407; CHECK-SVE-LABEL: lsr:
2408; CHECK-SVE:       // %bb.0:
2409; CHECK-SVE-NEXT:    shrn v0.2s, v0.2d, #32
2410; CHECK-SVE-NEXT:    shrn v1.2s, v1.2d, #32
2411; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
2412; CHECK-SVE-NEXT:    ret
2413;
2414; CHECK-GI-LABEL: lsr:
2415; CHECK-GI:       // %bb.0:
2416; CHECK-GI-NEXT:    ushr v0.2d, v0.2d, #32
2417; CHECK-GI-NEXT:    ushr v1.2d, v1.2d, #32
2418; CHECK-GI-NEXT:    fmov x8, d0
2419; CHECK-GI-NEXT:    fmov x9, d1
2420; CHECK-GI-NEXT:    mov x10, v0.d[1]
2421; CHECK-GI-NEXT:    mov x11, v1.d[1]
2422; CHECK-GI-NEXT:    mul x8, x8, x9
2423; CHECK-GI-NEXT:    mul x9, x10, x11
2424; CHECK-GI-NEXT:    mov v0.d[0], x8
2425; CHECK-GI-NEXT:    mov v0.d[1], x9
2426; CHECK-GI-NEXT:    ret
2427    %x = lshr <2 x i64> %a, <i64 32, i64 32>
2428    %y = lshr <2 x i64> %b, <i64 32, i64 32>
2429    %z = mul nsw <2 x i64> %x, %y
2430    ret <2 x i64> %z
2431}
2432
; lsr_const: multiplies a <2 x i64> vector, logically shifted right by 32, by
; the splat constant 31. The %b argument is not referenced by the body.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the constant is materialised with
;     "movi v1.2s, #31", the shift becomes "shrn #32" and the product is a
;     single umull.
;   * GlobalISel configuration: the constant vector is loaded from a constant
;     pool entry (adrp + ldr), the shift is a ushr, and the multiply is done
;     lane by lane with scalar mul instructions.
2433define <2 x i64> @lsr_const(<2 x i64> %a, <2 x i64> %b) {
2434; CHECK-NEON-LABEL: lsr_const:
2435; CHECK-NEON:       // %bb.0:
2436; CHECK-NEON-NEXT:    movi v1.2s, #31
2437; CHECK-NEON-NEXT:    shrn v0.2s, v0.2d, #32
2438; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
2439; CHECK-NEON-NEXT:    ret
2440;
2441; CHECK-SVE-LABEL: lsr_const:
2442; CHECK-SVE:       // %bb.0:
2443; CHECK-SVE-NEXT:    movi v1.2s, #31
2444; CHECK-SVE-NEXT:    shrn v0.2s, v0.2d, #32
2445; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
2446; CHECK-SVE-NEXT:    ret
2447;
2448; CHECK-GI-LABEL: lsr_const:
2449; CHECK-GI:       // %bb.0:
2450; CHECK-GI-NEXT:    adrp x8, .LCPI79_0
2451; CHECK-GI-NEXT:    ushr v0.2d, v0.2d, #32
2452; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI79_0]
2453; CHECK-GI-NEXT:    fmov x8, d0
2454; CHECK-GI-NEXT:    fmov x9, d1
2455; CHECK-GI-NEXT:    mov x10, v0.d[1]
2456; CHECK-GI-NEXT:    mov x11, v1.d[1]
2457; CHECK-GI-NEXT:    mul x8, x8, x9
2458; CHECK-GI-NEXT:    mul x9, x10, x11
2459; CHECK-GI-NEXT:    mov v0.d[0], x8
2460; CHECK-GI-NEXT:    mov v0.d[1], x9
2461; CHECK-GI-NEXT:    ret
2462    %x = lshr <2 x i64> %a, <i64 32, i64 32>
2463    %z = mul nsw <2 x i64> %x, <i64 31, i64 31>
2464    ret <2 x i64> %z
2465}
2466
; asr: multiplies two <2 x i64> vectors after every lane of both operands has
; been arithmetically shifted right by 32 (mul nsw of two ashr results).
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: each shift is emitted as "shrn #32" into a
;     2s vector and the product is a single smull.
;   * GlobalISel configuration: sshr on the full 2d vectors, then each lane is
;     moved to a general-purpose register, multiplied with a scalar mul and
;     inserted back into v0.
2467define <2 x i64> @asr(<2 x i64> %a, <2 x i64> %b) {
2468; CHECK-NEON-LABEL: asr:
2469; CHECK-NEON:       // %bb.0:
2470; CHECK-NEON-NEXT:    shrn v0.2s, v0.2d, #32
2471; CHECK-NEON-NEXT:    shrn v1.2s, v1.2d, #32
2472; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
2473; CHECK-NEON-NEXT:    ret
2474;
2475; CHECK-SVE-LABEL: asr:
2476; CHECK-SVE:       // %bb.0:
2477; CHECK-SVE-NEXT:    shrn v0.2s, v0.2d, #32
2478; CHECK-SVE-NEXT:    shrn v1.2s, v1.2d, #32
2479; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
2480; CHECK-SVE-NEXT:    ret
2481;
2482; CHECK-GI-LABEL: asr:
2483; CHECK-GI:       // %bb.0:
2484; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #32
2485; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #32
2486; CHECK-GI-NEXT:    fmov x8, d0
2487; CHECK-GI-NEXT:    fmov x9, d1
2488; CHECK-GI-NEXT:    mov x10, v0.d[1]
2489; CHECK-GI-NEXT:    mov x11, v1.d[1]
2490; CHECK-GI-NEXT:    mul x8, x8, x9
2491; CHECK-GI-NEXT:    mul x9, x10, x11
2492; CHECK-GI-NEXT:    mov v0.d[0], x8
2493; CHECK-GI-NEXT:    mov v0.d[1], x9
2494; CHECK-GI-NEXT:    ret
2495    %x = ashr <2 x i64> %a, <i64 32, i64 32>
2496    %y = ashr <2 x i64> %b, <i64 32, i64 32>
2497    %z = mul nsw <2 x i64> %x, %y
2498    ret <2 x i64> %z
2499}
2500
; asr_const: multiplies a <2 x i64> vector, arithmetically shifted right by 32,
; by the splat constant 31. The %b argument is not referenced by the body.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the constant is materialised with
;     "movi v1.2s, #31", the shift becomes "shrn #32" and the product is a
;     single smull.
;   * GlobalISel configuration: the constant vector is loaded from a constant
;     pool entry (adrp + ldr), the shift is an sshr, and the multiply is done
;     lane by lane with scalar mul instructions.
2501define <2 x i64> @asr_const(<2 x i64> %a, <2 x i64> %b) {
2502; CHECK-NEON-LABEL: asr_const:
2503; CHECK-NEON:       // %bb.0:
2504; CHECK-NEON-NEXT:    movi v1.2s, #31
2505; CHECK-NEON-NEXT:    shrn v0.2s, v0.2d, #32
2506; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
2507; CHECK-NEON-NEXT:    ret
2508;
2509; CHECK-SVE-LABEL: asr_const:
2510; CHECK-SVE:       // %bb.0:
2511; CHECK-SVE-NEXT:    movi v1.2s, #31
2512; CHECK-SVE-NEXT:    shrn v0.2s, v0.2d, #32
2513; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
2514; CHECK-SVE-NEXT:    ret
2515;
2516; CHECK-GI-LABEL: asr_const:
2517; CHECK-GI:       // %bb.0:
2518; CHECK-GI-NEXT:    adrp x8, .LCPI81_0
2519; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #32
2520; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI81_0]
2521; CHECK-GI-NEXT:    fmov x8, d0
2522; CHECK-GI-NEXT:    fmov x9, d1
2523; CHECK-GI-NEXT:    mov x10, v0.d[1]
2524; CHECK-GI-NEXT:    mov x11, v1.d[1]
2525; CHECK-GI-NEXT:    mul x8, x8, x9
2526; CHECK-GI-NEXT:    mul x9, x10, x11
2527; CHECK-GI-NEXT:    mov v0.d[0], x8
2528; CHECK-GI-NEXT:    mov v0.d[1], x9
2529; CHECK-GI-NEXT:    ret
2530    %x = ashr <2 x i64> %a, <i64 32, i64 32>
2531    %z = mul nsw <2 x i64> %x, <i64 31, i64 31>
2532    ret <2 x i64> %z
2533}
2534
; smulladdl_v8i8_v8i16: computes sext(A) * sext(B) + sext(C), where all three
; inputs are <8 x i8> and the arithmetic is done in <8 x i16>.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: smull for the product, then saddw to add
;     the narrow %C operand.
;   * GlobalISel configuration: %C is widened first with "sshll #0", the
;     product is accumulated into it with smlal, and the result is copied to v0.
2535define <8 x i16> @smulladdl_v8i8_v8i16(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) nounwind {
2536; CHECK-NEON-LABEL: smulladdl_v8i8_v8i16:
2537; CHECK-NEON:       // %bb.0:
2538; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v1.8b
2539; CHECK-NEON-NEXT:    saddw v0.8h, v0.8h, v2.8b
2540; CHECK-NEON-NEXT:    ret
2541;
2542; CHECK-SVE-LABEL: smulladdl_v8i8_v8i16:
2543; CHECK-SVE:       // %bb.0:
2544; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v1.8b
2545; CHECK-SVE-NEXT:    saddw v0.8h, v0.8h, v2.8b
2546; CHECK-SVE-NEXT:    ret
2547;
2548; CHECK-GI-LABEL: smulladdl_v8i8_v8i16:
2549; CHECK-GI:       // %bb.0:
2550; CHECK-GI-NEXT:    sshll v2.8h, v2.8b, #0
2551; CHECK-GI-NEXT:    smlal v2.8h, v0.8b, v1.8b
2552; CHECK-GI-NEXT:    mov v0.16b, v2.16b
2553; CHECK-GI-NEXT:    ret
2554  %tmp1 = sext <8 x i8> %A to <8 x i16>
2555  %tmp2 = sext <8 x i8> %B to <8 x i16>
2556  %tmp3 = sext <8 x i8> %C to <8 x i16>
2557  %tmp4 = mul <8 x i16> %tmp1, %tmp2
2558  %tmp5 = add <8 x i16> %tmp4, %tmp3
2559  ret <8 x i16> %tmp5
2560}
2561
; umulladdl_v8i8_v8i16: computes zext(A) * zext(B) + zext(C), where all three
; inputs are <8 x i8> and the arithmetic is done in <8 x i16>.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: umull for the product, then uaddw to add
;     the narrow %C operand.
;   * GlobalISel configuration: %C is widened first with "ushll #0", the
;     product is accumulated into it with umlal, and the result is copied to v0.
2562define <8 x i16> @umulladdl_v8i8_v8i16(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) nounwind {
2563; CHECK-NEON-LABEL: umulladdl_v8i8_v8i16:
2564; CHECK-NEON:       // %bb.0:
2565; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
2566; CHECK-NEON-NEXT:    uaddw v0.8h, v0.8h, v2.8b
2567; CHECK-NEON-NEXT:    ret
2568;
2569; CHECK-SVE-LABEL: umulladdl_v8i8_v8i16:
2570; CHECK-SVE:       // %bb.0:
2571; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v1.8b
2572; CHECK-SVE-NEXT:    uaddw v0.8h, v0.8h, v2.8b
2573; CHECK-SVE-NEXT:    ret
2574;
2575; CHECK-GI-LABEL: umulladdl_v8i8_v8i16:
2576; CHECK-GI:       // %bb.0:
2577; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
2578; CHECK-GI-NEXT:    umlal v2.8h, v0.8b, v1.8b
2579; CHECK-GI-NEXT:    mov v0.16b, v2.16b
2580; CHECK-GI-NEXT:    ret
2581  %tmp1 = zext <8 x i8> %A to <8 x i16>
2582  %tmp2 = zext <8 x i8> %B to <8 x i16>
2583  %tmp3 = zext <8 x i8> %C to <8 x i16>
2584  %tmp4 = mul <8 x i16> %tmp1, %tmp2
2585  %tmp5 = add <8 x i16> %tmp4, %tmp3
2586  ret <8 x i16> %tmp5
2587}
2588
; smlall_v8i8_v8i16: computes sext(A) * sext(B) + C, where %A and %B are
; <8 x i8> and the accumulator %C is already <8 x i16>.
; All three configurations share one set of assertions: a single smlal that
; accumulates into v2, followed by a copy of v2 into the return register v0.
2589define <8 x i16> @smlall_v8i8_v8i16(<8 x i8> %A, <8 x i8> %B, <8 x i16> %C) nounwind {
2590; CHECK-LABEL: smlall_v8i8_v8i16:
2591; CHECK:       // %bb.0:
2592; CHECK-NEXT:    smlal v2.8h, v0.8b, v1.8b
2593; CHECK-NEXT:    mov v0.16b, v2.16b
2594; CHECK-NEXT:    ret
2595  %tmp1 = sext <8 x i8> %A to <8 x i16>
2596  %tmp2 = sext <8 x i8> %B to <8 x i16>
2597  %tmp4 = mul <8 x i16> %tmp1, %tmp2
2598  %tmp5 = add <8 x i16> %tmp4, %C
2599  ret <8 x i16> %tmp5
2600}
2601
; umlall_v8i8_v8i16: computes zext(A) * zext(B) + C, where %A and %B are
; <8 x i8> and the accumulator %C is already <8 x i16>.
; All three configurations share one set of assertions: a single umlal that
; accumulates into v2, followed by a copy of v2 into the return register v0.
2602define <8 x i16> @umlall_v8i8_v8i16(<8 x i8> %A, <8 x i8> %B, <8 x i16> %C) nounwind {
2603; CHECK-LABEL: umlall_v8i8_v8i16:
2604; CHECK:       // %bb.0:
2605; CHECK-NEXT:    umlal v2.8h, v0.8b, v1.8b
2606; CHECK-NEXT:    mov v0.16b, v2.16b
2607; CHECK-NEXT:    ret
2608  %tmp1 = zext <8 x i8> %A to <8 x i16>
2609  %tmp2 = zext <8 x i8> %B to <8 x i16>
2610  %tmp4 = mul <8 x i16> %tmp1, %tmp2
2611  %tmp5 = add <8 x i16> %tmp4, %C
2612  ret <8 x i16> %tmp5
2613}
2614
; smulladdl_const_v8i8_v8i16: computes sext(A) * splat(10) + sext(C), with %A
; and %C being <8 x i8> and the arithmetic done in <8 x i16>.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the constant is materialised as a narrow
;     8b vector (movi #10) so the product can use smull; %C is added with saddw.
;   * GlobalISel configuration: the constant is materialised as a wide 8h
;     vector, %A is widened with "sshll #0", the product is a plain 8h mul, and
;     %C is added with saddw.
2615define <8 x i16> @smulladdl_const_v8i8_v8i16(<8 x i8> %A, <8 x i8> %C) nounwind {
2616; CHECK-NEON-LABEL: smulladdl_const_v8i8_v8i16:
2617; CHECK-NEON:       // %bb.0:
2618; CHECK-NEON-NEXT:    movi v2.8b, #10
2619; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v2.8b
2620; CHECK-NEON-NEXT:    saddw v0.8h, v0.8h, v1.8b
2621; CHECK-NEON-NEXT:    ret
2622;
2623; CHECK-SVE-LABEL: smulladdl_const_v8i8_v8i16:
2624; CHECK-SVE:       // %bb.0:
2625; CHECK-SVE-NEXT:    movi v2.8b, #10
2626; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v2.8b
2627; CHECK-SVE-NEXT:    saddw v0.8h, v0.8h, v1.8b
2628; CHECK-SVE-NEXT:    ret
2629;
2630; CHECK-GI-LABEL: smulladdl_const_v8i8_v8i16:
2631; CHECK-GI:       // %bb.0:
2632; CHECK-GI-NEXT:    movi v2.8h, #10
2633; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
2634; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v2.8h
2635; CHECK-GI-NEXT:    saddw v0.8h, v0.8h, v1.8b
2636; CHECK-GI-NEXT:    ret
2637  %tmp1 = sext <8 x i8> %A to <8 x i16>
2638  %tmp3 = sext <8 x i8> %C to <8 x i16>
2639  %tmp4 = mul <8 x i16> %tmp1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
2640  %tmp5 = add <8 x i16> %tmp4, %tmp3
2641  ret <8 x i16> %tmp5
2642}
2643
; umulladdl_const_v8i8_v8i16: computes zext(A) * splat(10) + zext(C), with %A
; and %C being <8 x i8> and the arithmetic done in <8 x i16>.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the constant is materialised as a narrow
;     8b vector (movi #10) so the product can use umull; %C is added with uaddw.
;   * GlobalISel configuration: the constant is materialised as a wide 8h
;     vector, %A is widened with "ushll #0", the product is a plain 8h mul, and
;     %C is added with uaddw.
2644define <8 x i16> @umulladdl_const_v8i8_v8i16(<8 x i8> %A, <8 x i8> %C) nounwind {
2645; CHECK-NEON-LABEL: umulladdl_const_v8i8_v8i16:
2646; CHECK-NEON:       // %bb.0:
2647; CHECK-NEON-NEXT:    movi v2.8b, #10
2648; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v2.8b
2649; CHECK-NEON-NEXT:    uaddw v0.8h, v0.8h, v1.8b
2650; CHECK-NEON-NEXT:    ret
2651;
2652; CHECK-SVE-LABEL: umulladdl_const_v8i8_v8i16:
2653; CHECK-SVE:       // %bb.0:
2654; CHECK-SVE-NEXT:    movi v2.8b, #10
2655; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v2.8b
2656; CHECK-SVE-NEXT:    uaddw v0.8h, v0.8h, v1.8b
2657; CHECK-SVE-NEXT:    ret
2658;
2659; CHECK-GI-LABEL: umulladdl_const_v8i8_v8i16:
2660; CHECK-GI:       // %bb.0:
2661; CHECK-GI-NEXT:    movi v2.8h, #10
2662; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
2663; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v2.8h
2664; CHECK-GI-NEXT:    uaddw v0.8h, v0.8h, v1.8b
2665; CHECK-GI-NEXT:    ret
2666  %tmp1 = zext <8 x i8> %A to <8 x i16>
2667  %tmp3 = zext <8 x i8> %C to <8 x i16>
2668  %tmp4 = mul <8 x i16> %tmp1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
2669  %tmp5 = add <8 x i16> %tmp4, %tmp3
2670  ret <8 x i16> %tmp5
2671}
2672
; sdistribute_v8i8: computes (sext(src1) + sext(src2)) * sext(mul), with all
; inputs <8 x i8> and the arithmetic done in <8 x i16>. The add carries the
; nuw and nsw flags.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the multiply is distributed over the add,
;     giving smull(src1, mul) followed by smlal(src2, mul).
;   * GlobalISel configuration: the expression is kept in its original shape,
;     i.e. "sshll #0" of %mul, saddl of the two sources, then a plain 8h mul.
2673define <8 x i16> @sdistribute_v8i8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8> %mul) {
2674; CHECK-NEON-LABEL: sdistribute_v8i8:
2675; CHECK-NEON:       // %bb.0: // %entry
2676; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v2.8b
2677; CHECK-NEON-NEXT:    smlal v0.8h, v1.8b, v2.8b
2678; CHECK-NEON-NEXT:    ret
2679;
2680; CHECK-SVE-LABEL: sdistribute_v8i8:
2681; CHECK-SVE:       // %bb.0: // %entry
2682; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v2.8b
2683; CHECK-SVE-NEXT:    smlal v0.8h, v1.8b, v2.8b
2684; CHECK-SVE-NEXT:    ret
2685;
2686; CHECK-GI-LABEL: sdistribute_v8i8:
2687; CHECK-GI:       // %bb.0: // %entry
2688; CHECK-GI-NEXT:    sshll v2.8h, v2.8b, #0
2689; CHECK-GI-NEXT:    saddl v0.8h, v0.8b, v1.8b
2690; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v2.8h
2691; CHECK-GI-NEXT:    ret
2692entry:
2693  %4 = sext <8 x i8> %src1 to <8 x i16>
2694  %5 = sext <8 x i8> %mul to <8 x i16>
2695  %7 = sext <8 x i8> %src2 to <8 x i16>
2696  %8 = add nuw nsw <8 x i16> %4, %7
2697  %9 = mul <8 x i16> %8, %5
2698  ret <8 x i16> %9
2699}
2700
; sdistribute_const1_v8i8: computes (sext(src1) + splat(10)) * sext(mul), with
; %src1 and %mul being <8 x i8> and the arithmetic done in <8 x i16>. The add
; carries the nuw and nsw flags.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the multiply is distributed over the add;
;     the constant is materialised as a narrow 8b vector (movi #10) and the
;     result is smull(src1, mul) followed by smlal(10, mul).
;   * GlobalISel configuration: the constant is materialised as a wide 8h
;     vector, %mul is widened with "sshll #0", %src1 is added to the constant
;     with saddw, and the product is a plain 8h mul.
2701define <8 x i16> @sdistribute_const1_v8i8(<8 x i8> %src1, <8 x i8> %mul) {
2702; CHECK-NEON-LABEL: sdistribute_const1_v8i8:
2703; CHECK-NEON:       // %bb.0: // %entry
2704; CHECK-NEON-NEXT:    movi v2.8b, #10
2705; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v1.8b
2706; CHECK-NEON-NEXT:    smlal v0.8h, v2.8b, v1.8b
2707; CHECK-NEON-NEXT:    ret
2708;
2709; CHECK-SVE-LABEL: sdistribute_const1_v8i8:
2710; CHECK-SVE:       // %bb.0: // %entry
2711; CHECK-SVE-NEXT:    movi v2.8b, #10
2712; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v1.8b
2713; CHECK-SVE-NEXT:    smlal v0.8h, v2.8b, v1.8b
2714; CHECK-SVE-NEXT:    ret
2715;
2716; CHECK-GI-LABEL: sdistribute_const1_v8i8:
2717; CHECK-GI:       // %bb.0: // %entry
2718; CHECK-GI-NEXT:    movi v2.8h, #10
2719; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
2720; CHECK-GI-NEXT:    saddw v0.8h, v2.8h, v0.8b
2721; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
2722; CHECK-GI-NEXT:    ret
2723entry:
2724  %4 = sext <8 x i8> %src1 to <8 x i16>
2725  %5 = sext <8 x i8> %mul to <8 x i16>
2726  %8 = add nuw nsw <8 x i16> %4, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
2727  %9 = mul <8 x i16> %8, %5
2728  ret <8 x i16> %9
2729}
2730
; sdistribute_const2_v8i8: computes (sext(src1) + sext(src2)) * splat(10), with
; both inputs <8 x i8> and the arithmetic done in <8 x i16>. The add carries
; the nuw and nsw flags.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the multiply is distributed over the add;
;     the constant is materialised as a narrow 8b vector (movi #10) and the
;     result is smull(src1, 10) followed by smlal(src2, 10).
;   * GlobalISel configuration: the constant is materialised as a wide 8h
;     vector, the sources are summed with saddl, and the product is a plain
;     8h mul.
2731define <8 x i16> @sdistribute_const2_v8i8(<8 x i8> %src1, <8 x i8> %src2) {
2732; CHECK-NEON-LABEL: sdistribute_const2_v8i8:
2733; CHECK-NEON:       // %bb.0: // %entry
2734; CHECK-NEON-NEXT:    movi v2.8b, #10
2735; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v2.8b
2736; CHECK-NEON-NEXT:    smlal v0.8h, v1.8b, v2.8b
2737; CHECK-NEON-NEXT:    ret
2738;
2739; CHECK-SVE-LABEL: sdistribute_const2_v8i8:
2740; CHECK-SVE:       // %bb.0: // %entry
2741; CHECK-SVE-NEXT:    movi v2.8b, #10
2742; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v2.8b
2743; CHECK-SVE-NEXT:    smlal v0.8h, v1.8b, v2.8b
2744; CHECK-SVE-NEXT:    ret
2745;
2746; CHECK-GI-LABEL: sdistribute_const2_v8i8:
2747; CHECK-GI:       // %bb.0: // %entry
2748; CHECK-GI-NEXT:    movi v2.8h, #10
2749; CHECK-GI-NEXT:    saddl v0.8h, v0.8b, v1.8b
2750; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v2.8h
2751; CHECK-GI-NEXT:    ret
2752entry:
2753  %4 = sext <8 x i8> %src1 to <8 x i16>
2754  %5 = sext <8 x i8> %src2 to <8 x i16>
2755  %8 = add nuw nsw <8 x i16> %4, %5
2756  %9 = mul <8 x i16> %8, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
2757  ret <8 x i16> %9
2758}
2759
; udistribute_v8i8: computes (zext(src1) + zext(src2)) * zext(mul), with all
; inputs <8 x i8> and the arithmetic done in <8 x i16>. The add carries the
; nuw and nsw flags.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the multiply is distributed over the add,
;     giving umull(src1, mul) followed by umlal(src2, mul).
;   * GlobalISel configuration: the expression is kept in its original shape,
;     i.e. "ushll #0" of %mul, uaddl of the two sources, then a plain 8h mul.
2760define <8 x i16> @udistribute_v8i8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8> %mul) {
2761; CHECK-NEON-LABEL: udistribute_v8i8:
2762; CHECK-NEON:       // %bb.0: // %entry
2763; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v2.8b
2764; CHECK-NEON-NEXT:    umlal v0.8h, v1.8b, v2.8b
2765; CHECK-NEON-NEXT:    ret
2766;
2767; CHECK-SVE-LABEL: udistribute_v8i8:
2768; CHECK-SVE:       // %bb.0: // %entry
2769; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v2.8b
2770; CHECK-SVE-NEXT:    umlal v0.8h, v1.8b, v2.8b
2771; CHECK-SVE-NEXT:    ret
2772;
2773; CHECK-GI-LABEL: udistribute_v8i8:
2774; CHECK-GI:       // %bb.0: // %entry
2775; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
2776; CHECK-GI-NEXT:    uaddl v0.8h, v0.8b, v1.8b
2777; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v2.8h
2778; CHECK-GI-NEXT:    ret
2779entry:
2780  %4 = zext <8 x i8> %src1 to <8 x i16>
2781  %5 = zext <8 x i8> %mul to <8 x i16>
2782  %7 = zext <8 x i8> %src2 to <8 x i16>
2783  %8 = add nuw nsw <8 x i16> %4, %7
2784  %9 = mul <8 x i16> %8, %5
2785  ret <8 x i16> %9
2786}
2787
; udistribute_const1_v8i8: computes (zext(src1) + splat(10)) * zext(mul), with
; %src1 and %mul being <8 x i8> and the arithmetic done in <8 x i16>. The add
; carries the nuw and nsw flags.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the multiply is distributed over the add;
;     the constant is materialised as a narrow 8b vector (movi #10) and the
;     result is umull(src1, mul) followed by umlal(10, mul).
;   * GlobalISel configuration: the constant is materialised as a wide 8h
;     vector, %mul is widened with "ushll #0", %src1 is added to the constant
;     with uaddw, and the product is a plain 8h mul.
2788define <8 x i16> @udistribute_const1_v8i8(<8 x i8> %src1, <8 x i8> %mul) {
2789; CHECK-NEON-LABEL: udistribute_const1_v8i8:
2790; CHECK-NEON:       // %bb.0: // %entry
2791; CHECK-NEON-NEXT:    movi v2.8b, #10
2792; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
2793; CHECK-NEON-NEXT:    umlal v0.8h, v2.8b, v1.8b
2794; CHECK-NEON-NEXT:    ret
2795;
2796; CHECK-SVE-LABEL: udistribute_const1_v8i8:
2797; CHECK-SVE:       // %bb.0: // %entry
2798; CHECK-SVE-NEXT:    movi v2.8b, #10
2799; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v1.8b
2800; CHECK-SVE-NEXT:    umlal v0.8h, v2.8b, v1.8b
2801; CHECK-SVE-NEXT:    ret
2802;
2803; CHECK-GI-LABEL: udistribute_const1_v8i8:
2804; CHECK-GI:       // %bb.0: // %entry
2805; CHECK-GI-NEXT:    movi v2.8h, #10
2806; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
2807; CHECK-GI-NEXT:    uaddw v0.8h, v2.8h, v0.8b
2808; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
2809; CHECK-GI-NEXT:    ret
2810entry:
2811  %4 = zext <8 x i8> %src1 to <8 x i16>
2812  %5 = zext <8 x i8> %mul to <8 x i16>
2813  %8 = add nuw nsw <8 x i16> %4, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
2814  %9 = mul <8 x i16> %8, %5
2815  ret <8 x i16> %9
2816}
2817
; udistribute_const2_v8i8: computes (zext(src1) + zext(src2)) * splat(10), with
; both inputs <8 x i8> and the arithmetic done in <8 x i16>. The add carries
; the nuw and nsw flags.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the multiply is distributed over the add;
;     the constant is materialised as a narrow 8b vector (movi #10) and the
;     result is umull(src1, 10) followed by umlal(src2, 10).
;   * GlobalISel configuration: the constant is materialised as a wide 8h
;     vector, the sources are summed with uaddl, and the product is a plain
;     8h mul.
2818define <8 x i16> @udistribute_const2_v8i8(<8 x i8> %src1, <8 x i8> %src2) {
2819; CHECK-NEON-LABEL: udistribute_const2_v8i8:
2820; CHECK-NEON:       // %bb.0: // %entry
2821; CHECK-NEON-NEXT:    movi v2.8b, #10
2822; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v2.8b
2823; CHECK-NEON-NEXT:    umlal v0.8h, v1.8b, v2.8b
2824; CHECK-NEON-NEXT:    ret
2825;
2826; CHECK-SVE-LABEL: udistribute_const2_v8i8:
2827; CHECK-SVE:       // %bb.0: // %entry
2828; CHECK-SVE-NEXT:    movi v2.8b, #10
2829; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v2.8b
2830; CHECK-SVE-NEXT:    umlal v0.8h, v1.8b, v2.8b
2831; CHECK-SVE-NEXT:    ret
2832;
2833; CHECK-GI-LABEL: udistribute_const2_v8i8:
2834; CHECK-GI:       // %bb.0: // %entry
2835; CHECK-GI-NEXT:    movi v2.8h, #10
2836; CHECK-GI-NEXT:    uaddl v0.8h, v0.8b, v1.8b
2837; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v2.8h
2838; CHECK-GI-NEXT:    ret
2839entry:
2840  %4 = zext <8 x i8> %src1 to <8 x i16>
2841  %5 = zext <8 x i8> %src2 to <8 x i16>
2842  %8 = add nuw nsw <8 x i16> %4, %5
2843  %9 = mul <8 x i16> %8, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
2844  ret <8 x i16> %9
2845}
2846
2847
; smulladdl_v2i32_v2i64: computes sext(A) * sext(B) + sext(C), where all three
; inputs are <2 x i32> and the arithmetic is done in <2 x i64>.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: smull for the product, then saddw to add
;     the narrow %C operand.
;   * GlobalISel configuration: %C is widened first with "sshll #0", the
;     product is accumulated into it with smlal, and the result is copied to v0.
2848define <2 x i64> @smulladdl_v2i32_v2i64(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) nounwind {
2849; CHECK-NEON-LABEL: smulladdl_v2i32_v2i64:
2850; CHECK-NEON:       // %bb.0:
2851; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
2852; CHECK-NEON-NEXT:    saddw v0.2d, v0.2d, v2.2s
2853; CHECK-NEON-NEXT:    ret
2854;
2855; CHECK-SVE-LABEL: smulladdl_v2i32_v2i64:
2856; CHECK-SVE:       // %bb.0:
2857; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
2858; CHECK-SVE-NEXT:    saddw v0.2d, v0.2d, v2.2s
2859; CHECK-SVE-NEXT:    ret
2860;
2861; CHECK-GI-LABEL: smulladdl_v2i32_v2i64:
2862; CHECK-GI:       // %bb.0:
2863; CHECK-GI-NEXT:    sshll v2.2d, v2.2s, #0
2864; CHECK-GI-NEXT:    smlal v2.2d, v0.2s, v1.2s
2865; CHECK-GI-NEXT:    mov v0.16b, v2.16b
2866; CHECK-GI-NEXT:    ret
2867  %tmp1 = sext <2 x i32> %A to <2 x i64>
2868  %tmp2 = sext <2 x i32> %B to <2 x i64>
2869  %tmp3 = sext <2 x i32> %C to <2 x i64>
2870  %tmp4 = mul <2 x i64> %tmp1, %tmp2
2871  %tmp5 = add <2 x i64> %tmp4, %tmp3
2872  ret <2 x i64> %tmp5
2873}
2874
; umulladdl_v2i32_v2i64: computes zext(A) * zext(B) + zext(C), where all three
; inputs are <2 x i32> and the arithmetic is done in <2 x i64>.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: umull for the product, then uaddw to add
;     the narrow %C operand.
;   * GlobalISel configuration: %C is widened first with "ushll #0", the
;     product is accumulated into it with umlal, and the result is copied to v0.
2875define <2 x i64> @umulladdl_v2i32_v2i64(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) nounwind {
2876; CHECK-NEON-LABEL: umulladdl_v2i32_v2i64:
2877; CHECK-NEON:       // %bb.0:
2878; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
2879; CHECK-NEON-NEXT:    uaddw v0.2d, v0.2d, v2.2s
2880; CHECK-NEON-NEXT:    ret
2881;
2882; CHECK-SVE-LABEL: umulladdl_v2i32_v2i64:
2883; CHECK-SVE:       // %bb.0:
2884; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
2885; CHECK-SVE-NEXT:    uaddw v0.2d, v0.2d, v2.2s
2886; CHECK-SVE-NEXT:    ret
2887;
2888; CHECK-GI-LABEL: umulladdl_v2i32_v2i64:
2889; CHECK-GI:       // %bb.0:
2890; CHECK-GI-NEXT:    ushll v2.2d, v2.2s, #0
2891; CHECK-GI-NEXT:    umlal v2.2d, v0.2s, v1.2s
2892; CHECK-GI-NEXT:    mov v0.16b, v2.16b
2893; CHECK-GI-NEXT:    ret
2894  %tmp1 = zext <2 x i32> %A to <2 x i64>
2895  %tmp2 = zext <2 x i32> %B to <2 x i64>
2896  %tmp3 = zext <2 x i32> %C to <2 x i64>
2897  %tmp4 = mul <2 x i64> %tmp1, %tmp2
2898  %tmp5 = add <2 x i64> %tmp4, %tmp3
2899  ret <2 x i64> %tmp5
2900}
2901
; smlall_v2i32_v2i64: computes sext(A) * sext(B) + C, where %A and %B are
; <2 x i32> and the accumulator %C is already <2 x i64>.
; All three configurations share one set of assertions: a single smlal that
; accumulates into v2, followed by a copy of v2 into the return register v0.
2902define <2 x i64> @smlall_v2i32_v2i64(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
2903; CHECK-LABEL: smlall_v2i32_v2i64:
2904; CHECK:       // %bb.0:
2905; CHECK-NEXT:    smlal v2.2d, v0.2s, v1.2s
2906; CHECK-NEXT:    mov v0.16b, v2.16b
2907; CHECK-NEXT:    ret
2908  %tmp1 = sext <2 x i32> %A to <2 x i64>
2909  %tmp2 = sext <2 x i32> %B to <2 x i64>
2910  %tmp4 = mul <2 x i64> %tmp1, %tmp2
2911  %tmp5 = add <2 x i64> %tmp4, %C
2912  ret <2 x i64> %tmp5
2913}
2914
; umlall_v2i32_v2i64: computes zext(A) * zext(B) + C, where %A and %B are
; <2 x i32> and the accumulator %C is already <2 x i64>.
; All three configurations share one set of assertions: a single umlal that
; accumulates into v2, followed by a copy of v2 into the return register v0.
2915define <2 x i64> @umlall_v2i32_v2i64(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
2916; CHECK-LABEL: umlall_v2i32_v2i64:
2917; CHECK:       // %bb.0:
2918; CHECK-NEXT:    umlal v2.2d, v0.2s, v1.2s
2919; CHECK-NEXT:    mov v0.16b, v2.16b
2920; CHECK-NEXT:    ret
2921  %tmp1 = zext <2 x i32> %A to <2 x i64>
2922  %tmp2 = zext <2 x i32> %B to <2 x i64>
2923  %tmp4 = mul <2 x i64> %tmp1, %tmp2
2924  %tmp5 = add <2 x i64> %tmp4, %C
2925  ret <2 x i64> %tmp5
2926}
2927
; smulladdl_const_v2i32_v2i64: computes sext(A) * splat(10) + sext(C), with %A
; and %C being <2 x i32> and the arithmetic done in <2 x i64>.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the constant is materialised as a narrow
;     2s vector (movi #10) so the product can use smull; %C is added with saddw.
;   * GlobalISel configuration: the constant vector is loaded from a constant
;     pool entry (adrp + ldr), %A is widened with "sshll #0", the multiply is
;     done lane by lane with scalar mul instructions, and %C is added with
;     saddw.
2928define <2 x i64> @smulladdl_const_v2i32_v2i64(<2 x i32> %A, <2 x i32> %C) nounwind {
2929; CHECK-NEON-LABEL: smulladdl_const_v2i32_v2i64:
2930; CHECK-NEON:       // %bb.0:
2931; CHECK-NEON-NEXT:    movi v2.2s, #10
2932; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v2.2s
2933; CHECK-NEON-NEXT:    saddw v0.2d, v0.2d, v1.2s
2934; CHECK-NEON-NEXT:    ret
2935;
2936; CHECK-SVE-LABEL: smulladdl_const_v2i32_v2i64:
2937; CHECK-SVE:       // %bb.0:
2938; CHECK-SVE-NEXT:    movi v2.2s, #10
2939; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v2.2s
2940; CHECK-SVE-NEXT:    saddw v0.2d, v0.2d, v1.2s
2941; CHECK-SVE-NEXT:    ret
2942;
2943; CHECK-GI-LABEL: smulladdl_const_v2i32_v2i64:
2944; CHECK-GI:       // %bb.0:
2945; CHECK-GI-NEXT:    adrp x8, .LCPI98_0
2946; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
2947; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI98_0]
2948; CHECK-GI-NEXT:    fmov x8, d0
2949; CHECK-GI-NEXT:    fmov x9, d2
2950; CHECK-GI-NEXT:    mov x10, v0.d[1]
2951; CHECK-GI-NEXT:    mov x11, v2.d[1]
2952; CHECK-GI-NEXT:    mul x8, x8, x9
2953; CHECK-GI-NEXT:    mul x9, x10, x11
2954; CHECK-GI-NEXT:    mov v0.d[0], x8
2955; CHECK-GI-NEXT:    mov v0.d[1], x9
2956; CHECK-GI-NEXT:    saddw v0.2d, v0.2d, v1.2s
2957; CHECK-GI-NEXT:    ret
2958  %tmp1 = sext <2 x i32> %A to <2 x i64>
2959  %tmp3 = sext <2 x i32> %C to <2 x i64>
2960  %tmp4 = mul <2 x i64> %tmp1, <i64 10, i64 10>
2961  %tmp5 = add <2 x i64> %tmp4, %tmp3
2962  ret <2 x i64> %tmp5
2963}
2964
; umulladdl_const_v2i32_v2i64: computes zext(A) * splat(10) + zext(C), with %A
; and %C being <2 x i32> and the arithmetic done in <2 x i64>.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the constant is materialised as a narrow
;     2s vector (movi #10) so the product can use umull; %C is added with uaddw.
;   * GlobalISel configuration: the constant vector is loaded from a constant
;     pool entry (adrp + ldr), %A is widened with "ushll #0", the multiply is
;     done lane by lane with scalar mul instructions, and %C is added with
;     uaddw.
2965define <2 x i64> @umulladdl_const_v2i32_v2i64(<2 x i32> %A, <2 x i32> %C) nounwind {
2966; CHECK-NEON-LABEL: umulladdl_const_v2i32_v2i64:
2967; CHECK-NEON:       // %bb.0:
2968; CHECK-NEON-NEXT:    movi v2.2s, #10
2969; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v2.2s
2970; CHECK-NEON-NEXT:    uaddw v0.2d, v0.2d, v1.2s
2971; CHECK-NEON-NEXT:    ret
2972;
2973; CHECK-SVE-LABEL: umulladdl_const_v2i32_v2i64:
2974; CHECK-SVE:       // %bb.0:
2975; CHECK-SVE-NEXT:    movi v2.2s, #10
2976; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v2.2s
2977; CHECK-SVE-NEXT:    uaddw v0.2d, v0.2d, v1.2s
2978; CHECK-SVE-NEXT:    ret
2979;
2980; CHECK-GI-LABEL: umulladdl_const_v2i32_v2i64:
2981; CHECK-GI:       // %bb.0:
2982; CHECK-GI-NEXT:    adrp x8, .LCPI99_0
2983; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
2984; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI99_0]
2985; CHECK-GI-NEXT:    fmov x8, d0
2986; CHECK-GI-NEXT:    fmov x9, d2
2987; CHECK-GI-NEXT:    mov x10, v0.d[1]
2988; CHECK-GI-NEXT:    mov x11, v2.d[1]
2989; CHECK-GI-NEXT:    mul x8, x8, x9
2990; CHECK-GI-NEXT:    mul x9, x10, x11
2991; CHECK-GI-NEXT:    mov v0.d[0], x8
2992; CHECK-GI-NEXT:    mov v0.d[1], x9
2993; CHECK-GI-NEXT:    uaddw v0.2d, v0.2d, v1.2s
2994; CHECK-GI-NEXT:    ret
2995  %tmp1 = zext <2 x i32> %A to <2 x i64>
2996  %tmp3 = zext <2 x i32> %C to <2 x i64>
2997  %tmp4 = mul <2 x i64> %tmp1, <i64 10, i64 10>
2998  %tmp5 = add <2 x i64> %tmp4, %tmp3
2999  ret <2 x i64> %tmp5
3000}
3001
; sdistribute_v2i32: computes (sext(src1) + sext(src2)) * sext(mul), with all
; inputs <2 x i32> and the arithmetic done in <2 x i64>. The add carries the
; nuw and nsw flags.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the multiply is distributed over the add,
;     giving smull(src1, mul) followed by smlal(src2, mul).
;   * GlobalISel configuration: the expression is kept in its original shape,
;     i.e. "sshll #0" of %mul and saddl of the two sources, after which the
;     multiply is done lane by lane with scalar mul instructions.
3002define <2 x i64> @sdistribute_v2i32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32> %mul) {
3003; CHECK-NEON-LABEL: sdistribute_v2i32:
3004; CHECK-NEON:       // %bb.0: // %entry
3005; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v2.2s
3006; CHECK-NEON-NEXT:    smlal v0.2d, v1.2s, v2.2s
3007; CHECK-NEON-NEXT:    ret
3008;
3009; CHECK-SVE-LABEL: sdistribute_v2i32:
3010; CHECK-SVE:       // %bb.0: // %entry
3011; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v2.2s
3012; CHECK-SVE-NEXT:    smlal v0.2d, v1.2s, v2.2s
3013; CHECK-SVE-NEXT:    ret
3014;
3015; CHECK-GI-LABEL: sdistribute_v2i32:
3016; CHECK-GI:       // %bb.0: // %entry
3017; CHECK-GI-NEXT:    sshll v2.2d, v2.2s, #0
3018; CHECK-GI-NEXT:    saddl v0.2d, v0.2s, v1.2s
3019; CHECK-GI-NEXT:    fmov x8, d0
3020; CHECK-GI-NEXT:    fmov x9, d2
3021; CHECK-GI-NEXT:    mov x10, v0.d[1]
3022; CHECK-GI-NEXT:    mov x11, v2.d[1]
3023; CHECK-GI-NEXT:    mul x8, x8, x9
3024; CHECK-GI-NEXT:    mul x9, x10, x11
3025; CHECK-GI-NEXT:    mov v0.d[0], x8
3026; CHECK-GI-NEXT:    mov v0.d[1], x9
3027; CHECK-GI-NEXT:    ret
3028entry:
3029  %4 = sext <2 x i32> %src1 to <2 x i64>
3030  %5 = sext <2 x i32> %mul to <2 x i64>
3031  %7 = sext <2 x i32> %src2 to <2 x i64>
3032  %8 = add nuw nsw <2 x i64> %4, %7
3033  %9 = mul <2 x i64> %8, %5
3034  ret <2 x i64> %9
3035}
3036
; sdistribute_const1_v2i32: computes (sext(src1) + splat(10)) * sext(mul), with
; %src1 and %mul being <2 x i32> and the arithmetic done in <2 x i64>. The add
; carries the nuw and nsw flags.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the multiply is distributed over the add;
;     the constant is materialised as a narrow 2s vector (movi #10) and the
;     result is smull(src1, mul) followed by smlal(10, mul).
;   * GlobalISel configuration: the constant vector is loaded from a constant
;     pool entry (adrp + ldr), %mul is widened with "sshll #0", %src1 is added
;     to the constant with saddw, and the multiply is done lane by lane with
;     scalar mul instructions.
3037define <2 x i64> @sdistribute_const1_v2i32(<2 x i32> %src1, <2 x i32> %mul) {
3038; CHECK-NEON-LABEL: sdistribute_const1_v2i32:
3039; CHECK-NEON:       // %bb.0: // %entry
3040; CHECK-NEON-NEXT:    movi v2.2s, #10
3041; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
3042; CHECK-NEON-NEXT:    smlal v0.2d, v2.2s, v1.2s
3043; CHECK-NEON-NEXT:    ret
3044;
3045; CHECK-SVE-LABEL: sdistribute_const1_v2i32:
3046; CHECK-SVE:       // %bb.0: // %entry
3047; CHECK-SVE-NEXT:    movi v2.2s, #10
3048; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
3049; CHECK-SVE-NEXT:    smlal v0.2d, v2.2s, v1.2s
3050; CHECK-SVE-NEXT:    ret
3051;
3052; CHECK-GI-LABEL: sdistribute_const1_v2i32:
3053; CHECK-GI:       // %bb.0: // %entry
3054; CHECK-GI-NEXT:    adrp x8, .LCPI101_0
3055; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
3056; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI101_0]
3057; CHECK-GI-NEXT:    saddw v0.2d, v2.2d, v0.2s
3058; CHECK-GI-NEXT:    fmov x9, d1
3059; CHECK-GI-NEXT:    mov x11, v1.d[1]
3060; CHECK-GI-NEXT:    fmov x8, d0
3061; CHECK-GI-NEXT:    mov x10, v0.d[1]
3062; CHECK-GI-NEXT:    mul x8, x8, x9
3063; CHECK-GI-NEXT:    mul x9, x10, x11
3064; CHECK-GI-NEXT:    mov v0.d[0], x8
3065; CHECK-GI-NEXT:    mov v0.d[1], x9
3066; CHECK-GI-NEXT:    ret
3067entry:
3068  %4 = sext <2 x i32> %src1 to <2 x i64>
3069  %5 = sext <2 x i32> %mul to <2 x i64>
3070  %8 = add nuw nsw <2 x i64> %4, <i64 10, i64 10>
3071  %9 = mul <2 x i64> %8, %5
3072  ret <2 x i64> %9
3073}
3074
; sdistribute_const2_v2i32: computes (sext(src1) + sext(src2)) * splat(10),
; with both inputs <2 x i32> and the arithmetic done in <2 x i64>. The add
; carries the nuw and nsw flags.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the multiply is distributed over the add;
;     the constant is materialised as a narrow 2s vector (movi #10) and the
;     result is smull(src1, 10) followed by smlal(src2, 10).
;   * GlobalISel configuration: the sources are summed with saddl, the constant
;     vector is loaded from a constant pool entry (adrp + ldr), and the
;     multiply is done lane by lane with scalar mul instructions.
3075define <2 x i64> @sdistribute_const2_v2i32(<2 x i32> %src1, <2 x i32> %src2) {
3076; CHECK-NEON-LABEL: sdistribute_const2_v2i32:
3077; CHECK-NEON:       // %bb.0: // %entry
3078; CHECK-NEON-NEXT:    movi v2.2s, #10
3079; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v2.2s
3080; CHECK-NEON-NEXT:    smlal v0.2d, v1.2s, v2.2s
3081; CHECK-NEON-NEXT:    ret
3082;
3083; CHECK-SVE-LABEL: sdistribute_const2_v2i32:
3084; CHECK-SVE:       // %bb.0: // %entry
3085; CHECK-SVE-NEXT:    movi v2.2s, #10
3086; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v2.2s
3087; CHECK-SVE-NEXT:    smlal v0.2d, v1.2s, v2.2s
3088; CHECK-SVE-NEXT:    ret
3089;
3090; CHECK-GI-LABEL: sdistribute_const2_v2i32:
3091; CHECK-GI:       // %bb.0: // %entry
3092; CHECK-GI-NEXT:    adrp x8, .LCPI102_0
3093; CHECK-GI-NEXT:    saddl v0.2d, v0.2s, v1.2s
3094; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI102_0]
3095; CHECK-GI-NEXT:    fmov x8, d0
3096; CHECK-GI-NEXT:    fmov x9, d1
3097; CHECK-GI-NEXT:    mov x10, v0.d[1]
3098; CHECK-GI-NEXT:    mov x11, v1.d[1]
3099; CHECK-GI-NEXT:    mul x8, x8, x9
3100; CHECK-GI-NEXT:    mul x9, x10, x11
3101; CHECK-GI-NEXT:    mov v0.d[0], x8
3102; CHECK-GI-NEXT:    mov v0.d[1], x9
3103; CHECK-GI-NEXT:    ret
3104entry:
3105  %4 = sext <2 x i32> %src1 to <2 x i64>
3106  %5 = sext <2 x i32> %src2 to <2 x i64>
3107  %8 = add nuw nsw <2 x i64> %4, %5
3108  %9 = mul <2 x i64> %8, <i64 10, i64 10>
3109  ret <2 x i64> %9
3110}
3111
; udistribute_v2i32: computes (zext(src1) + zext(src2)) * zext(mul), with all
; inputs <2 x i32> and the arithmetic done in <2 x i64>. The add carries the
; nuw and nsw flags.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the multiply is distributed over the add,
;     giving umull(src1, mul) followed by umlal(src2, mul).
;   * GlobalISel configuration: the expression is kept in its original shape,
;     i.e. "ushll #0" of %mul and uaddl of the two sources, after which the
;     multiply is done lane by lane with scalar mul instructions.
3112define <2 x i64> @udistribute_v2i32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32> %mul) {
3113; CHECK-NEON-LABEL: udistribute_v2i32:
3114; CHECK-NEON:       // %bb.0: // %entry
3115; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v2.2s
3116; CHECK-NEON-NEXT:    umlal v0.2d, v1.2s, v2.2s
3117; CHECK-NEON-NEXT:    ret
3118;
3119; CHECK-SVE-LABEL: udistribute_v2i32:
3120; CHECK-SVE:       // %bb.0: // %entry
3121; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v2.2s
3122; CHECK-SVE-NEXT:    umlal v0.2d, v1.2s, v2.2s
3123; CHECK-SVE-NEXT:    ret
3124;
3125; CHECK-GI-LABEL: udistribute_v2i32:
3126; CHECK-GI:       // %bb.0: // %entry
3127; CHECK-GI-NEXT:    ushll v2.2d, v2.2s, #0
3128; CHECK-GI-NEXT:    uaddl v0.2d, v0.2s, v1.2s
3129; CHECK-GI-NEXT:    fmov x8, d0
3130; CHECK-GI-NEXT:    fmov x9, d2
3131; CHECK-GI-NEXT:    mov x10, v0.d[1]
3132; CHECK-GI-NEXT:    mov x11, v2.d[1]
3133; CHECK-GI-NEXT:    mul x8, x8, x9
3134; CHECK-GI-NEXT:    mul x9, x10, x11
3135; CHECK-GI-NEXT:    mov v0.d[0], x8
3136; CHECK-GI-NEXT:    mov v0.d[1], x9
3137; CHECK-GI-NEXT:    ret
3138entry:
3139  %4 = zext <2 x i32> %src1 to <2 x i64>
3140  %5 = zext <2 x i32> %mul to <2 x i64>
3141  %7 = zext <2 x i32> %src2 to <2 x i64>
3142  %8 = add nuw nsw <2 x i64> %4, %7
3143  %9 = mul <2 x i64> %8, %5
3144  ret <2 x i64> %9
3145}
3146
; udistribute_const1_v2i32: computes (zext(src1) + splat(10)) * zext(mul), with
; %src1 and %mul being <2 x i32> and the arithmetic done in <2 x i64>. The add
; carries the nuw and nsw flags.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the multiply is distributed over the add;
;     the constant is materialised as a narrow 2s vector (movi #10) and the
;     result is umull(src1, mul) followed by umlal(10, mul).
;   * GlobalISel configuration: the constant vector is loaded from a constant
;     pool entry (adrp + ldr), %mul is widened with "ushll #0", %src1 is added
;     to the constant with uaddw, and the multiply is done lane by lane with
;     scalar mul instructions.
3147define <2 x i64> @udistribute_const1_v2i32(<2 x i32> %src1, <2 x i32> %mul) {
3148; CHECK-NEON-LABEL: udistribute_const1_v2i32:
3149; CHECK-NEON:       // %bb.0: // %entry
3150; CHECK-NEON-NEXT:    movi v2.2s, #10
3151; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
3152; CHECK-NEON-NEXT:    umlal v0.2d, v2.2s, v1.2s
3153; CHECK-NEON-NEXT:    ret
3154;
3155; CHECK-SVE-LABEL: udistribute_const1_v2i32:
3156; CHECK-SVE:       // %bb.0: // %entry
3157; CHECK-SVE-NEXT:    movi v2.2s, #10
3158; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
3159; CHECK-SVE-NEXT:    umlal v0.2d, v2.2s, v1.2s
3160; CHECK-SVE-NEXT:    ret
3161;
3162; CHECK-GI-LABEL: udistribute_const1_v2i32:
3163; CHECK-GI:       // %bb.0: // %entry
3164; CHECK-GI-NEXT:    adrp x8, .LCPI104_0
3165; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
3166; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI104_0]
3167; CHECK-GI-NEXT:    uaddw v0.2d, v2.2d, v0.2s
3168; CHECK-GI-NEXT:    fmov x9, d1
3169; CHECK-GI-NEXT:    mov x11, v1.d[1]
3170; CHECK-GI-NEXT:    fmov x8, d0
3171; CHECK-GI-NEXT:    mov x10, v0.d[1]
3172; CHECK-GI-NEXT:    mul x8, x8, x9
3173; CHECK-GI-NEXT:    mul x9, x10, x11
3174; CHECK-GI-NEXT:    mov v0.d[0], x8
3175; CHECK-GI-NEXT:    mov v0.d[1], x9
3176; CHECK-GI-NEXT:    ret
3177entry:
3178  %4 = zext <2 x i32> %src1 to <2 x i64>
3179  %5 = zext <2 x i32> %mul to <2 x i64>
3180  %8 = add nuw nsw <2 x i64> %4, <i64 10, i64 10>
3181  %9 = mul <2 x i64> %8, %5
3182  ret <2 x i64> %9
3183}
3184
; udistribute_const2_v2i32: computes (zext(src1) + zext(src2)) * splat(10),
; with both inputs <2 x i32> and the arithmetic done in <2 x i64>. The add
; carries the nuw and nsw flags.
; Expected code, as pinned by the assertions below:
;   * +neon and +sve configurations: the multiply is distributed over the add;
;     the constant is materialised as a narrow 2s vector (movi #10) and the
;     result is umull(src1, 10) followed by umlal(src2, 10).
;   * GlobalISel configuration: the sources are summed with uaddl, the constant
;     vector is loaded from a constant pool entry (adrp + ldr), and the
;     multiply is done lane by lane with scalar mul instructions.
3185define <2 x i64> @udistribute_const2_v2i32(<2 x i32> %src1, <2 x i32> %src2) {
3186; CHECK-NEON-LABEL: udistribute_const2_v2i32:
3187; CHECK-NEON:       // %bb.0: // %entry
3188; CHECK-NEON-NEXT:    movi v2.2s, #10
3189; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v2.2s
3190; CHECK-NEON-NEXT:    umlal v0.2d, v1.2s, v2.2s
3191; CHECK-NEON-NEXT:    ret
3192;
3193; CHECK-SVE-LABEL: udistribute_const2_v2i32:
3194; CHECK-SVE:       // %bb.0: // %entry
3195; CHECK-SVE-NEXT:    movi v2.2s, #10
3196; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v2.2s
3197; CHECK-SVE-NEXT:    umlal v0.2d, v1.2s, v2.2s
3198; CHECK-SVE-NEXT:    ret
3199;
3200; CHECK-GI-LABEL: udistribute_const2_v2i32:
3201; CHECK-GI:       // %bb.0: // %entry
3202; CHECK-GI-NEXT:    adrp x8, .LCPI105_0
3203; CHECK-GI-NEXT:    uaddl v0.2d, v0.2s, v1.2s
3204; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI105_0]
3205; CHECK-GI-NEXT:    fmov x8, d0
3206; CHECK-GI-NEXT:    fmov x9, d1
3207; CHECK-GI-NEXT:    mov x10, v0.d[1]
3208; CHECK-GI-NEXT:    mov x11, v1.d[1]
3209; CHECK-GI-NEXT:    mul x8, x8, x9
3210; CHECK-GI-NEXT:    mul x9, x10, x11
3211; CHECK-GI-NEXT:    mov v0.d[0], x8
3212; CHECK-GI-NEXT:    mov v0.d[1], x9
3213; CHECK-GI-NEXT:    ret
3214entry:
3215  %4 = zext <2 x i32> %src1 to <2 x i64>
3216  %5 = zext <2 x i32> %src2 to <2 x i64>
3217  %8 = add nuw nsw <2 x i64> %4, %5
3218  %9 = mul <2 x i64> %8, <i64 10, i64 10>
3219  ret <2 x i64> %9
3220}
3221
3222declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
3223declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
3224declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
3225declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
3226declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
3227declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
3228