xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-vmulh.ll (revision dfe11c00212405f1da1f09d7c1125d7661e8da5a)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
3
; Signed multiply-high on <2 x i32>: sign-extend both operands to i64,
; multiply, arithmetic-shift right by 32 and truncate back to i32.
; The expected output below does not use vmulh; it uses vmullb.s32 followed by
; scalar lane moves and asrs.
4define arm_aapcs_vfpcc <2 x i32> @vmulhs_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
5; CHECK-LABEL: vmulhs_v2i32:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    vmullb.s32 q2, q0, q1
8; CHECK-NEXT:    vmov r0, s11
9; CHECK-NEXT:    vmov r1, s9
10; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
11; CHECK-NEXT:    asrs r0, r0, #31
12; CHECK-NEXT:    asrs r1, r1, #31
13; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
14; CHECK-NEXT:    bx lr
15entry:
16  %s0s = sext <2 x i32> %s0 to <2 x i64>
17  %s1s = sext <2 x i32> %s1 to <2 x i64>
18  %m = mul <2 x i64> %s0s, %s1s
19  %s = ashr <2 x i64> %m, <i64 32, i64 32>
20  %s2 = trunc <2 x i64> %s to <2 x i32>
21  ret <2 x i32> %s2
22}
23
; Unsigned multiply-high on <2 x i32>: zero-extend both operands to i64,
; multiply, logical-shift right by 32 and truncate back to i32.
; The expected output below uses vmullb.u32 and fills the remaining lanes from
; a zero constant-pool entry (.LCPI1_0) instead of using vmulh.
24define arm_aapcs_vfpcc <2 x i32> @vmulhu_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
25; CHECK-LABEL: vmulhu_v2i32:
26; CHECK:       @ %bb.0: @ %entry
27; CHECK-NEXT:    vmullb.u32 q2, q0, q1
28; CHECK-NEXT:    vldr s1, .LCPI1_0
29; CHECK-NEXT:    vmov.f32 s0, s9
30; CHECK-NEXT:    vmov.f32 s2, s11
31; CHECK-NEXT:    vmov.f32 s3, s1
32; CHECK-NEXT:    bx lr
33; CHECK-NEXT:    .p2align 2
34; CHECK-NEXT:  @ %bb.1:
35; CHECK-NEXT:  .LCPI1_0:
36; CHECK-NEXT:    .long 0x00000000 @ float 0
37entry:
38  %s0s = zext <2 x i32> %s0 to <2 x i64>
39  %s1s = zext <2 x i32> %s1 to <2 x i64>
40  %m = mul <2 x i64> %s0s, %s1s
41  %s = lshr <2 x i64> %m, <i64 32, i64 32>
42  %s2 = trunc <2 x i64> %s to <2 x i32>
43  ret <2 x i32> %s2
44}
45
; Signed multiply-high on a full <4 x i32> vector: sext to i64, mul, ashr by 32,
; trunc to i32. The expected output is a single vmulh.s32.
46define arm_aapcs_vfpcc <4 x i32> @vmulhs_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
47; CHECK-LABEL: vmulhs_v4i32:
48; CHECK:       @ %bb.0: @ %entry
49; CHECK-NEXT:    vmulh.s32 q0, q0, q1
50; CHECK-NEXT:    bx lr
51entry:
52  %s0s = sext <4 x i32> %s0 to <4 x i64>
53  %s1s = sext <4 x i32> %s1 to <4 x i64>
54  %m = mul <4 x i64> %s0s, %s1s
55  %s = ashr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
56  %s2 = trunc <4 x i64> %s to <4 x i32>
57  ret <4 x i32> %s2
58}
59
; Unsigned multiply-high on a full <4 x i32> vector: zext to i64, mul, lshr by
; 32, trunc to i32. The expected output is a single vmulh.u32.
60define arm_aapcs_vfpcc <4 x i32> @vmulhu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
61; CHECK-LABEL: vmulhu_v4i32:
62; CHECK:       @ %bb.0: @ %entry
63; CHECK-NEXT:    vmulh.u32 q0, q0, q1
64; CHECK-NEXT:    bx lr
65entry:
66  %s0s = zext <4 x i32> %s0 to <4 x i64>
67  %s1s = zext <4 x i32> %s1 to <4 x i64>
68  %m = mul <4 x i64> %s0s, %s1s
69  %s = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
70  %s2 = trunc <4 x i64> %s to <4 x i32>
71  ret <4 x i32> %s2
72}
73
; Signed multiply-high on <4 x i16>: sext to i32, mul, ashr by 16, trunc to i16.
; The expected output uses a widening vmullb.s16 followed by vshr.s32 #16,
; not vmulh.
74define arm_aapcs_vfpcc <4 x i16> @vmulhs_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
75; CHECK-LABEL: vmulhs_v4i16:
76; CHECK:       @ %bb.0: @ %entry
77; CHECK-NEXT:    vmullb.s16 q0, q0, q1
78; CHECK-NEXT:    vshr.s32 q0, q0, #16
79; CHECK-NEXT:    bx lr
80entry:
81  %s0s = sext <4 x i16> %s0 to <4 x i32>
82  %s1s = sext <4 x i16> %s1 to <4 x i32>
83  %m = mul <4 x i32> %s0s, %s1s
84  %s = ashr <4 x i32> %m, <i32 16, i32 16, i32 16, i32 16>
85  %s2 = trunc <4 x i32> %s to <4 x i16>
86  ret <4 x i16> %s2
87}
88
; Unsigned multiply-high on <4 x i16>: zext to i32, mul, lshr by 16, trunc to
; i16. The expected output uses a widening vmullb.u16 followed by
; vshr.u32 #16, not vmulh.
89define arm_aapcs_vfpcc <4 x i16> @vmulhu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
90; CHECK-LABEL: vmulhu_v4i16:
91; CHECK:       @ %bb.0: @ %entry
92; CHECK-NEXT:    vmullb.u16 q0, q0, q1
93; CHECK-NEXT:    vshr.u32 q0, q0, #16
94; CHECK-NEXT:    bx lr
95entry:
96  %s0s = zext <4 x i16> %s0 to <4 x i32>
97  %s1s = zext <4 x i16> %s1 to <4 x i32>
98  %m = mul <4 x i32> %s0s, %s1s
99  %s = lshr <4 x i32> %m, <i32 16, i32 16, i32 16, i32 16>
100  %s2 = trunc <4 x i32> %s to <4 x i16>
101  ret <4 x i16> %s2
102}
103
; Signed multiply-high on a full <8 x i16> vector: sext to i32, mul, ashr by 16,
; trunc to i16. The expected output is a single vmulh.s16.
104define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
105; CHECK-LABEL: vmulhs_v8i16:
106; CHECK:       @ %bb.0: @ %entry
107; CHECK-NEXT:    vmulh.s16 q0, q0, q1
108; CHECK-NEXT:    bx lr
109entry:
110  %s0s = sext <8 x i16> %s0 to <8 x i32>
111  %s1s = sext <8 x i16> %s1 to <8 x i32>
112  %m = mul <8 x i32> %s0s, %s1s
113  %s = ashr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
114  %s2 = trunc <8 x i32> %s to <8 x i16>
115  ret <8 x i16> %s2
116}
117
; Unsigned multiply-high on a full <8 x i16> vector: zext to i32, mul, lshr by
; 16, trunc to i16. The expected output is a single vmulh.u16.
118define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
119; CHECK-LABEL: vmulhu_v8i16:
120; CHECK:       @ %bb.0: @ %entry
121; CHECK-NEXT:    vmulh.u16 q0, q0, q1
122; CHECK-NEXT:    bx lr
123entry:
124  %s0s = zext <8 x i16> %s0 to <8 x i32>
125  %s1s = zext <8 x i16> %s1 to <8 x i32>
126  %m = mul <8 x i32> %s0s, %s1s
127  %s = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
128  %s2 = trunc <8 x i32> %s to <8 x i16>
129  ret <8 x i16> %s2
130}
131
; Signed multiply-high on <4 x i8>: sext to i16, mul, ashr by 8, trunc to i8.
; The expected output sign-extends both inputs with vmovlb.s8 / vmovlb.s16,
; multiplies with vmul.i32 and shifts with vshr.s32 #8; no vmulh is used.
132define arm_aapcs_vfpcc <4 x i8> @vmulhs_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
133; CHECK-LABEL: vmulhs_v4i8:
134; CHECK:       @ %bb.0: @ %entry
135; CHECK-NEXT:    vmovlb.s8 q1, q1
136; CHECK-NEXT:    vmovlb.s8 q0, q0
137; CHECK-NEXT:    vmovlb.s16 q1, q1
138; CHECK-NEXT:    vmovlb.s16 q0, q0
139; CHECK-NEXT:    vmul.i32 q0, q0, q1
140; CHECK-NEXT:    vshr.s32 q0, q0, #8
141; CHECK-NEXT:    bx lr
142entry:
143  %s0s = sext <4 x i8> %s0 to <4 x i16>
144  %s1s = sext <4 x i8> %s1 to <4 x i16>
145  %m = mul <4 x i16> %s0s, %s1s
146  %s = ashr <4 x i16> %m, <i16 8, i16 8, i16 8, i16 8>
147  %s2 = trunc <4 x i16> %s to <4 x i8>
148  ret <4 x i8> %s2
149}
150
; Unsigned multiply-high on <4 x i8>: zext to i16, mul, lshr by 8, trunc to i8.
; The expected output masks both inputs with 0xff (vmov.i32 + vand),
; multiplies with vmul.i32 and shifts with vshr.u32 #8; no vmulh is used.
151define arm_aapcs_vfpcc <4 x i8> @vmulhu_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
152; CHECK-LABEL: vmulhu_v4i8:
153; CHECK:       @ %bb.0: @ %entry
154; CHECK-NEXT:    vmov.i32 q2, #0xff
155; CHECK-NEXT:    vand q1, q1, q2
156; CHECK-NEXT:    vand q0, q0, q2
157; CHECK-NEXT:    vmul.i32 q0, q0, q1
158; CHECK-NEXT:    vshr.u32 q0, q0, #8
159; CHECK-NEXT:    bx lr
160entry:
161  %s0s = zext <4 x i8> %s0 to <4 x i16>
162  %s1s = zext <4 x i8> %s1 to <4 x i16>
163  %m = mul <4 x i16> %s0s, %s1s
164  %s = lshr <4 x i16> %m, <i16 8, i16 8, i16 8, i16 8>
165  %s2 = trunc <4 x i16> %s to <4 x i8>
166  ret <4 x i8> %s2
167}
168
; Signed multiply-high on <8 x i8>: sext to i16, mul, ashr by 8, trunc to i8.
; The expected output uses a widening vmullb.s8 followed by vshr.s16 #8,
; not vmulh.
169define arm_aapcs_vfpcc <8 x i8> @vmulhs_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
170; CHECK-LABEL: vmulhs_v8i8:
171; CHECK:       @ %bb.0: @ %entry
172; CHECK-NEXT:    vmullb.s8 q0, q0, q1
173; CHECK-NEXT:    vshr.s16 q0, q0, #8
174; CHECK-NEXT:    bx lr
175entry:
176  %s0s = sext <8 x i8> %s0 to <8 x i16>
177  %s1s = sext <8 x i8> %s1 to <8 x i16>
178  %m = mul <8 x i16> %s0s, %s1s
179  %s = ashr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
180  %s2 = trunc <8 x i16> %s to <8 x i8>
181  ret <8 x i8> %s2
182}
183
; Unsigned multiply-high on <8 x i8>: zext to i16, mul, lshr by 8, trunc to i8.
; The expected output uses a widening vmullb.u8 followed by vshr.u16 #8,
; not vmulh.
184define arm_aapcs_vfpcc <8 x i8> @vmulhu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
185; CHECK-LABEL: vmulhu_v8i8:
186; CHECK:       @ %bb.0: @ %entry
187; CHECK-NEXT:    vmullb.u8 q0, q0, q1
188; CHECK-NEXT:    vshr.u16 q0, q0, #8
189; CHECK-NEXT:    bx lr
190entry:
191  %s0s = zext <8 x i8> %s0 to <8 x i16>
192  %s1s = zext <8 x i8> %s1 to <8 x i16>
193  %m = mul <8 x i16> %s0s, %s1s
194  %s = lshr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
195  %s2 = trunc <8 x i16> %s to <8 x i8>
196  ret <8 x i8> %s2
197}
198
; Signed multiply-high on a full <16 x i8> vector: sext to i16, mul, ashr by 8,
; trunc to i8. The expected output is a single vmulh.s8.
199define arm_aapcs_vfpcc <16 x i8> @vmulhs_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
200; CHECK-LABEL: vmulhs_v16i8:
201; CHECK:       @ %bb.0: @ %entry
202; CHECK-NEXT:    vmulh.s8 q0, q0, q1
203; CHECK-NEXT:    bx lr
204entry:
205  %s0s = sext <16 x i8> %s0 to <16 x i16>
206  %s1s = sext <16 x i8> %s1 to <16 x i16>
207  %m = mul <16 x i16> %s0s, %s1s
208  %s = ashr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
209  %s2 = trunc <16 x i16> %s to <16 x i8>
210  ret <16 x i8> %s2
211}
212
; Unsigned multiply-high on a full <16 x i8> vector: zext to i16, mul, lshr by
; 8, trunc to i8. The expected output is a single vmulh.u8.
213define arm_aapcs_vfpcc <16 x i8> @vmulhu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
214; CHECK-LABEL: vmulhu_v16i8:
215; CHECK:       @ %bb.0: @ %entry
216; CHECK-NEXT:    vmulh.u8 q0, q0, q1
217; CHECK-NEXT:    bx lr
218entry:
219  %s0s = zext <16 x i8> %s0 to <16 x i16>
220  %s1s = zext <16 x i8> %s1 to <16 x i16>
221  %m = mul <16 x i16> %s0s, %s1s
222  %s = lshr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
223  %s2 = trunc <16 x i16> %s to <16 x i8>
224  ret <16 x i8> %s2
225}
226
; Loop form of the signed i8 multiply-high: each iteration loads 16 x i8 from
; %x and %y, computes the high byte of the sign-extended product and stores it
; to %z. The loop exits when the index reaches 1024; %n is not used.
; The expected output is a loop closed by `le` with lr preset to 64, containing
; a single vmulh.s8.
227define void @vmulh_s8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
228; CHECK-LABEL: vmulh_s8:
229; CHECK:       @ %bb.0: @ %entry
230; CHECK-NEXT:    .save {r7, lr}
231; CHECK-NEXT:    push {r7, lr}
232; CHECK-NEXT:    mov.w lr, #64
233; CHECK-NEXT:  .LBB14_1: @ %vector.body
234; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
235; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
236; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
237; CHECK-NEXT:    vmulh.s8 q0, q1, q0
238; CHECK-NEXT:    vstrb.8 q0, [r2], #16
239; CHECK-NEXT:    le lr, .LBB14_1
240; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
241; CHECK-NEXT:    pop {r7, pc}
242entry:
243  br label %vector.body
244
245vector.body:                                      ; preds = %vector.body, %entry
246  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
247  %0 = getelementptr inbounds i8, ptr %x, i32 %index
248  %wide.load = load <16 x i8>, ptr %0, align 1
249  %1 = sext <16 x i8> %wide.load to <16 x i16>
250  %2 = getelementptr inbounds i8, ptr %y, i32 %index
251  %wide.load17 = load <16 x i8>, ptr %2, align 1
252  %3 = sext <16 x i8> %wide.load17 to <16 x i16>
253  %4 = mul nsw <16 x i16> %3, %1
  ; lshr rather than ashr: the trunc below keeps only bits 8..15 of the
  ; product, which both shifts produce identically.
254  %5 = lshr <16 x i16> %4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
255  %6 = trunc <16 x i16> %5 to <16 x i8>
256  %7 = getelementptr inbounds i8, ptr %z, i32 %index
257  store <16 x i8> %6, ptr %7, align 1
258  %index.next = add i32 %index, 16
259  %8 = icmp eq i32 %index.next, 1024
260  br i1 %8, label %for.cond.cleanup, label %vector.body
261
262for.cond.cleanup:                                 ; preds = %vector.body
263  ret void
264}
265
; Loop form of the signed i16 multiply-high: each iteration loads 8 x i16 from
; %x and %y, computes the high half of the sign-extended product and stores it
; to %z. The loop exits when the index reaches 1024; %n is not used.
; The expected output is a loop closed by `le` with lr preset to 128,
; containing a single vmulh.s16.
266define void @vmulh_s16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
267; CHECK-LABEL: vmulh_s16:
268; CHECK:       @ %bb.0: @ %entry
269; CHECK-NEXT:    .save {r7, lr}
270; CHECK-NEXT:    push {r7, lr}
271; CHECK-NEXT:    mov.w lr, #128
272; CHECK-NEXT:  .LBB15_1: @ %vector.body
273; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
274; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
275; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
276; CHECK-NEXT:    vmulh.s16 q0, q1, q0
277; CHECK-NEXT:    vstrb.8 q0, [r2], #16
278; CHECK-NEXT:    le lr, .LBB15_1
279; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
280; CHECK-NEXT:    pop {r7, pc}
281entry:
282  br label %vector.body
283
284vector.body:                                      ; preds = %vector.body, %entry
285  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
286  %0 = getelementptr inbounds i16, ptr %x, i32 %index
287  %wide.load = load <8 x i16>, ptr %0, align 2
288  %1 = sext <8 x i16> %wide.load to <8 x i32>
289  %2 = getelementptr inbounds i16, ptr %y, i32 %index
290  %wide.load17 = load <8 x i16>, ptr %2, align 2
291  %3 = sext <8 x i16> %wide.load17 to <8 x i32>
292  %4 = mul nsw <8 x i32> %3, %1
  ; lshr rather than ashr: the trunc below keeps only bits 16..31 of the
  ; product, which both shifts produce identically.
293  %5 = lshr <8 x i32> %4, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
294  %6 = trunc <8 x i32> %5 to <8 x i16>
295  %7 = getelementptr inbounds i16, ptr %z, i32 %index
296  store <8 x i16> %6, ptr %7, align 2
297  %index.next = add i32 %index, 8
298  %8 = icmp eq i32 %index.next, 1024
299  br i1 %8, label %for.cond.cleanup, label %vector.body
300
301for.cond.cleanup:                                 ; preds = %vector.body
302  ret void
303}
304
; Loop form of the signed i32 multiply-high: each iteration loads 4 x i32 from
; %x and %y, computes the high word of the sign-extended product and stores it
; to %z. The loop exits when the index reaches 1024; %n is not used.
; The expected output is a loop closed by `le` with lr preset to 256,
; containing a single vmulh.s32.
305define void @vmulh_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
306; CHECK-LABEL: vmulh_s32:
307; CHECK:       @ %bb.0: @ %entry
308; CHECK-NEXT:    .save {r7, lr}
309; CHECK-NEXT:    push {r7, lr}
310; CHECK-NEXT:    mov.w lr, #256
311; CHECK-NEXT:  .LBB16_1: @ %vector.body
312; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
313; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
314; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
315; CHECK-NEXT:    vmulh.s32 q0, q1, q0
316; CHECK-NEXT:    vstrb.8 q0, [r2], #16
317; CHECK-NEXT:    le lr, .LBB16_1
318; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
319; CHECK-NEXT:    pop {r7, pc}
320entry:
321  br label %vector.body
322
323vector.body:                                      ; preds = %vector.body, %entry
324  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
325  %0 = getelementptr inbounds i32, ptr %x, i32 %index
326  %wide.load = load <4 x i32>, ptr %0, align 4
327  %1 = sext <4 x i32> %wide.load to <4 x i64>
328  %2 = getelementptr inbounds i32, ptr %y, i32 %index
329  %wide.load17 = load <4 x i32>, ptr %2, align 4
330  %3 = sext <4 x i32> %wide.load17 to <4 x i64>
331  %4 = mul nsw <4 x i64> %3, %1
  ; lshr rather than ashr: the trunc below keeps only bits 32..63 of the
  ; product, which both shifts produce identically.
332  %5 = lshr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
333  %6 = trunc <4 x i64> %5 to <4 x i32>
334  %7 = getelementptr inbounds i32, ptr %z, i32 %index
335  store <4 x i32> %6, ptr %7, align 4
336  %index.next = add i32 %index, 4
337  %8 = icmp eq i32 %index.next, 1024
338  br i1 %8, label %for.cond.cleanup, label %vector.body
339
340for.cond.cleanup:                                 ; preds = %vector.body
341  ret void
342}
343
; Loop form of the unsigned i8 multiply-high: each iteration loads 16 x i8 from
; %x and %y, computes the high byte of the zero-extended product and stores it
; to %z. The loop exits when the index reaches 1024; %n is not used.
; The expected output is a loop closed by `le` with lr preset to 64, containing
; a single vmulh.u8.
344define void @vmulh_u8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
345; CHECK-LABEL: vmulh_u8:
346; CHECK:       @ %bb.0: @ %entry
347; CHECK-NEXT:    .save {r7, lr}
348; CHECK-NEXT:    push {r7, lr}
349; CHECK-NEXT:    mov.w lr, #64
350; CHECK-NEXT:  .LBB17_1: @ %vector.body
351; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
352; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
353; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
354; CHECK-NEXT:    vmulh.u8 q0, q1, q0
355; CHECK-NEXT:    vstrb.8 q0, [r2], #16
356; CHECK-NEXT:    le lr, .LBB17_1
357; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
358; CHECK-NEXT:    pop {r7, pc}
359entry:
360  br label %vector.body
361
362vector.body:                                      ; preds = %vector.body, %entry
363  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
364  %0 = getelementptr inbounds i8, ptr %x, i32 %index
365  %wide.load = load <16 x i8>, ptr %0, align 1
366  %1 = zext <16 x i8> %wide.load to <16 x i16>
367  %2 = getelementptr inbounds i8, ptr %y, i32 %index
368  %wide.load17 = load <16 x i8>, ptr %2, align 1
369  %3 = zext <16 x i8> %wide.load17 to <16 x i16>
370  %4 = mul nuw <16 x i16> %3, %1
371  %5 = lshr <16 x i16> %4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
372  %6 = trunc <16 x i16> %5 to <16 x i8>
373  %7 = getelementptr inbounds i8, ptr %z, i32 %index
374  store <16 x i8> %6, ptr %7, align 1
375  %index.next = add i32 %index, 16
376  %8 = icmp eq i32 %index.next, 1024
377  br i1 %8, label %for.cond.cleanup, label %vector.body
378
379for.cond.cleanup:                                 ; preds = %vector.body
380  ret void
381}
382
; Loop form of the unsigned i16 multiply-high: each iteration loads 8 x i16
; from %x and %y, computes the high half of the zero-extended product and
; stores it to %z. The loop exits when the index reaches 1024; %n is not used.
; The expected output is a loop closed by `le` with lr preset to 128,
; containing a single vmulh.u16.
383define void @vmulh_u16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
384; CHECK-LABEL: vmulh_u16:
385; CHECK:       @ %bb.0: @ %entry
386; CHECK-NEXT:    .save {r7, lr}
387; CHECK-NEXT:    push {r7, lr}
388; CHECK-NEXT:    mov.w lr, #128
389; CHECK-NEXT:  .LBB18_1: @ %vector.body
390; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
391; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
392; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
393; CHECK-NEXT:    vmulh.u16 q0, q1, q0
394; CHECK-NEXT:    vstrb.8 q0, [r2], #16
395; CHECK-NEXT:    le lr, .LBB18_1
396; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
397; CHECK-NEXT:    pop {r7, pc}
398entry:
399  br label %vector.body
400
401vector.body:                                      ; preds = %vector.body, %entry
402  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
403  %0 = getelementptr inbounds i16, ptr %x, i32 %index
404  %wide.load = load <8 x i16>, ptr %0, align 2
405  %1 = zext <8 x i16> %wide.load to <8 x i32>
406  %2 = getelementptr inbounds i16, ptr %y, i32 %index
407  %wide.load17 = load <8 x i16>, ptr %2, align 2
408  %3 = zext <8 x i16> %wide.load17 to <8 x i32>
409  %4 = mul nuw <8 x i32> %3, %1
410  %5 = lshr <8 x i32> %4, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
411  %6 = trunc <8 x i32> %5 to <8 x i16>
412  %7 = getelementptr inbounds i16, ptr %z, i32 %index
413  store <8 x i16> %6, ptr %7, align 2
414  %index.next = add i32 %index, 8
415  %8 = icmp eq i32 %index.next, 1024
416  br i1 %8, label %for.cond.cleanup, label %vector.body
417
418for.cond.cleanup:                                 ; preds = %vector.body
419  ret void
420}
421
; Loop form of the unsigned i32 multiply-high: each iteration loads 4 x i32
; from %x and %y, computes the high word of the zero-extended product and
; stores it to %z. The loop exits when the index reaches 1024; %n is not used.
; The expected output is a loop closed by `le` with lr preset to 256,
; containing a single vmulh.u32.
422define void @vmulh_u32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
423; CHECK-LABEL: vmulh_u32:
424; CHECK:       @ %bb.0: @ %entry
425; CHECK-NEXT:    .save {r7, lr}
426; CHECK-NEXT:    push {r7, lr}
427; CHECK-NEXT:    mov.w lr, #256
428; CHECK-NEXT:  .LBB19_1: @ %vector.body
429; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
430; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
431; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
432; CHECK-NEXT:    vmulh.u32 q0, q1, q0
433; CHECK-NEXT:    vstrb.8 q0, [r2], #16
434; CHECK-NEXT:    le lr, .LBB19_1
435; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
436; CHECK-NEXT:    pop {r7, pc}
437entry:
438  br label %vector.body
439
440vector.body:                                      ; preds = %vector.body, %entry
441  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
442  %0 = getelementptr inbounds i32, ptr %x, i32 %index
443  %wide.load = load <4 x i32>, ptr %0, align 4
444  %1 = zext <4 x i32> %wide.load to <4 x i64>
445  %2 = getelementptr inbounds i32, ptr %y, i32 %index
446  %wide.load17 = load <4 x i32>, ptr %2, align 4
447  %3 = zext <4 x i32> %wide.load17 to <4 x i64>
448  %4 = mul nuw <4 x i64> %3, %1
449  %5 = lshr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
450  %6 = trunc <4 x i64> %5 to <4 x i32>
451  %7 = getelementptr inbounds i32, ptr %z, i32 %index
452  store <4 x i32> %6, ptr %7, align 4
453  %index.next = add i32 %index, 4
454  %8 = icmp eq i32 %index.next, 1024
455  br i1 %8, label %for.cond.cleanup, label %vector.body
456
457for.cond.cleanup:                                 ; preds = %vector.body
458  ret void
459}
460
461
; Predicated loop form of the signed i32 multiply-high. Loads and the store are
; masked by llvm.get.active.lane.mask(%index, %n), 4 lanes per iteration; the
; loop is skipped entirely when %n <= 0.
; The expected output is a dlstp.32 / letp loop containing a single vmulh.s32.
462define void @vmulh_s32_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
463; CHECK-LABEL: vmulh_s32_pred:
464; CHECK:       @ %bb.0: @ %entry
465; CHECK-NEXT:    .save {r7, lr}
466; CHECK-NEXT:    push {r7, lr}
467; CHECK-NEXT:    cmp r3, #1
468; CHECK-NEXT:    it lt
469; CHECK-NEXT:    poplt {r7, pc}
470; CHECK-NEXT:  .LBB20_1: @ %vector.ph
471; CHECK-NEXT:    dlstp.32 lr, r3
472; CHECK-NEXT:  .LBB20_2: @ %vector.body
473; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
474; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
475; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
476; CHECK-NEXT:    vmulh.s32 q0, q1, q0
477; CHECK-NEXT:    vstrw.32 q0, [r0], #16
478; CHECK-NEXT:    letp lr, .LBB20_2
479; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
480; CHECK-NEXT:    pop {r7, pc}
481entry:
482  %cmp10 = icmp sgt i32 %n, 0
483  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
484
485vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to the next multiple of 4 (the vector width).
486  %n.rnd.up = add i32 %n, 3
487  %n.vec = and i32 %n.rnd.up, -4
488  br label %vector.body
489
490vector.body:                                      ; preds = %vector.body, %vector.ph
491  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
492  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
493  %0 = getelementptr inbounds i32, ptr %x, i32 %index
494  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %0, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
495  %1 = sext <4 x i32> %wide.masked.load to <4 x i64>
496  %2 = getelementptr inbounds i32, ptr %y, i32 %index
497  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
498  %3 = sext <4 x i32> %wide.masked.load12 to <4 x i64>
499  %4 = mul nsw <4 x i64> %3, %1
  ; lshr rather than ashr: the trunc below keeps only bits 32..63 of the
  ; product, which both shifts produce identically.
500  %5 = lshr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
501  %6 = trunc <4 x i64> %5 to <4 x i32>
502  %7 = getelementptr inbounds i32, ptr %d, i32 %index
503  call void @llvm.masked.store.v4i32.p0(<4 x i32> %6, ptr %7, i32 4, <4 x i1> %active.lane.mask)
504  %index.next = add i32 %index, 4
505  %8 = icmp eq i32 %index.next, %n.vec
506  br i1 %8, label %for.cond.cleanup, label %vector.body
507
508for.cond.cleanup:                                 ; preds = %vector.body, %entry
509  ret void
510}
511
; Predicated loop form of the unsigned i32 multiply-high. Loads and the store
; are masked by llvm.get.active.lane.mask(%index, %n), 4 lanes per iteration;
; the loop is skipped entirely when %n <= 0.
; The expected output is a dlstp.32 / letp loop containing a single vmulh.u32.
512define void @vmulh_u32_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
513; CHECK-LABEL: vmulh_u32_pred:
514; CHECK:       @ %bb.0: @ %entry
515; CHECK-NEXT:    .save {r7, lr}
516; CHECK-NEXT:    push {r7, lr}
517; CHECK-NEXT:    cmp r3, #1
518; CHECK-NEXT:    it lt
519; CHECK-NEXT:    poplt {r7, pc}
520; CHECK-NEXT:  .LBB21_1: @ %vector.ph
521; CHECK-NEXT:    dlstp.32 lr, r3
522; CHECK-NEXT:  .LBB21_2: @ %vector.body
523; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
524; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
525; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
526; CHECK-NEXT:    vmulh.u32 q0, q1, q0
527; CHECK-NEXT:    vstrw.32 q0, [r0], #16
528; CHECK-NEXT:    letp lr, .LBB21_2
529; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
530; CHECK-NEXT:    pop {r7, pc}
531entry:
532  %cmp10 = icmp sgt i32 %n, 0
533  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
534
535vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to the next multiple of 4 (the vector width).
536  %n.rnd.up = add i32 %n, 3
537  %n.vec = and i32 %n.rnd.up, -4
538  br label %vector.body
539
540vector.body:                                      ; preds = %vector.body, %vector.ph
541  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
542  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
543  %0 = getelementptr inbounds i32, ptr %x, i32 %index
544  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %0, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
545  %1 = zext <4 x i32> %wide.masked.load to <4 x i64>
546  %2 = getelementptr inbounds i32, ptr %y, i32 %index
547  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
548  %3 = zext <4 x i32> %wide.masked.load12 to <4 x i64>
549  %4 = mul nuw <4 x i64> %3, %1
550  %5 = lshr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
551  %6 = trunc <4 x i64> %5 to <4 x i32>
552  %7 = getelementptr inbounds i32, ptr %d, i32 %index
553  call void @llvm.masked.store.v4i32.p0(<4 x i32> %6, ptr %7, i32 4, <4 x i1> %active.lane.mask)
554  %index.next = add i32 %index, 4
555  %8 = icmp eq i32 %index.next, %n.vec
556  br i1 %8, label %for.cond.cleanup, label %vector.body
557
558for.cond.cleanup:                                 ; preds = %vector.body, %entry
559  ret void
560}
561
; Predicated loop form of the signed i16 multiply-high. Loads and the store are
; masked by llvm.get.active.lane.mask(%index, %n), 8 lanes per iteration; the
; loop is skipped entirely when %n <= 0.
; The expected output is a dlstp.16 / letp loop containing a single vmulh.s16.
562define void @vmulh_s16_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
563; CHECK-LABEL: vmulh_s16_pred:
564; CHECK:       @ %bb.0: @ %entry
565; CHECK-NEXT:    .save {r7, lr}
566; CHECK-NEXT:    push {r7, lr}
567; CHECK-NEXT:    cmp r3, #1
568; CHECK-NEXT:    it lt
569; CHECK-NEXT:    poplt {r7, pc}
570; CHECK-NEXT:  .LBB22_1: @ %vector.ph
571; CHECK-NEXT:    dlstp.16 lr, r3
572; CHECK-NEXT:  .LBB22_2: @ %vector.body
573; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
574; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
575; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
576; CHECK-NEXT:    vmulh.s16 q0, q1, q0
577; CHECK-NEXT:    vstrh.16 q0, [r0], #16
578; CHECK-NEXT:    letp lr, .LBB22_2
579; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
580; CHECK-NEXT:    pop {r7, pc}
581entry:
582  %cmp10 = icmp sgt i32 %n, 0
583  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
584
585vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to the next multiple of 8 (the vector width).
586  %n.rnd.up = add i32 %n, 7
587  %n.vec = and i32 %n.rnd.up, -8
588  br label %vector.body
589
590vector.body:                                      ; preds = %vector.body, %vector.ph
591  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
592  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
593  %0 = getelementptr inbounds i16, ptr %x, i32 %index
594  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %0, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
595  %1 = sext <8 x i16> %wide.masked.load to <8 x i32>
596  %2 = getelementptr inbounds i16, ptr %y, i32 %index
597  %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %2, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
598  %3 = sext <8 x i16> %wide.masked.load12 to <8 x i32>
599  %4 = mul nsw <8 x i32> %3, %1
  ; lshr rather than ashr: the trunc below keeps only bits 16..31 of the
  ; product, which both shifts produce identically.
600  %5 = lshr <8 x i32> %4, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
601  %6 = trunc <8 x i32> %5 to <8 x i16>
602  %7 = getelementptr inbounds i16, ptr %d, i32 %index
603  call void @llvm.masked.store.v8i16.p0(<8 x i16> %6, ptr %7, i32 2, <8 x i1> %active.lane.mask)
604  %index.next = add i32 %index, 8
605  %8 = icmp eq i32 %index.next, %n.vec
606  br i1 %8, label %for.cond.cleanup, label %vector.body
607
608for.cond.cleanup:                                 ; preds = %vector.body, %entry
609  ret void
610}
611
; Predicated loop form of the unsigned i16 multiply-high. Loads and the store
; are masked by llvm.get.active.lane.mask(%index, %n), 8 lanes per iteration;
; the loop is skipped entirely when %n <= 0.
; The expected output is a dlstp.16 / letp loop containing a single vmulh.u16.
612define void @vmulh_u16_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
613; CHECK-LABEL: vmulh_u16_pred:
614; CHECK:       @ %bb.0: @ %entry
615; CHECK-NEXT:    .save {r7, lr}
616; CHECK-NEXT:    push {r7, lr}
617; CHECK-NEXT:    cmp r3, #1
618; CHECK-NEXT:    it lt
619; CHECK-NEXT:    poplt {r7, pc}
620; CHECK-NEXT:  .LBB23_1: @ %vector.ph
621; CHECK-NEXT:    dlstp.16 lr, r3
622; CHECK-NEXT:  .LBB23_2: @ %vector.body
623; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
624; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
625; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
626; CHECK-NEXT:    vmulh.u16 q0, q1, q0
627; CHECK-NEXT:    vstrh.16 q0, [r0], #16
628; CHECK-NEXT:    letp lr, .LBB23_2
629; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
630; CHECK-NEXT:    pop {r7, pc}
631entry:
632  %cmp10 = icmp sgt i32 %n, 0
633  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
634
635vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to the next multiple of 8 (the vector width).
636  %n.rnd.up = add i32 %n, 7
637  %n.vec = and i32 %n.rnd.up, -8
638  br label %vector.body
639
640vector.body:                                      ; preds = %vector.body, %vector.ph
641  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
642  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
643  %0 = getelementptr inbounds i16, ptr %x, i32 %index
644  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %0, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
645  %1 = zext <8 x i16> %wide.masked.load to <8 x i32>
646  %2 = getelementptr inbounds i16, ptr %y, i32 %index
647  %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %2, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
648  %3 = zext <8 x i16> %wide.masked.load12 to <8 x i32>
649  %4 = mul nuw <8 x i32> %3, %1
650  %5 = lshr <8 x i32> %4, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
651  %6 = trunc <8 x i32> %5 to <8 x i16>
652  %7 = getelementptr inbounds i16, ptr %d, i32 %index
653  call void @llvm.masked.store.v8i16.p0(<8 x i16> %6, ptr %7, i32 2, <8 x i1> %active.lane.mask)
654  %index.next = add i32 %index, 8
655  %8 = icmp eq i32 %index.next, %n.vec
656  br i1 %8, label %for.cond.cleanup, label %vector.body
657
658for.cond.cleanup:                                 ; preds = %vector.body, %entry
659  ret void
660}
661
; Predicated loop form of the signed i8 multiply-high. Loads and the store are
; masked by llvm.get.active.lane.mask(%index, %n), 16 lanes per iteration; the
; loop is skipped entirely when %n <= 0.
; The expected output is a dlstp.8 / letp loop containing a single vmulh.s8.
662define void @vmulh_s8_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
663; CHECK-LABEL: vmulh_s8_pred:
664; CHECK:       @ %bb.0: @ %entry
665; CHECK-NEXT:    .save {r7, lr}
666; CHECK-NEXT:    push {r7, lr}
667; CHECK-NEXT:    cmp r3, #1
668; CHECK-NEXT:    it lt
669; CHECK-NEXT:    poplt {r7, pc}
670; CHECK-NEXT:  .LBB24_1: @ %vector.ph
671; CHECK-NEXT:    dlstp.8 lr, r3
672; CHECK-NEXT:  .LBB24_2: @ %vector.body
673; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
674; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
675; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
676; CHECK-NEXT:    vmulh.s8 q0, q1, q0
677; CHECK-NEXT:    vstrb.8 q0, [r0], #16
678; CHECK-NEXT:    letp lr, .LBB24_2
679; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
680; CHECK-NEXT:    pop {r7, pc}
681entry:
682  %cmp10 = icmp sgt i32 %n, 0
683  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
684
685vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to the next multiple of 16 (the vector width).
686  %n.rnd.up = add i32 %n, 15
687  %n.vec = and i32 %n.rnd.up, -16
688  br label %vector.body
689
690vector.body:                                      ; preds = %vector.body, %vector.ph
691  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
692  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
693  %0 = getelementptr inbounds i8, ptr %x, i32 %index
694  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %0, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
695  %1 = sext <16 x i8> %wide.masked.load to <16 x i16>
696  %2 = getelementptr inbounds i8, ptr %y, i32 %index
697  %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %2, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
698  %3 = sext <16 x i8> %wide.masked.load12 to <16 x i16>
699  %4 = mul nsw <16 x i16> %3, %1
  ; lshr rather than ashr: the trunc below keeps only bits 8..15 of the
  ; product, which both shifts produce identically.
700  %5 = lshr <16 x i16> %4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
701  %6 = trunc <16 x i16> %5 to <16 x i8>
702  %7 = getelementptr inbounds i8, ptr %d, i32 %index
703  call void @llvm.masked.store.v16i8.p0(<16 x i8> %6, ptr %7, i32 1, <16 x i1> %active.lane.mask)
704  %index.next = add i32 %index, 16
705  %8 = icmp eq i32 %index.next, %n.vec
706  br i1 %8, label %for.cond.cleanup, label %vector.body
707
708for.cond.cleanup:                                 ; preds = %vector.body, %entry
709  ret void
710}
711
; Predicated loop form of the unsigned i8 multiply-high. Loads and the store
; are masked by llvm.get.active.lane.mask(%index, %n), 16 lanes per iteration;
; the loop is skipped entirely when %n <= 0.
; The expected output is a dlstp.8 / letp loop containing a single vmulh.u8.
712define void @vmulh_u8_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
713; CHECK-LABEL: vmulh_u8_pred:
714; CHECK:       @ %bb.0: @ %entry
715; CHECK-NEXT:    .save {r7, lr}
716; CHECK-NEXT:    push {r7, lr}
717; CHECK-NEXT:    cmp r3, #1
718; CHECK-NEXT:    it lt
719; CHECK-NEXT:    poplt {r7, pc}
720; CHECK-NEXT:  .LBB25_1: @ %vector.ph
721; CHECK-NEXT:    dlstp.8 lr, r3
722; CHECK-NEXT:  .LBB25_2: @ %vector.body
723; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
724; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
725; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
726; CHECK-NEXT:    vmulh.u8 q0, q1, q0
727; CHECK-NEXT:    vstrb.8 q0, [r0], #16
728; CHECK-NEXT:    letp lr, .LBB25_2
729; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
730; CHECK-NEXT:    pop {r7, pc}
731entry:
732  %cmp10 = icmp sgt i32 %n, 0
733  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
734
735vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to the next multiple of 16 (the vector width).
736  %n.rnd.up = add i32 %n, 15
737  %n.vec = and i32 %n.rnd.up, -16
738  br label %vector.body
739
740vector.body:                                      ; preds = %vector.body, %vector.ph
741  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
742  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
743  %0 = getelementptr inbounds i8, ptr %x, i32 %index
744  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %0, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
745  %1 = zext <16 x i8> %wide.masked.load to <16 x i16>
746  %2 = getelementptr inbounds i8, ptr %y, i32 %index
747  %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %2, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
748  %3 = zext <16 x i8> %wide.masked.load12 to <16 x i16>
749  %4 = mul nuw <16 x i16> %3, %1
750  %5 = lshr <16 x i16> %4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
751  %6 = trunc <16 x i16> %5 to <16 x i8>
752  %7 = getelementptr inbounds i8, ptr %d, i32 %index
753  call void @llvm.masked.store.v16i8.p0(<16 x i8> %6, ptr %7, i32 1, <16 x i1> %active.lane.mask)
754  %index.next = add i32 %index, 16
755  %8 = icmp eq i32 %index.next, %n.vec
756  br i1 %8, label %for.cond.cleanup, label %vector.body
757
758for.cond.cleanup:                                 ; preds = %vector.body, %entry
759  ret void
760}
761
762
; Signed multiply-high feeding an add-reduction: the operands are sign-extended
; to i16, multiplied and arithmetic-shifted right by 8, but the result is NOT
; truncated to i8; the 16 x i16 values are summed into a single i16.
; The expected output is vmulh.s8 followed by vaddv.s8.
763define arm_aapcs_vfpcc i16 @vmulhs_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
764; CHECK-LABEL: vmulhs_reduce_v16i8:
765; CHECK:       @ %bb.0: @ %entry
766; CHECK-NEXT:    vmulh.s8 q0, q0, q1
767; CHECK-NEXT:    vaddv.s8 r0, q0
768; CHECK-NEXT:    bx lr
769entry:
770  %s0s = sext <16 x i8> %s0 to <16 x i16>
771  %s1s = sext <16 x i8> %s1 to <16 x i16>
772  %m = mul <16 x i16> %s0s, %s1s
773  %s = ashr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
774  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
775  ret i16 %result
776}
777
; Multiply-high of zero-extended operands feeding an add-reduction. The
; operands are zero-extended to i16 and multiplied, but the shift is an
; arithmetic one (ashr by 8) and the result is not truncated; the 16 x i16
; values are summed into a single i16.
; The expected output is vmulh.u8 followed by a signed vaddv.s8.
778define arm_aapcs_vfpcc i16 @vmulhu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
779; CHECK-LABEL: vmulhu_reduce_v16i8:
780; CHECK:       @ %bb.0: @ %entry
781; CHECK-NEXT:    vmulh.u8 q0, q0, q1
782; CHECK-NEXT:    vaddv.s8 r0, q0
783; CHECK-NEXT:    bx lr
784entry:
785  %s0s = zext <16 x i8> %s0 to <16 x i16>
786  %s1s = zext <16 x i8> %s1 to <16 x i16>
787  %m = mul <16 x i16> %s0s, %s1s
  ; ashr (not lshr) on the unsigned product: each i16 element becomes the
  ; sign-extension of the product's upper byte.
788  %s = ashr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
789  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
790  ret i16 %result
791}
792
; Add-reduction intrinsic used by the two *_reduce_v16i8 tests.
793declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
794
795
; Lane-mask, masked-load and masked-store intrinsics used by the *_pred tests,
; declared for 4 x i32, 8 x i16 and 16 x i8 vectors.
796declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
797declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
798declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
799declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
800declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)
801declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>)
802declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
803declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>)
804declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>)
805