xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll (revision e0ed0333f0fed2e73f805afd58b61176a87aa3ad)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
3
; ssatmul_s_q31: signed saturating multiply over N i32 elements.
; Per element: sign-extend both inputs to i64, multiply, arithmetic-shift
; right by 31, clamp to [-2147483648, 2147483647], truncate to i32 and store
; to pDst. The IR is already vectorized by 2: vector.body processes N & -2
; elements as <2 x i32>, and the scalar for.body handles what is left.
; The assertion lines between the define line and the entry label are the
; autogenerated expected llc output; regenerate them rather than hand-editing.
4define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
5; CHECK-LABEL: ssatmul_s_q31:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
8; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
9; CHECK-NEXT:    .pad #8
10; CHECK-NEXT:    sub sp, #8
11; CHECK-NEXT:    cmp r3, #0
12; CHECK-NEXT:    beq.w .LBB0_8
13; CHECK-NEXT:  @ %bb.1: @ %entry
14; CHECK-NEXT:    mov r11, r2
15; CHECK-NEXT:    cmp r3, #1
16; CHECK-NEXT:    bne .LBB0_3
17; CHECK-NEXT:  @ %bb.2:
18; CHECK-NEXT:    movs r2, #0
19; CHECK-NEXT:    mov r12, r0
20; CHECK-NEXT:    mov r8, r1
21; CHECK-NEXT:    mov r10, r11
22; CHECK-NEXT:    b .LBB0_6
23; CHECK-NEXT:  .LBB0_3: @ %vector.ph
24; CHECK-NEXT:    bic r2, r3, #1
25; CHECK-NEXT:    adr r4, .LCPI0_0
26; CHECK-NEXT:    subs r7, r2, #2
27; CHECK-NEXT:    movs r6, #1
28; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
29; CHECK-NEXT:    add.w r10, r11, r2, lsl #2
30; CHECK-NEXT:    add.w lr, r6, r7, lsr #1
31; CHECK-NEXT:    str r2, [sp] @ 4-byte Spill
32; CHECK-NEXT:    add.w r8, r1, r2, lsl #2
33; CHECK-NEXT:    add.w r12, r0, r2, lsl #2
34; CHECK-NEXT:    vldrw.u32 q0, [r4]
35; CHECK-NEXT:    vmvn.i32 q1, #0x80000000
36; CHECK-NEXT:  .LBB0_4: @ %vector.body
37; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
38; CHECK-NEXT:    ldrd r4, r2, [r0], #8
39; CHECK-NEXT:    movs r5, #0
40; CHECK-NEXT:    ldrd r7, r6, [r1], #8
41; CHECK-NEXT:    smull r4, r7, r7, r4
42; CHECK-NEXT:    asrl r4, r7, #31
43; CHECK-NEXT:    rsbs.w r9, r4, #-2147483648
44; CHECK-NEXT:    mov.w r9, #-1
45; CHECK-NEXT:    sbcs.w r3, r9, r7
46; CHECK-NEXT:    csetm r3, lt
47; CHECK-NEXT:    bfi r5, r3, #0, #8
48; CHECK-NEXT:    smull r2, r3, r6, r2
49; CHECK-NEXT:    asrl r2, r3, #31
50; CHECK-NEXT:    rsbs.w r6, r2, #-2147483648
51; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
52; CHECK-NEXT:    sbcs.w r6, r9, r3
53; CHECK-NEXT:    vmov q2[3], q2[1], r7, r3
54; CHECK-NEXT:    csetm r6, lt
55; CHECK-NEXT:    bfi r5, r6, #8, #8
56; CHECK-NEXT:    vmsr p0, r5
57; CHECK-NEXT:    mvn r5, #-2147483648
58; CHECK-NEXT:    vpsel q2, q2, q0
59; CHECK-NEXT:    vmov r2, r3, d4
60; CHECK-NEXT:    subs r2, r2, r5
61; CHECK-NEXT:    sbcs r2, r3, #0
62; CHECK-NEXT:    mov.w r3, #0
63; CHECK-NEXT:    csetm r2, lt
64; CHECK-NEXT:    bfi r3, r2, #0, #8
65; CHECK-NEXT:    vmov r2, r4, d5
66; CHECK-NEXT:    subs r2, r2, r5
67; CHECK-NEXT:    sbcs r2, r4, #0
68; CHECK-NEXT:    csetm r2, lt
69; CHECK-NEXT:    bfi r3, r2, #8, #8
70; CHECK-NEXT:    vmsr p0, r3
71; CHECK-NEXT:    vpsel q2, q2, q1
72; CHECK-NEXT:    vmov r2, s10
73; CHECK-NEXT:    vmov r3, s8
74; CHECK-NEXT:    strd r3, r2, [r11], #8
75; CHECK-NEXT:    le lr, .LBB0_4
76; CHECK-NEXT:  @ %bb.5: @ %middle.block
77; CHECK-NEXT:    ldrd r2, r3, [sp] @ 8-byte Folded Reload
78; CHECK-NEXT:    cmp r2, r3
79; CHECK-NEXT:    beq .LBB0_8
80; CHECK-NEXT:  .LBB0_6: @ %for.body.preheader
81; CHECK-NEXT:    sub.w lr, r3, r2
82; CHECK-NEXT:    mov.w r0, #-1
83; CHECK-NEXT:    mov.w r1, #-2147483648
84; CHECK-NEXT:    mvn r3, #-2147483648
85; CHECK-NEXT:  .LBB0_7: @ %for.body
86; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
87; CHECK-NEXT:    ldr r2, [r12], #4
88; CHECK-NEXT:    ldr r4, [r8], #4
89; CHECK-NEXT:    smull r2, r5, r4, r2
90; CHECK-NEXT:    asrl r2, r5, #31
91; CHECK-NEXT:    subs r4, r1, r2
92; CHECK-NEXT:    sbcs.w r4, r0, r5
93; CHECK-NEXT:    csel r2, r2, r1, lt
94; CHECK-NEXT:    csel r4, r5, r0, lt
95; CHECK-NEXT:    subs r5, r2, r3
96; CHECK-NEXT:    sbcs r4, r4, #0
97; CHECK-NEXT:    csel r2, r2, r3, lt
98; CHECK-NEXT:    str r2, [r10], #4
99; CHECK-NEXT:    le lr, .LBB0_7
100; CHECK-NEXT:  .LBB0_8: @ %for.cond.cleanup
101; CHECK-NEXT:    add sp, #8
102; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
103; CHECK-NEXT:    .p2align 4
104; CHECK-NEXT:  @ %bb.9:
105; CHECK-NEXT:  .LCPI0_0:
106; CHECK-NEXT:    .long 2147483648 @ 0x80000000
107; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
108; CHECK-NEXT:    .long 2147483648 @ 0x80000000
109; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; Dispatch on N: 0 returns immediately, 1 goes straight to the scalar loop,
; any other value takes the vector path.
110entry:
111  switch i32 %N, label %vector.ph [
112    i32 0, label %for.cond.cleanup
113    i32 1, label %for.body.preheader
114  ]
115
; n.vec is N rounded down to a multiple of 2; the ind.end* pointers mark where
; the scalar remainder loop resumes after the vector loop.
116vector.ph:                                        ; preds = %entry
117  %n.vec = and i32 %N, -2
118  %ind.end = getelementptr i32, ptr %pSrcA, i32 %n.vec
119  %ind.end15 = getelementptr i32, ptr %pSrcB, i32 %n.vec
120  %ind.end17 = getelementptr i32, ptr %pDst, i32 %n.vec
121  br label %vector.body
122
; Two elements per iteration: widen, multiply, shift, then clamp from below
; (icmp sgt/select) and from above (icmp slt/select) before truncating.
123vector.body:                                      ; preds = %vector.body, %vector.ph
124  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
125  %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
126  %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
127  %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
128  %wide.load = load <2 x i32>, ptr %next.gep, align 4
129  %0 = sext <2 x i32> %wide.load to <2 x i64>
130  %wide.load20 = load <2 x i32>, ptr %next.gep18, align 4
131  %1 = sext <2 x i32> %wide.load20 to <2 x i64>
132  %2 = mul nsw <2 x i64> %1, %0
133  %3 = ashr <2 x i64> %2, <i64 31, i64 31>
134  %4 = icmp sgt <2 x i64> %3, <i64 -2147483648, i64 -2147483648>
135  %5 = select <2 x i1> %4, <2 x i64> %3, <2 x i64> <i64 -2147483648, i64 -2147483648>
136  %6 = icmp slt <2 x i64> %5, <i64 2147483647, i64 2147483647>
137  %7 = select <2 x i1> %6, <2 x i64> %5, <2 x i64> <i64 2147483647, i64 2147483647>
138  %8 = trunc <2 x i64> %7 to <2 x i32>
139  store <2 x i32> %8, ptr %next.gep19, align 4
140  %index.next = add i32 %index, 2
141  %9 = icmp eq i32 %index.next, %n.vec
142  br i1 %9, label %middle.block, label %vector.body
143
; Skip the scalar loop when the vector loop already covered all N elements.
144middle.block:                                     ; preds = %vector.body
145  %cmp.n = icmp eq i32 %n.vec, %N
146  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader
147
148for.body.preheader:                               ; preds = %entry, %middle.block
149  %i.012.ph = phi i32 [ 0, %entry ], [ %n.vec, %middle.block ]
150  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %entry ], [ %ind.end, %middle.block ]
151  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %entry ], [ %ind.end15, %middle.block ]
152  %pDst.addr.09.ph = phi ptr [ %pDst, %entry ], [ %ind.end17, %middle.block ]
153  br label %for.body
154
155for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
156  ret void
157
; Scalar remainder: the same multiply/shift/clamp sequence, one element per
; iteration, until the index reaches N.
158for.body:                                         ; preds = %for.body.preheader, %for.body
159  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader ]
160  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader ]
161  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader ]
162  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader ]
163  %incdec.ptr = getelementptr inbounds i32, ptr %pSrcA.addr.011, i32 1
164  %10 = load i32, ptr %pSrcA.addr.011, align 4
165  %conv = sext i32 %10 to i64
166  %incdec.ptr1 = getelementptr inbounds i32, ptr %pSrcB.addr.010, i32 1
167  %11 = load i32, ptr %pSrcB.addr.010, align 4
168  %conv2 = sext i32 %11 to i64
169  %mul = mul nsw i64 %conv2, %conv
170  %shr = ashr i64 %mul, 31
171  %12 = icmp sgt i64 %shr, -2147483648
172  %.val.i = select i1 %12, i64 %shr, i64 -2147483648
173  %13 = icmp slt i64 %.val.i, 2147483647
174  %retval.0.i = select i1 %13, i64 %.val.i, i64 2147483647
175  %conv3 = trunc i64 %retval.0.i to i32
176  %incdec.ptr4 = getelementptr inbounds i32, ptr %pDst.addr.09, i32 1
177  store i32 %conv3, ptr %pDst.addr.09, align 4
178  %inc = add nuw i32 %i.012, 1
179  %exitcond = icmp eq i32 %inc, %N
180  br i1 %exitcond, label %for.cond.cleanup, label %for.body
181}
182
; ssatmul_4_q31: signed saturating multiply over N i32 elements, with the
; vector loop working on <4 x i32>.
; Per element: sign-extend both inputs to i64, multiply, arithmetic-shift
; right by 31, clamp to [-2147483648, 2147483647], truncate to i32 and store
; to pDst. vector.body processes N & -4 elements; the scalar for.body handles
; the remainder, and is the only loop executed when N < 4.
; The assertion lines between the define line and the entry label are the
; autogenerated expected llc output; regenerate them rather than hand-editing.
183define arm_aapcs_vfpcc void @ssatmul_4_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
184; CHECK-LABEL: ssatmul_4_q31:
185; CHECK:       @ %bb.0: @ %entry
186; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
187; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
188; CHECK-NEXT:    .pad #4
189; CHECK-NEXT:    sub sp, #4
190; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
191; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
192; CHECK-NEXT:    .pad #16
193; CHECK-NEXT:    sub sp, #16
194; CHECK-NEXT:    cmp r3, #0
195; CHECK-NEXT:    beq.w .LBB1_8
196; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
197; CHECK-NEXT:    mov r5, r1
198; CHECK-NEXT:    movs r1, #0
199; CHECK-NEXT:    cmp r3, #3
200; CHECK-NEXT:    bhi .LBB1_3
201; CHECK-NEXT:  @ %bb.2:
202; CHECK-NEXT:    mov r12, r0
203; CHECK-NEXT:    mov r9, r5
204; CHECK-NEXT:    mov r11, r2
205; CHECK-NEXT:    b .LBB1_6
206; CHECK-NEXT:  .LBB1_3: @ %vector.ph
207; CHECK-NEXT:    bic r1, r3, #3
208; CHECK-NEXT:    adr r4, .LCPI1_0
209; CHECK-NEXT:    subs r7, r1, #4
210; CHECK-NEXT:    movs r6, #1
211; CHECK-NEXT:    vldrw.u32 q0, [r4]
212; CHECK-NEXT:    adr r4, .LCPI1_1
213; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
214; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
215; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
216; CHECK-NEXT:    add.w r11, r2, r1, lsl #2
217; CHECK-NEXT:    add.w r9, r5, r1, lsl #2
218; CHECK-NEXT:    add.w r12, r0, r1, lsl #2
219; CHECK-NEXT:    vldrw.u32 q1, [r4]
220; CHECK-NEXT:  .LBB1_4: @ %vector.body
221; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
222; CHECK-NEXT:    vldrw.u32 q3, [r5], #16
223; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
224; CHECK-NEXT:    str r2, [sp, #12] @ 4-byte Spill
225; CHECK-NEXT:    mov.w r2, #-1
226; CHECK-NEXT:    vmov.f32 s16, s10
227; CHECK-NEXT:    str r5, [sp, #8] @ 4-byte Spill
228; CHECK-NEXT:    vmov.f32 s20, s14
229; CHECK-NEXT:    mov.w r8, #0
230; CHECK-NEXT:    vmov.f32 s18, s11
231; CHECK-NEXT:    vmov.f32 s22, s15
232; CHECK-NEXT:    vmullb.s32 q6, q5, q4
233; CHECK-NEXT:    vmov.f32 s14, s13
234; CHECK-NEXT:    vmov r4, r7, d12
235; CHECK-NEXT:    asrl r4, r7, #31
236; CHECK-NEXT:    vmov.f32 s10, s9
237; CHECK-NEXT:    rsbs.w r5, r4, #-2147483648
238; CHECK-NEXT:    sbcs.w r5, r2, r7
239; CHECK-NEXT:    csetm r5, lt
240; CHECK-NEXT:    bfi r8, r5, #0, #8
241; CHECK-NEXT:    vmov r10, r5, d13
242; CHECK-NEXT:    asrl r10, r5, #31
243; CHECK-NEXT:    vmov r6, s14
244; CHECK-NEXT:    rsbs.w r3, r10, #-2147483648
245; CHECK-NEXT:    vmov q4[2], q4[0], r4, r10
246; CHECK-NEXT:    sbcs.w r3, r2, r5
247; CHECK-NEXT:    vmov q4[3], q4[1], r7, r5
248; CHECK-NEXT:    csetm r3, lt
249; CHECK-NEXT:    bfi r8, r3, #8, #8
250; CHECK-NEXT:    vmsr p0, r8
251; CHECK-NEXT:    mvn r8, #-2147483648
252; CHECK-NEXT:    vpsel q4, q4, q0
253; CHECK-NEXT:    vmov r3, r4, d8
254; CHECK-NEXT:    subs.w r3, r3, r8
255; CHECK-NEXT:    sbcs r3, r4, #0
256; CHECK-NEXT:    mov.w r4, #0
257; CHECK-NEXT:    csetm r3, lt
258; CHECK-NEXT:    bfi r4, r3, #0, #8
259; CHECK-NEXT:    vmov r3, r5, d9
260; CHECK-NEXT:    subs.w r3, r3, r8
261; CHECK-NEXT:    sbcs r3, r5, #0
262; CHECK-NEXT:    mov.w r5, #0
263; CHECK-NEXT:    csetm r3, lt
264; CHECK-NEXT:    bfi r4, r3, #8, #8
265; CHECK-NEXT:    vmov r3, s8
266; CHECK-NEXT:    vmsr p0, r4
267; CHECK-NEXT:    vmov r4, s12
268; CHECK-NEXT:    vpsel q4, q4, q1
269; CHECK-NEXT:    smull r4, r7, r4, r3
270; CHECK-NEXT:    asrl r4, r7, #31
271; CHECK-NEXT:    rsbs.w r3, r4, #-2147483648
272; CHECK-NEXT:    sbcs.w r3, r2, r7
273; CHECK-NEXT:    csetm r3, lt
274; CHECK-NEXT:    bfi r5, r3, #0, #8
275; CHECK-NEXT:    vmov r3, s10
276; CHECK-NEXT:    smull r6, r3, r6, r3
277; CHECK-NEXT:    asrl r6, r3, #31
278; CHECK-NEXT:    rsbs.w r1, r6, #-2147483648
279; CHECK-NEXT:    vmov q2[2], q2[0], r4, r6
280; CHECK-NEXT:    sbcs.w r1, r2, r3
281; CHECK-NEXT:    vmov q2[3], q2[1], r7, r3
282; CHECK-NEXT:    csetm r1, lt
283; CHECK-NEXT:    bfi r5, r1, #8, #8
284; CHECK-NEXT:    vmsr p0, r5
285; CHECK-NEXT:    ldrd r5, r2, [sp, #8] @ 8-byte Folded Reload
286; CHECK-NEXT:    vpsel q2, q2, q0
287; CHECK-NEXT:    vmov r1, r3, d4
288; CHECK-NEXT:    subs.w r1, r1, r8
289; CHECK-NEXT:    sbcs r1, r3, #0
290; CHECK-NEXT:    mov.w r3, #0
291; CHECK-NEXT:    csetm r1, lt
292; CHECK-NEXT:    bfi r3, r1, #0, #8
293; CHECK-NEXT:    vmov r1, r4, d5
294; CHECK-NEXT:    subs.w r1, r1, r8
295; CHECK-NEXT:    sbcs r1, r4, #0
296; CHECK-NEXT:    csetm r1, lt
297; CHECK-NEXT:    bfi r3, r1, #8, #8
298; CHECK-NEXT:    vmsr p0, r3
299; CHECK-NEXT:    vpsel q2, q2, q1
300; CHECK-NEXT:    vmov.f32 s9, s10
301; CHECK-NEXT:    vmov.f32 s10, s16
302; CHECK-NEXT:    vmov.f32 s11, s18
303; CHECK-NEXT:    vstrb.8 q2, [r2], #16
304; CHECK-NEXT:    le lr, .LBB1_4
305; CHECK-NEXT:  @ %bb.5: @ %middle.block
306; CHECK-NEXT:    ldrd r1, r3, [sp] @ 8-byte Folded Reload
307; CHECK-NEXT:    cmp r1, r3
308; CHECK-NEXT:    beq .LBB1_8
309; CHECK-NEXT:  .LBB1_6: @ %for.body.preheader21
310; CHECK-NEXT:    sub.w lr, r3, r1
311; CHECK-NEXT:    mov.w r0, #-1
312; CHECK-NEXT:    mov.w r3, #-2147483648
313; CHECK-NEXT:    mvn r2, #-2147483648
314; CHECK-NEXT:  .LBB1_7: @ %for.body
315; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
316; CHECK-NEXT:    ldr r1, [r12], #4
317; CHECK-NEXT:    ldr r4, [r9], #4
318; CHECK-NEXT:    smull r4, r1, r4, r1
319; CHECK-NEXT:    asrl r4, r1, #31
320; CHECK-NEXT:    subs r5, r3, r4
321; CHECK-NEXT:    sbcs.w r5, r0, r1
322; CHECK-NEXT:    csel r4, r4, r3, lt
323; CHECK-NEXT:    csel r1, r1, r0, lt
324; CHECK-NEXT:    subs r5, r4, r2
325; CHECK-NEXT:    sbcs r1, r1, #0
326; CHECK-NEXT:    csel r1, r4, r2, lt
327; CHECK-NEXT:    str r1, [r11], #4
328; CHECK-NEXT:    le lr, .LBB1_7
329; CHECK-NEXT:  .LBB1_8: @ %for.cond.cleanup
330; CHECK-NEXT:    add sp, #16
331; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
332; CHECK-NEXT:    add sp, #4
333; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
334; CHECK-NEXT:    .p2align 4
335; CHECK-NEXT:  @ %bb.9:
336; CHECK-NEXT:  .LCPI1_0:
337; CHECK-NEXT:    .long 2147483648 @ 0x80000000
338; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
339; CHECK-NEXT:    .long 2147483648 @ 0x80000000
340; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
341; CHECK-NEXT:  .LCPI1_1:
342; CHECK-NEXT:    .long 2147483647 @ 0x7fffffff
343; CHECK-NEXT:    .long 0 @ 0x0
344; CHECK-NEXT:    .long 2147483647 @ 0x7fffffff
345; CHECK-NEXT:    .long 0 @ 0x0
; N == 0 returns immediately.
346entry:
347  %cmp8 = icmp eq i32 %N, 0
348  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
349
; Fewer than 4 elements: bypass the vector loop entirely.
350for.body.preheader:                               ; preds = %entry
351  %min.iters.check = icmp ult i32 %N, 4
352  br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
353
354for.body.preheader21:                             ; preds = %middle.block, %for.body.preheader
355  %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
356  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
357  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
358  %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
359  br label %for.body
360
; n.vec is N rounded down to a multiple of 4; the ind.end* pointers mark where
; the scalar remainder loop resumes after the vector loop.
361vector.ph:                                        ; preds = %for.body.preheader
362  %n.vec = and i32 %N, -4
363  %ind.end = getelementptr i32, ptr %pSrcA, i32 %n.vec
364  %ind.end15 = getelementptr i32, ptr %pSrcB, i32 %n.vec
365  %ind.end17 = getelementptr i32, ptr %pDst, i32 %n.vec
366  br label %vector.body
367
; Four elements per iteration: widen, multiply, shift, then clamp from below
; (icmp sgt/select) and from above (icmp slt/select) before truncating.
368vector.body:                                      ; preds = %vector.body, %vector.ph
369  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
370  %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
371  %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
372  %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
373  %wide.load = load <4 x i32>, ptr %next.gep, align 4
374  %0 = sext <4 x i32> %wide.load to <4 x i64>
375  %wide.load20 = load <4 x i32>, ptr %next.gep18, align 4
376  %1 = sext <4 x i32> %wide.load20 to <4 x i64>
377  %2 = mul nsw <4 x i64> %1, %0
378  %3 = ashr <4 x i64> %2, <i64 31, i64 31, i64 31, i64 31>
379  %4 = icmp sgt <4 x i64> %3, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
380  %5 = select <4 x i1> %4, <4 x i64> %3, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
381  %6 = icmp slt <4 x i64> %5, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
382  %7 = select <4 x i1> %6, <4 x i64> %5, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
383  %8 = trunc <4 x i64> %7 to <4 x i32>
384  store <4 x i32> %8, ptr %next.gep19, align 4
385  %index.next = add i32 %index, 4
386  %9 = icmp eq i32 %index.next, %n.vec
387  br i1 %9, label %middle.block, label %vector.body
388
; Skip the scalar loop when the vector loop already covered all N elements.
389middle.block:                                     ; preds = %vector.body
390  %cmp.n = icmp eq i32 %n.vec, %N
391  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
392
393for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
394  ret void
395
; Scalar remainder: the same multiply/shift/clamp sequence, one element per
; iteration, until the index reaches N.
396for.body:                                         ; preds = %for.body.preheader21, %for.body
397  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
398  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
399  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
400  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
401  %incdec.ptr = getelementptr inbounds i32, ptr %pSrcA.addr.011, i32 1
402  %10 = load i32, ptr %pSrcA.addr.011, align 4
403  %conv = sext i32 %10 to i64
404  %incdec.ptr1 = getelementptr inbounds i32, ptr %pSrcB.addr.010, i32 1
405  %11 = load i32, ptr %pSrcB.addr.010, align 4
406  %conv2 = sext i32 %11 to i64
407  %mul = mul nsw i64 %conv2, %conv
408  %shr = ashr i64 %mul, 31
409  %12 = icmp sgt i64 %shr, -2147483648
410  %.val.i = select i1 %12, i64 %shr, i64 -2147483648
411  %13 = icmp slt i64 %.val.i, 2147483647
412  %retval.0.i = select i1 %13, i64 %.val.i, i64 2147483647
413  %conv3 = trunc i64 %retval.0.i to i32
414  %incdec.ptr4 = getelementptr inbounds i32, ptr %pDst.addr.09, i32 1
415  store i32 %conv3, ptr %pDst.addr.09, align 4
416  %inc = add nuw i32 %i.012, 1
417  %exitcond = icmp eq i32 %inc, %N
418  br i1 %exitcond, label %for.cond.cleanup, label %for.body
419}
420
; ssatmul_4t_q31: signed saturating multiply over N i32 elements using
; masked <4 x i32> loads and stores instead of a scalar remainder loop.
; Per element: sign-extend both inputs to i64, multiply, arithmetic-shift
; right by 31, clamp to [-2147483648, 2147483647], truncate to i32 and store
; to pDst. The loop runs over N rounded up to a multiple of 4, with a lane
; mask disabling lanes whose index is greater than N - 1.
; The assertion lines between the define line and the entry label are the
; autogenerated expected llc output; regenerate them rather than hand-editing.
421define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
422; CHECK-LABEL: ssatmul_4t_q31:
423; CHECK:       @ %bb.0: @ %entry
424; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
425; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
426; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
427; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
428; CHECK-NEXT:    .pad #24
429; CHECK-NEXT:    sub sp, #24
430; CHECK-NEXT:    cmp r3, #0
431; CHECK-NEXT:    beq.w .LBB2_3
432; CHECK-NEXT:  @ %bb.1: @ %vector.ph
433; CHECK-NEXT:    adds r6, r3, #3
434; CHECK-NEXT:    movs r5, #1
435; CHECK-NEXT:    bic r6, r6, #3
436; CHECK-NEXT:    adr r4, .LCPI2_1
437; CHECK-NEXT:    subs r6, #4
438; CHECK-NEXT:    vldrw.u32 q2, [r4]
439; CHECK-NEXT:    mov.w r9, #0
440; CHECK-NEXT:    mov.w r12, #-1
441; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
442; CHECK-NEXT:    adr r5, .LCPI2_0
443; CHECK-NEXT:    vldrw.u32 q0, [r5]
444; CHECK-NEXT:    adr r5, .LCPI2_2
445; CHECK-NEXT:    subs r6, r3, #1
446; CHECK-NEXT:    vldrw.u32 q3, [r5]
447; CHECK-NEXT:    vdup.32 q1, r6
448; CHECK-NEXT:    mvn r8, #-2147483648
449; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
450; CHECK-NEXT:  .LBB2_2: @ %vector.body
451; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
452; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
453; CHECK-NEXT:    vdup.32 q4, r9
454; CHECK-NEXT:    movs r4, #0
455; CHECK-NEXT:    add.w r9, r9, #4
456; CHECK-NEXT:    vorr q4, q4, q0
457; CHECK-NEXT:    vcmp.u32 cs, q1, q4
458; CHECK-NEXT:    vstr p0, [sp, #20] @ 4-byte Spill
459; CHECK-NEXT:    vpstt
460; CHECK-NEXT:    vldrwt.u32 q4, [r0], #16
461; CHECK-NEXT:    vldrwt.u32 q5, [r1], #16
462; CHECK-NEXT:    vmov.f32 s24, s18
463; CHECK-NEXT:    vmov.f32 s26, s19
464; CHECK-NEXT:    vmov.f32 s28, s22
465; CHECK-NEXT:    vmov.f32 s30, s23
466; CHECK-NEXT:    vmullb.s32 q0, q7, q6
467; CHECK-NEXT:    vmov.f32 s18, s21
468; CHECK-NEXT:    vmov r10, r5, d0
469; CHECK-NEXT:    asrl r10, r5, #31
470; CHECK-NEXT:    rsbs.w r7, r10, #-2147483648
471; CHECK-NEXT:    sbcs.w r7, r12, r5
472; CHECK-NEXT:    csetm r7, lt
473; CHECK-NEXT:    bfi r4, r7, #0, #8
474; CHECK-NEXT:    vmov r6, r7, d1
475; CHECK-NEXT:    asrl r6, r7, #31
476; CHECK-NEXT:    rsbs.w r3, r6, #-2147483648
477; CHECK-NEXT:    vmov q0[2], q0[0], r10, r6
478; CHECK-NEXT:    sbcs.w r3, r12, r7
479; CHECK-NEXT:    vmov q0[3], q0[1], r5, r7
480; CHECK-NEXT:    csetm r3, lt
481; CHECK-NEXT:    vmov r7, s18
482; CHECK-NEXT:    bfi r4, r3, #8, #8
483; CHECK-NEXT:    vmsr p0, r4
484; CHECK-NEXT:    vpsel q0, q0, q2
485; CHECK-NEXT:    vmov r3, r4, d0
486; CHECK-NEXT:    subs.w r3, r3, r8
487; CHECK-NEXT:    sbcs r3, r4, #0
488; CHECK-NEXT:    mov.w r4, #0
489; CHECK-NEXT:    csetm r3, lt
490; CHECK-NEXT:    bfi r4, r3, #0, #8
491; CHECK-NEXT:    vmov r3, r5, d1
492; CHECK-NEXT:    subs.w r3, r3, r8
493; CHECK-NEXT:    sbcs r3, r5, #0
494; CHECK-NEXT:    csetm r3, lt
495; CHECK-NEXT:    bfi r4, r3, #8, #8
496; CHECK-NEXT:    vmov r3, s16
497; CHECK-NEXT:    vmsr p0, r4
498; CHECK-NEXT:    vmov r4, s20
499; CHECK-NEXT:    vpsel q6, q0, q3
500; CHECK-NEXT:    vmov.f32 s2, s17
501; CHECK-NEXT:    smull r10, r5, r4, r3
502; CHECK-NEXT:    movs r4, #0
503; CHECK-NEXT:    asrl r10, r5, #31
504; CHECK-NEXT:    rsbs.w r3, r10, #-2147483648
505; CHECK-NEXT:    sbcs.w r3, r12, r5
506; CHECK-NEXT:    csetm r3, lt
507; CHECK-NEXT:    bfi r4, r3, #0, #8
508; CHECK-NEXT:    vmov r3, s2
509; CHECK-NEXT:    smull r6, r3, r7, r3
510; CHECK-NEXT:    asrl r6, r3, #31
511; CHECK-NEXT:    rsbs.w r7, r6, #-2147483648
512; CHECK-NEXT:    vmov q0[2], q0[0], r10, r6
513; CHECK-NEXT:    sbcs.w r7, r12, r3
514; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
515; CHECK-NEXT:    csetm r7, lt
516; CHECK-NEXT:    bfi r4, r7, #8, #8
517; CHECK-NEXT:    vmsr p0, r4
518; CHECK-NEXT:    vpsel q0, q0, q2
519; CHECK-NEXT:    vmov r3, r4, d0
520; CHECK-NEXT:    subs.w r3, r3, r8
521; CHECK-NEXT:    sbcs r3, r4, #0
522; CHECK-NEXT:    mov.w r4, #0
523; CHECK-NEXT:    csetm r3, lt
524; CHECK-NEXT:    bfi r4, r3, #0, #8
525; CHECK-NEXT:    vmov r3, r5, d1
526; CHECK-NEXT:    subs.w r3, r3, r8
527; CHECK-NEXT:    sbcs r3, r5, #0
528; CHECK-NEXT:    csetm r3, lt
529; CHECK-NEXT:    bfi r4, r3, #8, #8
530; CHECK-NEXT:    vmsr p0, r4
531; CHECK-NEXT:    vpsel q0, q0, q3
532; CHECK-NEXT:    vldr p0, [sp, #20] @ 4-byte Reload
533; CHECK-NEXT:    vmov.f32 s1, s2
534; CHECK-NEXT:    vmov.f32 s2, s24
535; CHECK-NEXT:    vmov.f32 s3, s26
536; CHECK-NEXT:    vpst
537; CHECK-NEXT:    vstrwt.32 q0, [r2], #16
538; CHECK-NEXT:    le lr, .LBB2_2
539; CHECK-NEXT:  .LBB2_3: @ %for.cond.cleanup
540; CHECK-NEXT:    add sp, #24
541; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
542; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
543; CHECK-NEXT:    .p2align 4
544; CHECK-NEXT:  @ %bb.4:
545; CHECK-NEXT:  .LCPI2_0:
546; CHECK-NEXT:    .long 0 @ 0x0
547; CHECK-NEXT:    .long 1 @ 0x1
548; CHECK-NEXT:    .long 2 @ 0x2
549; CHECK-NEXT:    .long 3 @ 0x3
550; CHECK-NEXT:  .LCPI2_1:
551; CHECK-NEXT:    .long 2147483648 @ 0x80000000
552; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
553; CHECK-NEXT:    .long 2147483648 @ 0x80000000
554; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
555; CHECK-NEXT:  .LCPI2_2:
556; CHECK-NEXT:    .long 2147483647 @ 0x7fffffff
557; CHECK-NEXT:    .long 0 @ 0x0
558; CHECK-NEXT:    .long 2147483647 @ 0x7fffffff
559; CHECK-NEXT:    .long 0 @ 0x0
; N == 0 returns immediately.
560entry:
561  %cmp8 = icmp eq i32 %N, 0
562  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
563
; n.vec is N rounded up to a multiple of 4 (the loop bound);
; broadcast.splat21 holds N - 1 in every lane for the lane-mask compare.
564vector.ph:                                        ; preds = %entry
565  %n.rnd.up = add i32 %N, 3
566  %n.vec = and i32 %n.rnd.up, -4
567  %trip.count.minus.1 = add i32 %N, -1
568  %broadcast.splatinsert20 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
569  %broadcast.splat21 = shufflevector <4 x i32> %broadcast.splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
570  br label %vector.body
571
; Four lanes per iteration. %induction is {index, index+1, index+2, index+3};
; %0 is the lane mask (lane index <= N - 1) passed to both masked loads and
; the masked store. The multiply/shift/clamp sequence runs on all lanes.
572vector.body:                                      ; preds = %vector.body, %vector.ph
573  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
574  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
575  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
576  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
577  %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
578  %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
579  %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
580  %0 = icmp ule <4 x i32> %induction, %broadcast.splat21
581  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %next.gep, i32 4, <4 x i1> %0, <4 x i32> undef)
582  %1 = sext <4 x i32> %wide.masked.load to <4 x i64>
583  %wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %next.gep18, i32 4, <4 x i1> %0, <4 x i32> undef)
584  %2 = sext <4 x i32> %wide.masked.load22 to <4 x i64>
585  %3 = mul nsw <4 x i64> %2, %1
586  %4 = ashr <4 x i64> %3, <i64 31, i64 31, i64 31, i64 31>
587  %5 = icmp sgt <4 x i64> %4, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
588  %6 = select <4 x i1> %5, <4 x i64> %4, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
589  %7 = icmp slt <4 x i64> %6, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
590  %8 = select <4 x i1> %7, <4 x i64> %6, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
591  %9 = trunc <4 x i64> %8 to <4 x i32>
592  call void @llvm.masked.store.v4i32.p0(<4 x i32> %9, ptr %next.gep19, i32 4, <4 x i1> %0)
593  %index.next = add i32 %index, 4
594  %10 = icmp eq i32 %index.next, %n.vec
595  br i1 %10, label %for.cond.cleanup, label %vector.body
596
597for.cond.cleanup:                                 ; preds = %vector.body, %entry
598  ret void
599}
600
; usatmul_2_q31: unsigned saturating multiply over N i32 elements.
; Per element: zero-extend both inputs to i64, multiply, logical-shift right
; by 31, clamp from above to 4294967295 (icmp ult/select), truncate to i32 and
; store to pDst. vector.body processes N & -2 elements as <2 x i32>, and the
; scalar for.body handles what is left.
; The assertion lines between the define line and the entry label are the
; autogenerated expected llc output; regenerate them rather than hand-editing.
601define arm_aapcs_vfpcc void @usatmul_2_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
602; CHECK-LABEL: usatmul_2_q31:
603; CHECK:       @ %bb.0: @ %entry
604; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
605; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
606; CHECK-NEXT:    .pad #4
607; CHECK-NEXT:    sub sp, #4
608; CHECK-NEXT:    cmp r3, #0
609; CHECK-NEXT:    beq .LBB3_8
610; CHECK-NEXT:  @ %bb.1: @ %entry
611; CHECK-NEXT:    mov r8, r2
612; CHECK-NEXT:    cmp r3, #1
613; CHECK-NEXT:    bne .LBB3_3
614; CHECK-NEXT:  @ %bb.2:
615; CHECK-NEXT:    movs r7, #0
616; CHECK-NEXT:    mov r12, r0
617; CHECK-NEXT:    mov r11, r1
618; CHECK-NEXT:    mov r2, r8
619; CHECK-NEXT:    b .LBB3_6
620; CHECK-NEXT:  .LBB3_3: @ %vector.ph
621; CHECK-NEXT:    bic r5, r3, #1
622; CHECK-NEXT:    movs r6, #1
623; CHECK-NEXT:    subs r7, r5, #2
624; CHECK-NEXT:    str r5, [sp] @ 4-byte Spill
625; CHECK-NEXT:    add.w r2, r8, r5, lsl #2
626; CHECK-NEXT:    add.w r11, r1, r5, lsl #2
627; CHECK-NEXT:    add.w lr, r6, r7, lsr #1
628; CHECK-NEXT:    add.w r12, r0, r5, lsl #2
629; CHECK-NEXT:    vmov.i8 q0, #0xff
630; CHECK-NEXT:  .LBB3_4: @ %vector.body
631; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
632; CHECK-NEXT:    ldrd r4, r9, [r0], #8
633; CHECK-NEXT:    ldrd r5, r10, [r1], #8
634; CHECK-NEXT:    umull r4, r5, r5, r4
635; CHECK-NEXT:    lsrl r4, r5, #31
636; CHECK-NEXT:    subs.w r6, r4, #-1
637; CHECK-NEXT:    sbcs r5, r5, #0
638; CHECK-NEXT:    mov.w r6, #0
639; CHECK-NEXT:    csetm r5, lo
640; CHECK-NEXT:    bfi r6, r5, #0, #8
641; CHECK-NEXT:    umull r10, r5, r10, r9
642; CHECK-NEXT:    lsrl r10, r5, #31
643; CHECK-NEXT:    subs.w r7, r10, #-1
644; CHECK-NEXT:    vmov q1[2], q1[0], r4, r10
645; CHECK-NEXT:    sbcs r5, r5, #0
646; CHECK-NEXT:    csetm r5, lo
647; CHECK-NEXT:    bfi r6, r5, #8, #8
648; CHECK-NEXT:    vmsr p0, r6
649; CHECK-NEXT:    vpsel q1, q1, q0
650; CHECK-NEXT:    vmov r4, s6
651; CHECK-NEXT:    vmov r5, s4
652; CHECK-NEXT:    strd r5, r4, [r8], #8
653; CHECK-NEXT:    le lr, .LBB3_4
654; CHECK-NEXT:  @ %bb.5: @ %middle.block
655; CHECK-NEXT:    ldr r7, [sp] @ 4-byte Reload
656; CHECK-NEXT:    cmp r7, r3
657; CHECK-NEXT:    beq .LBB3_8
658; CHECK-NEXT:  .LBB3_6: @ %for.body.preheader
659; CHECK-NEXT:    sub.w lr, r3, r7
660; CHECK-NEXT:  .LBB3_7: @ %for.body
661; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
662; CHECK-NEXT:    ldr r0, [r12], #4
663; CHECK-NEXT:    ldr r1, [r11], #4
664; CHECK-NEXT:    umull r0, r1, r1, r0
665; CHECK-NEXT:    lsrl r0, r1, #31
666; CHECK-NEXT:    subs.w r3, r0, #-1
667; CHECK-NEXT:    sbcs r1, r1, #0
668; CHECK-NEXT:    it hs
669; CHECK-NEXT:    movhs.w r0, #-1
670; CHECK-NEXT:    str r0, [r2], #4
671; CHECK-NEXT:    le lr, .LBB3_7
672; CHECK-NEXT:  .LBB3_8: @ %for.cond.cleanup
673; CHECK-NEXT:    add sp, #4
674; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; Dispatch on N: 0 returns immediately, 1 goes straight to the scalar loop,
; any other value takes the vector path.
675entry:
676  switch i32 %N, label %vector.ph [
677    i32 0, label %for.cond.cleanup
678    i32 1, label %for.body.preheader
679  ]
680
; n.vec is N rounded down to a multiple of 2; the ind.end* pointers mark where
; the scalar remainder loop resumes after the vector loop.
681vector.ph:                                        ; preds = %entry
682  %n.vec = and i32 %N, -2
683  %ind.end = getelementptr i32, ptr %pSrcA, i32 %n.vec
684  %ind.end15 = getelementptr i32, ptr %pSrcB, i32 %n.vec
685  %ind.end17 = getelementptr i32, ptr %pDst, i32 %n.vec
686  br label %vector.body
687
; Two elements per iteration: zero-extend, multiply, shift, then a single
; upper clamp (no lower clamp is applied in this unsigned variant).
688vector.body:                                      ; preds = %vector.body, %vector.ph
689  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
690  %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
691  %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
692  %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
693  %wide.load = load <2 x i32>, ptr %next.gep, align 4
694  %0 = zext <2 x i32> %wide.load to <2 x i64>
695  %wide.load20 = load <2 x i32>, ptr %next.gep18, align 4
696  %1 = zext <2 x i32> %wide.load20 to <2 x i64>
697  %2 = mul nuw <2 x i64> %1, %0
698  %3 = lshr <2 x i64> %2, <i64 31, i64 31>
699  %4 = icmp ult <2 x i64> %3, <i64 4294967295, i64 4294967295>
700  %5 = select <2 x i1> %4, <2 x i64> %3, <2 x i64> <i64 4294967295, i64 4294967295>
701  %6 = trunc <2 x i64> %5 to <2 x i32>
702  store <2 x i32> %6, ptr %next.gep19, align 4
703  %index.next = add i32 %index, 2
704  %7 = icmp eq i32 %index.next, %n.vec
705  br i1 %7, label %middle.block, label %vector.body
706
; Skip the scalar loop when the vector loop already covered all N elements.
707middle.block:                                     ; preds = %vector.body
708  %cmp.n = icmp eq i32 %n.vec, %N
709  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader
710
711for.body.preheader:                               ; preds = %entry, %middle.block
712  %i.012.ph = phi i32 [ 0, %entry ], [ %n.vec, %middle.block ]
713  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %entry ], [ %ind.end, %middle.block ]
714  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %entry ], [ %ind.end15, %middle.block ]
715  %pDst.addr.09.ph = phi ptr [ %pDst, %entry ], [ %ind.end17, %middle.block ]
716  br label %for.body
717
718for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
719  ret void
720
; Scalar remainder: the same zero-extend/multiply/shift/clamp sequence, one
; element per iteration, until the index reaches N.
721for.body:                                         ; preds = %for.body.preheader, %for.body
722  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader ]
723  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader ]
724  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader ]
725  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader ]
726  %incdec.ptr = getelementptr inbounds i32, ptr %pSrcA.addr.011, i32 1
727  %8 = load i32, ptr %pSrcA.addr.011, align 4
728  %conv = zext i32 %8 to i64
729  %incdec.ptr1 = getelementptr inbounds i32, ptr %pSrcB.addr.010, i32 1
730  %9 = load i32, ptr %pSrcB.addr.010, align 4
731  %conv2 = zext i32 %9 to i64
732  %mul = mul nuw i64 %conv2, %conv
733  %shr = lshr i64 %mul, 31
734  %10 = icmp ult i64 %shr, 4294967295
735  %retval.0.i = select i1 %10, i64 %shr, i64 4294967295
736  %conv3 = trunc i64 %retval.0.i to i32
737  %incdec.ptr4 = getelementptr inbounds i32, ptr %pDst.addr.09, i32 1
738  store i32 %conv3, ptr %pDst.addr.09, align 4
739  %inc = add nuw i32 %i.012, 1
740  %exitcond = icmp eq i32 %inc, %N
741  br i1 %exitcond, label %for.cond.cleanup, label %for.body
742}
743
; usatmul_4_q31: element-wise unsigned multiply of two i32 arrays. Each pair is
; zero-extended to i64, multiplied, logically shifted right by 31, clamped to
; 4294967295 and truncated back to i32. The loop is already vectorized by four
; (<4 x i32> loads/stores) with a scalar remainder loop for the last N % 4
; elements.
; In the expected assembly below, the vector loop widens with vmullb.u32, moves
; each 64-bit product into a GPR pair for lsrl and a subs/sbcs compare, and
; applies the clamp with vpsel against a 0xffffffff splat.
744define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
745; CHECK-LABEL: usatmul_4_q31:
746; CHECK:       @ %bb.0: @ %entry
747; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
748; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
749; CHECK-NEXT:    .pad #4
750; CHECK-NEXT:    sub sp, #4
751; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
752; CHECK-NEXT:    vpush {d8, d9, d10, d11}
753; CHECK-NEXT:    cmp r3, #0
754; CHECK-NEXT:    beq.w .LBB4_8
755; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
756; CHECK-NEXT:    mov.w r8, #0
757; CHECK-NEXT:    cmp r3, #3
758; CHECK-NEXT:    bhi .LBB4_3
759; CHECK-NEXT:  @ %bb.2:
760; CHECK-NEXT:    mov r12, r0
761; CHECK-NEXT:    mov r9, r1
762; CHECK-NEXT:    mov r11, r2
763; CHECK-NEXT:    b .LBB4_6
764; CHECK-NEXT:  .LBB4_3: @ %vector.ph
765; CHECK-NEXT:    bic r8, r3, #3
766; CHECK-NEXT:    movs r6, #1
767; CHECK-NEXT:    sub.w r7, r8, #4
768; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
769; CHECK-NEXT:    add.w r11, r2, r8, lsl #2
770; CHECK-NEXT:    add.w r9, r1, r8, lsl #2
771; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
772; CHECK-NEXT:    add.w r12, r0, r8, lsl #2
773; CHECK-NEXT:  .LBB4_4: @ %vector.body
774; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
775; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
776; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
777; CHECK-NEXT:    vmov.f32 s12, s6
778; CHECK-NEXT:    vmov.f32 s14, s7
779; CHECK-NEXT:    vmov.f32 s16, s10
780; CHECK-NEXT:    vmov.f32 s18, s11
781; CHECK-NEXT:    vmullb.u32 q5, q4, q3
782; CHECK-NEXT:    vmov.f32 s6, s5
783; CHECK-NEXT:    vmov r10, r5, d10
784; CHECK-NEXT:    lsrl r10, r5, #31
785; CHECK-NEXT:    vmov.f32 s10, s9
786; CHECK-NEXT:    subs.w r6, r10, #-1
787; CHECK-NEXT:    sbcs r5, r5, #0
788; CHECK-NEXT:    mov.w r6, #0
789; CHECK-NEXT:    csetm r5, lo
790; CHECK-NEXT:    vmullb.u32 q4, q2, q1
791; CHECK-NEXT:    bfi r6, r5, #0, #8
792; CHECK-NEXT:    vmov r4, r5, d11
793; CHECK-NEXT:    lsrl r4, r5, #31
794; CHECK-NEXT:    subs.w r7, r4, #-1
795; CHECK-NEXT:    vmov q3[2], q3[0], r10, r4
796; CHECK-NEXT:    sbcs r5, r5, #0
797; CHECK-NEXT:    csetm r5, lo
798; CHECK-NEXT:    bfi r6, r5, #8, #8
799; CHECK-NEXT:    vmov r10, r5, d8
800; CHECK-NEXT:    lsrl r10, r5, #31
801; CHECK-NEXT:    vmsr p0, r6
802; CHECK-NEXT:    subs.w r6, r10, #-1
803; CHECK-NEXT:    vpsel q3, q3, q0
804; CHECK-NEXT:    sbcs r5, r5, #0
805; CHECK-NEXT:    mov.w r6, #0
806; CHECK-NEXT:    csetm r5, lo
807; CHECK-NEXT:    bfi r6, r5, #0, #8
808; CHECK-NEXT:    vmov r4, r5, d9
809; CHECK-NEXT:    lsrl r4, r5, #31
810; CHECK-NEXT:    subs.w r7, r4, #-1
811; CHECK-NEXT:    vmov q1[2], q1[0], r10, r4
812; CHECK-NEXT:    sbcs r5, r5, #0
813; CHECK-NEXT:    csetm r5, lo
814; CHECK-NEXT:    bfi r6, r5, #8, #8
815; CHECK-NEXT:    vmsr p0, r6
816; CHECK-NEXT:    vpsel q1, q1, q0
817; CHECK-NEXT:    vmov.f32 s5, s6
818; CHECK-NEXT:    vmov.f32 s6, s12
819; CHECK-NEXT:    vmov.f32 s7, s14
820; CHECK-NEXT:    vstrb.8 q1, [r2], #16
821; CHECK-NEXT:    le lr, .LBB4_4
822; CHECK-NEXT:  @ %bb.5: @ %middle.block
823; CHECK-NEXT:    cmp r8, r3
824; CHECK-NEXT:    beq .LBB4_8
825; CHECK-NEXT:  .LBB4_6: @ %for.body.preheader21
826; CHECK-NEXT:    sub.w lr, r3, r8
827; CHECK-NEXT:  .LBB4_7: @ %for.body
828; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
829; CHECK-NEXT:    ldr r0, [r12], #4
830; CHECK-NEXT:    ldr r1, [r9], #4
831; CHECK-NEXT:    umull r0, r1, r1, r0
832; CHECK-NEXT:    lsrl r0, r1, #31
833; CHECK-NEXT:    subs.w r2, r0, #-1
834; CHECK-NEXT:    sbcs r1, r1, #0
835; CHECK-NEXT:    it hs
836; CHECK-NEXT:    movhs.w r0, #-1
837; CHECK-NEXT:    str r0, [r11], #4
838; CHECK-NEXT:    le lr, .LBB4_7
839; CHECK-NEXT:  .LBB4_8: @ %for.cond.cleanup
840; CHECK-NEXT:    vpop {d8, d9, d10, d11}
841; CHECK-NEXT:    add sp, #4
842; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
843entry:
  ; N == 0: branch straight to the return block.
844  %cmp8 = icmp eq i32 %N, 0
845  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
846
847for.body.preheader:                               ; preds = %entry
  ; Fewer than four elements: bypass the vector loop and use only the scalar loop.
848  %min.iters.check = icmp ult i32 %N, 4
849  br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
850
851for.body.preheader21:                             ; preds = %middle.block, %for.body.preheader
  ; Start state for the scalar loop: index 0 and the original pointers when the
  ; vector loop was bypassed, otherwise %n.vec and the pointers just past the
  ; vectorized elements.
852  %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
853  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
854  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
855  %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
856  br label %for.body
857
858vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is N with the low two bits cleared (N rounded down to a multiple of 4).
859  %n.vec = and i32 %N, -4
860  %ind.end = getelementptr i32, ptr %pSrcA, i32 %n.vec
861  %ind.end15 = getelementptr i32, ptr %pSrcB, i32 %n.vec
862  %ind.end17 = getelementptr i32, ptr %pDst, i32 %n.vec
863  br label %vector.body
864
865vector.body:                                      ; preds = %vector.body, %vector.ph
  ; Four lanes per iteration: zext -> mul -> lshr 31 -> unsigned clamp to
  ; 4294967295 (icmp ult + select) -> trunc -> store.
866  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
867  %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
868  %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
869  %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
870  %wide.load = load <4 x i32>, ptr %next.gep, align 4
871  %0 = zext <4 x i32> %wide.load to <4 x i64>
872  %wide.load20 = load <4 x i32>, ptr %next.gep18, align 4
873  %1 = zext <4 x i32> %wide.load20 to <4 x i64>
874  %2 = mul nuw <4 x i64> %1, %0
875  %3 = lshr <4 x i64> %2, <i64 31, i64 31, i64 31, i64 31>
876  %4 = icmp ult <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
877  %5 = select <4 x i1> %4, <4 x i64> %3, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
878  %6 = trunc <4 x i64> %5 to <4 x i32>
879  store <4 x i32> %6, ptr %next.gep19, align 4
880  %index.next = add i32 %index, 4
881  %7 = icmp eq i32 %index.next, %n.vec
882  br i1 %7, label %middle.block, label %vector.body
883
884middle.block:                                     ; preds = %vector.body
  ; The scalar loop is needed only when N is not a multiple of 4.
885  %cmp.n = icmp eq i32 %n.vec, %N
886  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
887
888for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
889  ret void
890
891for.body:                                         ; preds = %for.body.preheader21, %for.body
  ; Scalar form of the same computation, one element per iteration.
892  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
893  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
894  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
895  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
896  %incdec.ptr = getelementptr inbounds i32, ptr %pSrcA.addr.011, i32 1
897  %8 = load i32, ptr %pSrcA.addr.011, align 4
898  %conv = zext i32 %8 to i64
899  %incdec.ptr1 = getelementptr inbounds i32, ptr %pSrcB.addr.010, i32 1
900  %9 = load i32, ptr %pSrcB.addr.010, align 4
901  %conv2 = zext i32 %9 to i64
902  %mul = mul nuw i64 %conv2, %conv
903  %shr = lshr i64 %mul, 31
904  %10 = icmp ult i64 %shr, 4294967295
905  %retval.0.i = select i1 %10, i64 %shr, i64 4294967295
906  %conv3 = trunc i64 %retval.0.i to i32
907  %incdec.ptr4 = getelementptr inbounds i32, ptr %pDst.addr.09, i32 1
908  store i32 %conv3, ptr %pDst.addr.09, align 4
909  %inc = add nuw i32 %i.012, 1
910  %exitcond = icmp eq i32 %inc, %N
911  br i1 %exitcond, label %for.cond.cleanup, label %for.body
912}
913
914
915; i16
916
; ssatmul_4_q15: element-wise signed multiply of two i16 arrays. Each pair is
; sign-extended to i32, multiplied, arithmetically shifted right by 15, clamped
; to [-32768, 32767] and truncated back to i16. The loop is vectorized by four
; (<4 x i16> loads/stores) with a scalar remainder loop.
; In the expected assembly below, the vector loop is vldrh.s32 / vmul.i32 /
; vqshrnb.s32 #15 / vstrh.32, and the scalar loop uses ssat #16 with asr #15.
917define arm_aapcs_vfpcc void @ssatmul_4_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
918; CHECK-LABEL: ssatmul_4_q15:
919; CHECK:       @ %bb.0: @ %entry
920; CHECK-NEXT:    .save {r4, r5, r6, lr}
921; CHECK-NEXT:    push {r4, r5, r6, lr}
922; CHECK-NEXT:    cbz r3, .LBB5_8
923; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
924; CHECK-NEXT:    cmp r3, #3
925; CHECK-NEXT:    bhi .LBB5_3
926; CHECK-NEXT:  @ %bb.2:
927; CHECK-NEXT:    movs r5, #0
928; CHECK-NEXT:    mov r12, r0
929; CHECK-NEXT:    mov r6, r1
930; CHECK-NEXT:    mov r4, r2
931; CHECK-NEXT:    b .LBB5_6
932; CHECK-NEXT:  .LBB5_3: @ %vector.ph
933; CHECK-NEXT:    bic r5, r3, #3
934; CHECK-NEXT:    movs r4, #1
935; CHECK-NEXT:    subs r6, r5, #4
936; CHECK-NEXT:    add.w r12, r0, r5, lsl #1
937; CHECK-NEXT:    add.w lr, r4, r6, lsr #2
938; CHECK-NEXT:    add.w r4, r2, r5, lsl #1
939; CHECK-NEXT:    add.w r6, r1, r5, lsl #1
940; CHECK-NEXT:  .LBB5_4: @ %vector.body
941; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
942; CHECK-NEXT:    vldrh.s32 q0, [r0], #8
943; CHECK-NEXT:    vldrh.s32 q1, [r1], #8
944; CHECK-NEXT:    vmul.i32 q0, q1, q0
945; CHECK-NEXT:    vqshrnb.s32 q0, q0, #15
946; CHECK-NEXT:    vstrh.32 q0, [r2], #8
947; CHECK-NEXT:    le lr, .LBB5_4
948; CHECK-NEXT:  @ %bb.5: @ %middle.block
949; CHECK-NEXT:    cmp r5, r3
950; CHECK-NEXT:    it eq
951; CHECK-NEXT:    popeq {r4, r5, r6, pc}
952; CHECK-NEXT:  .LBB5_6: @ %for.body.preheader21
953; CHECK-NEXT:    sub.w lr, r3, r5
954; CHECK-NEXT:  .LBB5_7: @ %for.body
955; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
956; CHECK-NEXT:    ldrsh r0, [r12], #2
957; CHECK-NEXT:    ldrsh r1, [r6], #2
958; CHECK-NEXT:    muls r0, r1, r0
959; CHECK-NEXT:    ssat r0, #16, r0, asr #15
960; CHECK-NEXT:    strh r0, [r4], #2
961; CHECK-NEXT:    le lr, .LBB5_7
962; CHECK-NEXT:  .LBB5_8: @ %for.cond.cleanup
963; CHECK-NEXT:    pop {r4, r5, r6, pc}
964entry:
  ; N == 0: branch straight to the return block.
965  %cmp8 = icmp eq i32 %N, 0
966  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
967
968for.body.preheader:                               ; preds = %entry
  ; Fewer than four elements: bypass the vector loop and use only the scalar loop.
969  %min.iters.check = icmp ult i32 %N, 4
970  br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
971
972for.body.preheader21:                             ; preds = %middle.block, %for.body.preheader
  ; Start state for the scalar loop: the original pointers when the vector loop
  ; was bypassed, otherwise the positions just past the vectorized elements.
973  %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
974  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
975  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
976  %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
977  br label %for.body
978
979vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is N with the low two bits cleared (N rounded down to a multiple of 4).
980  %n.vec = and i32 %N, -4
981  %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec
982  %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec
983  %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec
984  br label %vector.body
985
986vector.body:                                      ; preds = %vector.body, %vector.ph
  ; Four lanes per iteration: sext -> mul -> ashr 15 -> clamp below at -32768
  ; (icmp sgt + select) -> clamp above at 32767 (icmp slt + select) -> trunc.
987  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
988  %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
989  %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
990  %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
991  %wide.load = load <4 x i16>, ptr %next.gep, align 2
992  %0 = sext <4 x i16> %wide.load to <4 x i32>
993  %wide.load20 = load <4 x i16>, ptr %next.gep18, align 2
994  %1 = sext <4 x i16> %wide.load20 to <4 x i32>
995  %2 = mul nsw <4 x i32> %1, %0
996  %3 = ashr <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15>
997  %4 = icmp sgt <4 x i32> %3, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
998  %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
999  %6 = icmp slt <4 x i32> %5, <i32 32767, i32 32767, i32 32767, i32 32767>
1000  %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
1001  %8 = trunc <4 x i32> %7 to <4 x i16>
1002  store <4 x i16> %8, ptr %next.gep19, align 2
1003  %index.next = add i32 %index, 4
1004  %9 = icmp eq i32 %index.next, %n.vec
1005  br i1 %9, label %middle.block, label %vector.body
1006
1007middle.block:                                     ; preds = %vector.body
  ; The scalar loop is needed only when N is not a multiple of 4.
1008  %cmp.n = icmp eq i32 %n.vec, %N
1009  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1010
1011for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1012  ret void
1013
1014for.body:                                         ; preds = %for.body.preheader21, %for.body
  ; Scalar form of the same computation, one element per iteration.
1015  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1016  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1017  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1018  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1019  %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1
1020  %10 = load i16, ptr %pSrcA.addr.011, align 2
1021  %conv = sext i16 %10 to i32
1022  %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1
1023  %11 = load i16, ptr %pSrcB.addr.010, align 2
1024  %conv2 = sext i16 %11 to i32
1025  %mul = mul nsw i32 %conv2, %conv
1026  %shr = ashr i32 %mul, 15
1027  %12 = icmp sgt i32 %shr, -32768
1028  %.val.i = select i1 %12, i32 %shr, i32 -32768
1029  %13 = icmp slt i32 %.val.i, 32767
1030  %retval.0.i = select i1 %13, i32 %.val.i, i32 32767
1031  %conv3 = trunc i32 %retval.0.i to i16
1032  %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1
1033  store i16 %conv3, ptr %pDst.addr.09, align 2
1034  %inc = add nuw i32 %i.012, 1
1035  %exitcond = icmp eq i32 %inc, %N
1036  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1037}
1038
; ssatmul_8_q15: same signed i16 multiply / ashr 15 / clamp to [-32768, 32767]
; computation as the four-lane variant, but the vector loop handles eight
; elements per iteration (<8 x i16> loads/stores, <8 x i32> intermediate) and
; the scalar remainder loop covers the last N % 8 elements.
; In the expected assembly below, the <8 x i32> multiply is split into
; vmullb.s16 / vmullt.s16, and the results are narrowed into one register with
; vqshrnb.s32 / vqshrnt.s32 #15.
1039define arm_aapcs_vfpcc void @ssatmul_8_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1040; CHECK-LABEL: ssatmul_8_q15:
1041; CHECK:       @ %bb.0: @ %entry
1042; CHECK-NEXT:    .save {r4, r5, r6, lr}
1043; CHECK-NEXT:    push {r4, r5, r6, lr}
1044; CHECK-NEXT:    cbz r3, .LBB6_8
1045; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1046; CHECK-NEXT:    cmp r3, #7
1047; CHECK-NEXT:    bhi .LBB6_3
1048; CHECK-NEXT:  @ %bb.2:
1049; CHECK-NEXT:    movs r5, #0
1050; CHECK-NEXT:    mov r12, r0
1051; CHECK-NEXT:    mov r6, r1
1052; CHECK-NEXT:    mov r4, r2
1053; CHECK-NEXT:    b .LBB6_6
1054; CHECK-NEXT:  .LBB6_3: @ %vector.ph
1055; CHECK-NEXT:    bic r5, r3, #7
1056; CHECK-NEXT:    movs r4, #1
1057; CHECK-NEXT:    sub.w r6, r5, #8
1058; CHECK-NEXT:    add.w r12, r0, r5, lsl #1
1059; CHECK-NEXT:    add.w lr, r4, r6, lsr #3
1060; CHECK-NEXT:    add.w r4, r2, r5, lsl #1
1061; CHECK-NEXT:    add.w r6, r1, r5, lsl #1
1062; CHECK-NEXT:  .LBB6_4: @ %vector.body
1063; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1064; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
1065; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
1066; CHECK-NEXT:    vmullt.s16 q2, q1, q0
1067; CHECK-NEXT:    vmullb.s16 q0, q1, q0
1068; CHECK-NEXT:    vqshrnb.s32 q0, q0, #15
1069; CHECK-NEXT:    vqshrnt.s32 q0, q2, #15
1070; CHECK-NEXT:    vstrb.8 q0, [r2], #16
1071; CHECK-NEXT:    le lr, .LBB6_4
1072; CHECK-NEXT:  @ %bb.5: @ %middle.block
1073; CHECK-NEXT:    cmp r5, r3
1074; CHECK-NEXT:    it eq
1075; CHECK-NEXT:    popeq {r4, r5, r6, pc}
1076; CHECK-NEXT:  .LBB6_6: @ %for.body.preheader21
1077; CHECK-NEXT:    sub.w lr, r3, r5
1078; CHECK-NEXT:  .LBB6_7: @ %for.body
1079; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1080; CHECK-NEXT:    ldrsh r0, [r12], #2
1081; CHECK-NEXT:    ldrsh r1, [r6], #2
1082; CHECK-NEXT:    muls r0, r1, r0
1083; CHECK-NEXT:    ssat r0, #16, r0, asr #15
1084; CHECK-NEXT:    strh r0, [r4], #2
1085; CHECK-NEXT:    le lr, .LBB6_7
1086; CHECK-NEXT:  .LBB6_8: @ %for.cond.cleanup
1087; CHECK-NEXT:    pop {r4, r5, r6, pc}
1088entry:
  ; N == 0: branch straight to the return block.
1089  %cmp8 = icmp eq i32 %N, 0
1090  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1091
1092for.body.preheader:                               ; preds = %entry
  ; Fewer than eight elements: bypass the vector loop and use only the scalar loop.
1093  %min.iters.check = icmp ult i32 %N, 8
1094  br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1095
1096for.body.preheader21:                             ; preds = %middle.block, %for.body.preheader
  ; Start state for the scalar loop: the original pointers when the vector loop
  ; was bypassed, otherwise the positions just past the vectorized elements.
1097  %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1098  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1099  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1100  %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1101  br label %for.body
1102
1103vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is N with the low three bits cleared (N rounded down to a multiple of 8).
1104  %n.vec = and i32 %N, -8
1105  %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec
1106  %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec
1107  %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec
1108  br label %vector.body
1109
1110vector.body:                                      ; preds = %vector.body, %vector.ph
  ; Eight lanes per iteration: sext -> mul -> ashr 15 -> clamp below at -32768
  ; -> clamp above at 32767 -> trunc -> store.
1111  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1112  %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
1113  %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
1114  %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
1115  %wide.load = load <8 x i16>, ptr %next.gep, align 2
1116  %0 = sext <8 x i16> %wide.load to <8 x i32>
1117  %wide.load20 = load <8 x i16>, ptr %next.gep18, align 2
1118  %1 = sext <8 x i16> %wide.load20 to <8 x i32>
1119  %2 = mul nsw <8 x i32> %1, %0
1120  %3 = ashr <8 x i32> %2, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
1121  %4 = icmp sgt <8 x i32> %3, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1122  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1123  %6 = icmp slt <8 x i32> %5, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
1124  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
1125  %8 = trunc <8 x i32> %7 to <8 x i16>
1126  store <8 x i16> %8, ptr %next.gep19, align 2
1127  %index.next = add i32 %index, 8
1128  %9 = icmp eq i32 %index.next, %n.vec
1129  br i1 %9, label %middle.block, label %vector.body
1130
1131middle.block:                                     ; preds = %vector.body
  ; The scalar loop is needed only when N is not a multiple of 8.
1132  %cmp.n = icmp eq i32 %n.vec, %N
1133  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1134
1135for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1136  ret void
1137
1138for.body:                                         ; preds = %for.body.preheader21, %for.body
  ; Scalar form of the same computation, one element per iteration.
1139  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1140  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1141  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1142  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1143  %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1
1144  %10 = load i16, ptr %pSrcA.addr.011, align 2
1145  %conv = sext i16 %10 to i32
1146  %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1
1147  %11 = load i16, ptr %pSrcB.addr.010, align 2
1148  %conv2 = sext i16 %11 to i32
1149  %mul = mul nsw i32 %conv2, %conv
1150  %shr = ashr i32 %mul, 15
1151  %12 = icmp sgt i32 %shr, -32768
1152  %.val.i = select i1 %12, i32 %shr, i32 -32768
1153  %13 = icmp slt i32 %.val.i, 32767
1154  %retval.0.i = select i1 %13, i32 %.val.i, i32 32767
1155  %conv3 = trunc i32 %retval.0.i to i16
1156  %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1
1157  store i16 %conv3, ptr %pDst.addr.09, align 2
1158  %inc = add nuw i32 %i.012, 1
1159  %exitcond = icmp eq i32 %inc, %N
1160  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1161}
1161}
1162
; ssatmul_8i_q15: eight-element signed i16 multiply / ashr 15 / clamp to
; [-32768, 32767], written in de-interleaved form. Each <8 x i16> load is split
; by shufflevector into its even lanes (0,2,4,6) and odd lanes (1,3,5,7), the
; two <4 x i32> halves are processed separately, and the results are
; re-interleaved (0,4,1,5,2,6,3,7) before the truncating store.
; The expected assembly for the vector loop uses the same instruction sequence
; as ssatmul_8_q15 (vmullt/vmullb.s16 followed by vqshrnb/vqshrnt.s32 #15).
1163define arm_aapcs_vfpcc void @ssatmul_8i_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1164; CHECK-LABEL: ssatmul_8i_q15:
1165; CHECK:       @ %bb.0: @ %entry
1166; CHECK-NEXT:    .save {r4, r5, r6, lr}
1167; CHECK-NEXT:    push {r4, r5, r6, lr}
1168; CHECK-NEXT:    cbz r3, .LBB7_8
1169; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1170; CHECK-NEXT:    cmp r3, #7
1171; CHECK-NEXT:    bhi .LBB7_3
1172; CHECK-NEXT:  @ %bb.2:
1173; CHECK-NEXT:    movs r5, #0
1174; CHECK-NEXT:    mov r12, r0
1175; CHECK-NEXT:    mov r6, r1
1176; CHECK-NEXT:    mov r4, r2
1177; CHECK-NEXT:    b .LBB7_6
1178; CHECK-NEXT:  .LBB7_3: @ %vector.ph
1179; CHECK-NEXT:    bic r5, r3, #7
1180; CHECK-NEXT:    movs r4, #1
1181; CHECK-NEXT:    sub.w r6, r5, #8
1182; CHECK-NEXT:    add.w r12, r0, r5, lsl #1
1183; CHECK-NEXT:    add.w lr, r4, r6, lsr #3
1184; CHECK-NEXT:    add.w r4, r2, r5, lsl #1
1185; CHECK-NEXT:    add.w r6, r1, r5, lsl #1
1186; CHECK-NEXT:  .LBB7_4: @ %vector.body
1187; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1188; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
1189; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
1190; CHECK-NEXT:    vmullt.s16 q2, q1, q0
1191; CHECK-NEXT:    vmullb.s16 q0, q1, q0
1192; CHECK-NEXT:    vqshrnb.s32 q0, q0, #15
1193; CHECK-NEXT:    vqshrnt.s32 q0, q2, #15
1194; CHECK-NEXT:    vstrb.8 q0, [r2], #16
1195; CHECK-NEXT:    le lr, .LBB7_4
1196; CHECK-NEXT:  @ %bb.5: @ %middle.block
1197; CHECK-NEXT:    cmp r5, r3
1198; CHECK-NEXT:    it eq
1199; CHECK-NEXT:    popeq {r4, r5, r6, pc}
1200; CHECK-NEXT:  .LBB7_6: @ %for.body.preheader21
1201; CHECK-NEXT:    sub.w lr, r3, r5
1202; CHECK-NEXT:  .LBB7_7: @ %for.body
1203; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1204; CHECK-NEXT:    ldrsh r0, [r12], #2
1205; CHECK-NEXT:    ldrsh r1, [r6], #2
1206; CHECK-NEXT:    muls r0, r1, r0
1207; CHECK-NEXT:    ssat r0, #16, r0, asr #15
1208; CHECK-NEXT:    strh r0, [r4], #2
1209; CHECK-NEXT:    le lr, .LBB7_7
1210; CHECK-NEXT:  .LBB7_8: @ %for.cond.cleanup
1211; CHECK-NEXT:    pop {r4, r5, r6, pc}
1212entry:
  ; N == 0: branch straight to the return block.
1213  %cmp8 = icmp eq i32 %N, 0
1214  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1215
1216for.body.preheader:                               ; preds = %entry
  ; Fewer than eight elements: bypass the vector loop and use only the scalar loop.
1217  %min.iters.check = icmp ult i32 %N, 8
1218  br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1219
1220for.body.preheader21:                             ; preds = %middle.block, %for.body.preheader
  ; Start state for the scalar loop: the original pointers when the vector loop
  ; was bypassed, otherwise the positions just past the vectorized elements.
1221  %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1222  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1223  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1224  %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1225  br label %for.body
1226
1227vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is N with the low three bits cleared (N rounded down to a multiple of 8).
1228  %n.vec = and i32 %N, -8
1229  %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec
1230  %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec
1231  %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec
1232  br label %vector.body
1233
1234vector.body:                                      ; preds = %vector.body, %vector.ph
  ; Split each 8-lane load into even and odd lanes, run the sext / mul / ashr 15
  ; / clamp chain on both <4 x i32> halves, then interleave the halves back into
  ; their original lane order and truncate. Note the multiplies here carry no
  ; nsw flag, unlike the scalar loop below.
1235  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1236  %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
1237  %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
1238  %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
1239  %wide.load = load <8 x i16>, ptr %next.gep, align 2
1240  %0 = shufflevector <8 x i16> %wide.load, <8 x i16> %wide.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1241  %1 = shufflevector <8 x i16> %wide.load, <8 x i16> %wide.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1242  %2 = sext <4 x i16> %0 to <4 x i32>
1243  %3 = sext <4 x i16> %1 to <4 x i32>
1244  %wide.load20 = load <8 x i16>, ptr %next.gep18, align 2
1245  %4 = shufflevector <8 x i16> %wide.load20, <8 x i16> %wide.load20, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1246  %5 = shufflevector <8 x i16> %wide.load20, <8 x i16> %wide.load20, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1247  %6 = sext <4 x i16> %4 to <4 x i32>
1248  %7 = sext <4 x i16> %5 to <4 x i32>
1249  %8 = mul <4 x i32> %6, %2
1250  %9 = mul <4 x i32> %7, %3
1251  %10 = ashr <4 x i32> %8, <i32 15, i32 15, i32 15, i32 15>
1252  %11 = ashr <4 x i32> %9, <i32 15, i32 15, i32 15, i32 15>
1253  %12 = icmp sgt <4 x i32> %10, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1254  %13 = icmp sgt <4 x i32> %11, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1255  %14 = select <4 x i1> %12, <4 x i32> %10, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1256  %15 = select <4 x i1> %13, <4 x i32> %11, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1257  %16 = icmp slt <4 x i32> %14, <i32 32767, i32 32767, i32 32767, i32 32767>
1258  %17 = icmp slt <4 x i32> %15, <i32 32767, i32 32767, i32 32767, i32 32767>
1259  %18 = select <4 x i1> %16, <4 x i32> %14, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
1260  %19 = select <4 x i1> %17, <4 x i32> %15, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
1261  %20 = shufflevector <4 x i32> %18, <4 x i32> %19, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1262  %21 = trunc <8 x i32> %20 to <8 x i16>
1263  store <8 x i16> %21, ptr %next.gep19, align 2
1264  %index.next = add i32 %index, 8
1265  %22 = icmp eq i32 %index.next, %n.vec
1266  br i1 %22, label %middle.block, label %vector.body
1267
1268middle.block:                                     ; preds = %vector.body
  ; The scalar loop is needed only when N is not a multiple of 8.
1269  %cmp.n = icmp eq i32 %n.vec, %N
1270  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1271
1272for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1273  ret void
1274
1275for.body:                                         ; preds = %for.body, %for.body.preheader21
  ; Scalar form of the same computation, one element per iteration.
1276  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1277  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1278  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1279  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1280  %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1
1281  %23 = load i16, ptr %pSrcA.addr.011, align 2
1282  %conv = sext i16 %23 to i32
1283  %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1
1284  %24 = load i16, ptr %pSrcB.addr.010, align 2
1285  %conv2 = sext i16 %24 to i32
1286  %mul = mul nsw i32 %conv2, %conv
1287  %shr = ashr i32 %mul, 15
1288  %25 = icmp sgt i32 %shr, -32768
1289  %.val.i = select i1 %25, i32 %shr, i32 -32768
1290  %26 = icmp slt i32 %.val.i, 32767
1291  %retval.0.i = select i1 %26, i32 %.val.i, i32 32767
1292  %conv3 = trunc i32 %retval.0.i to i16
1293  %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1
1294  store i16 %conv3, ptr %pDst.addr.09, align 2
1295  %inc = add nuw i32 %i.012, 1
1296  %exitcond = icmp eq i32 %inc, %N
1297  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1298}
1299
; ssatmul_s4t_q15: four-lane signed i16 multiply / ashr 15 / clamp to
; [-32768, 32767] with no scalar remainder loop. The trip count is rounded up
; to a multiple of 4 and every iteration uses masked loads and a masked store,
; with the mask enabling only lanes whose element index is <= N-1.
; In the expected assembly below, the lane predicate is built with vdup.32 /
; vorr against the {0,1,2,3} constant-pool vector and a vptt.u32 compare; both
; loads are predicated (vldrht.s32) and the store is predicated through vpst +
; vstrht.32.
1300define arm_aapcs_vfpcc void @ssatmul_s4t_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1301; CHECK-LABEL: ssatmul_s4t_q15:
1302; CHECK:       @ %bb.0: @ %entry
1303; CHECK-NEXT:    .save {r4, lr}
1304; CHECK-NEXT:    push {r4, lr}
1305; CHECK-NEXT:    cmp r3, #0
1306; CHECK-NEXT:    it eq
1307; CHECK-NEXT:    popeq {r4, pc}
1308; CHECK-NEXT:  .LBB8_1: @ %vector.ph
1309; CHECK-NEXT:    add.w r12, r3, #3
1310; CHECK-NEXT:    mov.w lr, #1
1311; CHECK-NEXT:    bic r12, r12, #3
1312; CHECK-NEXT:    adr r4, .LCPI8_0
1313; CHECK-NEXT:    sub.w r12, r12, #4
1314; CHECK-NEXT:    vldrw.u32 q0, [r4]
1315; CHECK-NEXT:    add.w lr, lr, r12, lsr #2
1316; CHECK-NEXT:    sub.w r12, r3, #1
1317; CHECK-NEXT:    movs r3, #0
1318; CHECK-NEXT:    vdup.32 q1, r12
1319; CHECK-NEXT:  .LBB8_2: @ %vector.body
1320; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1321; CHECK-NEXT:    vdup.32 q2, r3
1322; CHECK-NEXT:    adds r3, #4
1323; CHECK-NEXT:    vorr q2, q2, q0
1324; CHECK-NEXT:    vptt.u32 cs, q1, q2
1325; CHECK-NEXT:    vldrht.s32 q2, [r0], #8
1326; CHECK-NEXT:    vldrht.s32 q3, [r1], #8
1327; CHECK-NEXT:    vmul.i32 q2, q3, q2
1328; CHECK-NEXT:    vqshrnb.s32 q2, q2, #15
1329; CHECK-NEXT:    vpst
1330; CHECK-NEXT:    vstrht.32 q2, [r2], #8
1331; CHECK-NEXT:    le lr, .LBB8_2
1332; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1333; CHECK-NEXT:    pop {r4, pc}
1334; CHECK-NEXT:    .p2align 4
1335; CHECK-NEXT:  @ %bb.4:
1336; CHECK-NEXT:  .LCPI8_0:
1337; CHECK-NEXT:    .long 0 @ 0x0
1338; CHECK-NEXT:    .long 1 @ 0x1
1339; CHECK-NEXT:    .long 2 @ 0x2
1340; CHECK-NEXT:    .long 3 @ 0x3
1341entry:
  ; N == 0: branch straight to the return block.
1342  %cmp8 = icmp eq i32 %N, 0
1343  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
1344
1345vector.ph:                                        ; preds = %entry
  ; %n.vec is N rounded up to a multiple of 4 (the loop exit bound);
  ; %broadcast.splat21 is N-1 splatted to all four lanes (the mask bound).
1346  %n.rnd.up = add i32 %N, 3
1347  %n.vec = and i32 %n.rnd.up, -4
1348  %trip.count.minus.1 = add i32 %N, -1
1349  %broadcast.splatinsert20 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
1350  %broadcast.splat21 = shufflevector <4 x i32> %broadcast.splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
1351  br label %vector.body
1352
1353vector.body:                                      ; preds = %vector.body, %vector.ph
  ; %induction holds the per-lane element indices (splat of %index OR'ed with
  ; 0..3); mask %0 is true for lanes whose index is <= N-1. The masked loads
  ; and the masked store all use that same mask.
1354  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1355  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
1356  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
1357  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
1358  %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
1359  %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
1360  %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
1361  %0 = icmp ule <4 x i32> %induction, %broadcast.splat21
1362  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %next.gep, i32 2, <4 x i1> %0, <4 x i16> undef)
1363  %1 = sext <4 x i16> %wide.masked.load to <4 x i32>
1364  %wide.masked.load22 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %next.gep18, i32 2, <4 x i1> %0, <4 x i16> undef)
1365  %2 = sext <4 x i16> %wide.masked.load22 to <4 x i32>
1366  %3 = mul nsw <4 x i32> %2, %1
1367  %4 = ashr <4 x i32> %3, <i32 15, i32 15, i32 15, i32 15>
1368  %5 = icmp sgt <4 x i32> %4, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1369  %6 = select <4 x i1> %5, <4 x i32> %4, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1370  %7 = icmp slt <4 x i32> %6, <i32 32767, i32 32767, i32 32767, i32 32767>
1371  %8 = select <4 x i1> %7, <4 x i32> %6, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
1372  %9 = trunc <4 x i32> %8 to <4 x i16>
1373  call void @llvm.masked.store.v4i16.p0(<4 x i16> %9, ptr %next.gep19, i32 2, <4 x i1> %0)
1374  %index.next = add i32 %index, 4
1375  %10 = icmp eq i32 %index.next, %n.vec
1376  br i1 %10, label %for.cond.cleanup, label %vector.body
1377
1378for.cond.cleanup:                                 ; preds = %vector.body, %entry
1379  ret void
1380}
1381
; ssatmul_8t_q15: signed saturating multiply over i16 data, written as a
; masked (predicated) 8-lane vector loop with no scalar remainder loop.
; Per element the IR computes
;   trunc(clamp((sext(b) * sext(a)) >> 15, -32768, 32767)).
; The expected assembly below lowers the multiply/shift/clamp to
; vmullb/vmullt.s16 followed by vqshrnb/vqshrnt.s32 #15, and builds the 8-lane
; predicate from two 4-lane vcmp.u32 results that are stored to a stack slot
; and reloaded as one vector.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1382define arm_aapcs_vfpcc void @ssatmul_8t_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1383; CHECK-LABEL: ssatmul_8t_q15:
1384; CHECK:       @ %bb.0: @ %entry
1385; CHECK-NEXT:    .save {r4, r5, r7, lr}
1386; CHECK-NEXT:    push {r4, r5, r7, lr}
1387; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1388; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1389; CHECK-NEXT:    .pad #16
1390; CHECK-NEXT:    sub sp, #16
1391; CHECK-NEXT:    cmp r3, #0
1392; CHECK-NEXT:    beq .LBB9_3
1393; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1394; CHECK-NEXT:    adds r4, r3, #7
1395; CHECK-NEXT:    vmov.i8 q2, #0x0
1396; CHECK-NEXT:    bic r4, r4, #7
1397; CHECK-NEXT:    vmov.i8 q3, #0xff
1398; CHECK-NEXT:    sub.w r12, r4, #8
1399; CHECK-NEXT:    movs r4, #1
1400; CHECK-NEXT:    mov r5, sp
1401; CHECK-NEXT:    add.w lr, r4, r12, lsr #3
1402; CHECK-NEXT:    adr r4, .LCPI9_0
1403; CHECK-NEXT:    vldrw.u32 q0, [r4]
1404; CHECK-NEXT:    adr r4, .LCPI9_1
1405; CHECK-NEXT:    sub.w r12, r3, #1
1406; CHECK-NEXT:    vldrw.u32 q4, [r4]
1407; CHECK-NEXT:    movs r3, #0
1408; CHECK-NEXT:    vdup.32 q1, r12
1409; CHECK-NEXT:  .LBB9_2: @ %vector.body
1410; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1411; CHECK-NEXT:    vdup.32 q5, r3
1412; CHECK-NEXT:    adds r3, #8
1413; CHECK-NEXT:    vorr q6, q5, q0
1414; CHECK-NEXT:    vorr q5, q5, q4
1415; CHECK-NEXT:    vcmp.u32 cs, q1, q6
1416; CHECK-NEXT:    vpsel q6, q3, q2
1417; CHECK-NEXT:    vcmp.u32 cs, q1, q5
1418; CHECK-NEXT:    vpsel q5, q3, q2
1419; CHECK-NEXT:    vstrh.32 q6, [r5, #8]
1420; CHECK-NEXT:    vstrh.32 q5, [r5]
1421; CHECK-NEXT:    vldrw.u32 q5, [r5]
1422; CHECK-NEXT:    vptt.i16 ne, q5, zr
1423; CHECK-NEXT:    vldrht.u16 q5, [r0], #16
1424; CHECK-NEXT:    vldrht.u16 q6, [r1], #16
1425; CHECK-NEXT:    vmullt.s16 q7, q6, q5
1426; CHECK-NEXT:    vmullb.s16 q5, q6, q5
1427; CHECK-NEXT:    vqshrnb.s32 q5, q5, #15
1428; CHECK-NEXT:    vqshrnt.s32 q5, q7, #15
1429; CHECK-NEXT:    vpst
1430; CHECK-NEXT:    vstrht.16 q5, [r2], #16
1431; CHECK-NEXT:    le lr, .LBB9_2
1432; CHECK-NEXT:  .LBB9_3: @ %for.cond.cleanup
1433; CHECK-NEXT:    add sp, #16
1434; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1435; CHECK-NEXT:    pop {r4, r5, r7, pc}
1436; CHECK-NEXT:    .p2align 4
1437; CHECK-NEXT:  @ %bb.4:
1438; CHECK-NEXT:  .LCPI9_0:
1439; CHECK-NEXT:    .long 4 @ 0x4
1440; CHECK-NEXT:    .long 5 @ 0x5
1441; CHECK-NEXT:    .long 6 @ 0x6
1442; CHECK-NEXT:    .long 7 @ 0x7
1443; CHECK-NEXT:  .LCPI9_1:
1444; CHECK-NEXT:    .long 0 @ 0x0
1445; CHECK-NEXT:    .long 1 @ 0x1
1446; CHECK-NEXT:    .long 2 @ 0x2
1447; CHECK-NEXT:    .long 3 @ 0x3
1448entry:
  ; Skip the loop entirely when N == 0.
1449  %cmp8 = icmp eq i32 %N, 0
1450  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
1451
1452vector.ph:                                        ; preds = %entry
  ; %n.vec is N rounded up to a multiple of 8 and serves as the loop bound;
  ; N-1 is splatted so each lane can be compared against it for the mask.
1453  %n.rnd.up = add i32 %N, 7
1454  %n.vec = and i32 %n.rnd.up, -8
1455  %trip.count.minus.1 = add i32 %N, -1
1456  %broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
1457  %broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer
1458  br label %vector.body
1459
1460vector.body:                                      ; preds = %vector.body, %vector.ph
1461  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Per-lane element indices: splat(%index) combined with <0..7>. %index starts
  ; at 0 and advances in steps of 8, so the `or` behaves like an add here.
1462  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
1463  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
1464  %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1465  %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
1466  %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
1467  %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
  ; Lane mask: a lane is active while its element index is <= N-1 (unsigned).
1468  %0 = icmp ule <8 x i32> %induction, %broadcast.splat21
1469  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep, i32 2, <8 x i1> %0, <8 x i16> undef)
1470  %1 = sext <8 x i16> %wide.masked.load to <8 x i32>
1471  %wide.masked.load22 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep18, i32 2, <8 x i1> %0, <8 x i16> undef)
1472  %2 = sext <8 x i16> %wide.masked.load22 to <8 x i32>
  ; 32-bit product, arithmetic shift right by 15, then clamp to
  ; [-32768, 32767] (select-based max followed by min) before truncating.
1473  %3 = mul nsw <8 x i32> %2, %1
1474  %4 = ashr <8 x i32> %3, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
1475  %5 = icmp sgt <8 x i32> %4, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1476  %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1477  %7 = icmp slt <8 x i32> %6, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
1478  %8 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
1479  %9 = trunc <8 x i32> %8 to <8 x i16>
  ; Masked store: only the active lanes are written to pDst.
1480  call void @llvm.masked.store.v8i16.p0(<8 x i16> %9, ptr %next.gep19, i32 2, <8 x i1> %0)
1481  %index.next = add i32 %index, 8
1482  %10 = icmp eq i32 %index.next, %n.vec
1483  br i1 %10, label %for.cond.cleanup, label %vector.body
1484
1485for.cond.cleanup:                                 ; preds = %vector.body, %entry
1486  ret void
1487}
1488
; ssatmul_8ti_q15: interleaved variant of the masked 8-lane signed saturating
; i16 multiply. Each masked <8 x i16> load is split into its even and odd
; lanes, the two <4 x i32> halves are multiplied, shifted right by 15 and
; clamped to [-32768, 32767] separately, then re-interleaved, truncated and
; written with a masked store.
; The expected assembly below is the same instruction sequence as for
; ssatmul_8t_q15 (vmullb/vmullt.s16 + vqshrnb/vqshrnt.s32 #15), i.e. the
; explicit even/odd shuffles are expected to fold into the bottom/top forms.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1489define arm_aapcs_vfpcc void @ssatmul_8ti_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1490; CHECK-LABEL: ssatmul_8ti_q15:
1491; CHECK:       @ %bb.0: @ %entry
1492; CHECK-NEXT:    .save {r4, r5, r7, lr}
1493; CHECK-NEXT:    push {r4, r5, r7, lr}
1494; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1495; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1496; CHECK-NEXT:    .pad #16
1497; CHECK-NEXT:    sub sp, #16
1498; CHECK-NEXT:    cmp r3, #0
1499; CHECK-NEXT:    beq .LBB10_3
1500; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1501; CHECK-NEXT:    adds r4, r3, #7
1502; CHECK-NEXT:    vmov.i8 q2, #0x0
1503; CHECK-NEXT:    bic r4, r4, #7
1504; CHECK-NEXT:    vmov.i8 q3, #0xff
1505; CHECK-NEXT:    sub.w r12, r4, #8
1506; CHECK-NEXT:    movs r4, #1
1507; CHECK-NEXT:    mov r5, sp
1508; CHECK-NEXT:    add.w lr, r4, r12, lsr #3
1509; CHECK-NEXT:    adr r4, .LCPI10_0
1510; CHECK-NEXT:    vldrw.u32 q0, [r4]
1511; CHECK-NEXT:    adr r4, .LCPI10_1
1512; CHECK-NEXT:    sub.w r12, r3, #1
1513; CHECK-NEXT:    vldrw.u32 q4, [r4]
1514; CHECK-NEXT:    movs r3, #0
1515; CHECK-NEXT:    vdup.32 q1, r12
1516; CHECK-NEXT:  .LBB10_2: @ %vector.body
1517; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1518; CHECK-NEXT:    vdup.32 q5, r3
1519; CHECK-NEXT:    adds r3, #8
1520; CHECK-NEXT:    vorr q6, q5, q0
1521; CHECK-NEXT:    vorr q5, q5, q4
1522; CHECK-NEXT:    vcmp.u32 cs, q1, q6
1523; CHECK-NEXT:    vpsel q6, q3, q2
1524; CHECK-NEXT:    vcmp.u32 cs, q1, q5
1525; CHECK-NEXT:    vpsel q5, q3, q2
1526; CHECK-NEXT:    vstrh.32 q6, [r5, #8]
1527; CHECK-NEXT:    vstrh.32 q5, [r5]
1528; CHECK-NEXT:    vldrw.u32 q5, [r5]
1529; CHECK-NEXT:    vptt.i16 ne, q5, zr
1530; CHECK-NEXT:    vldrht.u16 q5, [r0], #16
1531; CHECK-NEXT:    vldrht.u16 q6, [r1], #16
1532; CHECK-NEXT:    vmullt.s16 q7, q6, q5
1533; CHECK-NEXT:    vmullb.s16 q5, q6, q5
1534; CHECK-NEXT:    vqshrnb.s32 q5, q5, #15
1535; CHECK-NEXT:    vqshrnt.s32 q5, q7, #15
1536; CHECK-NEXT:    vpst
1537; CHECK-NEXT:    vstrht.16 q5, [r2], #16
1538; CHECK-NEXT:    le lr, .LBB10_2
1539; CHECK-NEXT:  .LBB10_3: @ %for.cond.cleanup
1540; CHECK-NEXT:    add sp, #16
1541; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1542; CHECK-NEXT:    pop {r4, r5, r7, pc}
1543; CHECK-NEXT:    .p2align 4
1544; CHECK-NEXT:  @ %bb.4:
1545; CHECK-NEXT:  .LCPI10_0:
1546; CHECK-NEXT:    .long 4 @ 0x4
1547; CHECK-NEXT:    .long 5 @ 0x5
1548; CHECK-NEXT:    .long 6 @ 0x6
1549; CHECK-NEXT:    .long 7 @ 0x7
1550; CHECK-NEXT:  .LCPI10_1:
1551; CHECK-NEXT:    .long 0 @ 0x0
1552; CHECK-NEXT:    .long 1 @ 0x1
1553; CHECK-NEXT:    .long 2 @ 0x2
1554; CHECK-NEXT:    .long 3 @ 0x3
1555entry:
  ; Skip the loop entirely when N == 0.
1556  %cmp8 = icmp eq i32 %N, 0
1557  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
1558
1559vector.ph:                                        ; preds = %entry
  ; %n.vec is N rounded up to a multiple of 8 and serves as the loop bound;
  ; N-1 is splatted so each lane can be compared against it for the mask.
1560  %n.rnd.up = add i32 %N, 7
1561  %n.vec = and i32 %n.rnd.up, -8
1562  %trip.count.minus.1 = add i32 %N, -1
1563  %broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
1564  %broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer
1565  br label %vector.body
1566
1567vector.body:                                      ; preds = %vector.body, %vector.ph
1568  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Per-lane element indices: splat(%index) combined with <0..7>. %index starts
  ; at 0 and advances in steps of 8, so the `or` behaves like an add here.
1569  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
1570  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
1571  %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1572  %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
1573  %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
1574  %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
  ; Lane mask: a lane is active while its element index is <= N-1 (unsigned).
1575  %0 = icmp ule <8 x i32> %induction, %broadcast.splat21
1576  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep, i32 2, <8 x i1> %0, <8 x i16> undef)
  ; De-interleave the first operand: %1 takes lanes 0,2,4,6 and %2 lanes 1,3,5,7.
1577  %1 = shufflevector <8 x i16> %wide.masked.load, <8 x i16> %wide.masked.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1578  %2 = shufflevector <8 x i16> %wide.masked.load, <8 x i16> %wide.masked.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1579  %3 = sext <4 x i16> %1 to <4 x i32>
1580  %4 = sext <4 x i16> %2 to <4 x i32>
1581  %wide.masked.load22 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep18, i32 2, <8 x i1> %0, <8 x i16> undef)
  ; Same even/odd split for the second operand.
1582  %5 = shufflevector <8 x i16> %wide.masked.load22, <8 x i16> %wide.masked.load22, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1583  %6 = shufflevector <8 x i16> %wide.masked.load22, <8 x i16> %wide.masked.load22, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1584  %7 = sext <4 x i16> %5 to <4 x i32>
1585  %8 = sext <4 x i16> %6 to <4 x i32>
  ; Even-lane and odd-lane products (note: no nsw flag on these multiplies),
  ; each shifted right by 15 and clamped to [-32768, 32767].
1586  %9 = mul <4 x i32> %7, %3
1587  %10 = mul <4 x i32> %8, %4
1588  %11 = ashr <4 x i32> %9, <i32 15, i32 15, i32 15, i32 15>
1589  %12 = ashr <4 x i32> %10, <i32 15, i32 15, i32 15, i32 15>
1590  %13 = icmp sgt <4 x i32> %11, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1591  %14 = icmp sgt <4 x i32> %12, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1592  %15 = select <4 x i1> %13, <4 x i32> %11, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1593  %16 = select <4 x i1> %14, <4 x i32> %12, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1594  %17 = icmp slt <4 x i32> %15, <i32 32767, i32 32767, i32 32767, i32 32767>
1595  %18 = icmp slt <4 x i32> %16, <i32 32767, i32 32767, i32 32767, i32 32767>
1596  %19 = select <4 x i1> %17, <4 x i32> %15, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
1597  %20 = select <4 x i1> %18, <4 x i32> %16, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
  ; Re-interleave: even results go back to lanes 0,2,4,6 and odd results to
  ; lanes 1,3,5,7, restoring the original element order before the truncate.
1598  %21 = shufflevector <4 x i32> %19, <4 x i32> %20, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1599  %22 = trunc <8 x i32> %21 to <8 x i16>
  ; Masked store: only the active lanes are written to pDst.
1600  call void @llvm.masked.store.v8i16.p0(<8 x i16> %22, ptr %next.gep19, i32 2, <8 x i1> %0)
1601  %index.next = add i32 %index, 8
1602  %23 = icmp eq i32 %index.next, %n.vec
1603  br i1 %23, label %for.cond.cleanup, label %vector.body
1604
1605for.cond.cleanup:                                 ; preds = %vector.body, %entry
1606  ret void
1607}
1608
; usatmul_4_q15: unsigned saturating multiply over i16 data, written as an
; unpredicated 4-lane vector loop plus a scalar remainder loop.
; Per element the IR computes trunc(min((zext(b) * zext(a)) >> 15, 65535)).
; Control flow: N == 0 returns immediately; N < 4 runs only the scalar loop;
; otherwise the vector loop handles N & -4 elements and the scalar loop
; handles whatever is left.
; The expected assembly below lowers the vector body to vldrh.u32 loads,
; vmul.i32, vqshrnb.u32 #15 and a vstrh.32 store.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1609define arm_aapcs_vfpcc void @usatmul_4_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1610; CHECK-LABEL: usatmul_4_q15:
1611; CHECK:       @ %bb.0: @ %entry
1612; CHECK-NEXT:    .save {r4, r5, r6, lr}
1613; CHECK-NEXT:    push {r4, r5, r6, lr}
1614; CHECK-NEXT:    cmp r3, #0
1615; CHECK-NEXT:    beq .LBB11_8
1616; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1617; CHECK-NEXT:    cmp r3, #3
1618; CHECK-NEXT:    bhi .LBB11_3
1619; CHECK-NEXT:  @ %bb.2:
1620; CHECK-NEXT:    movs r5, #0
1621; CHECK-NEXT:    mov r12, r0
1622; CHECK-NEXT:    mov r6, r1
1623; CHECK-NEXT:    mov r4, r2
1624; CHECK-NEXT:    b .LBB11_6
1625; CHECK-NEXT:  .LBB11_3: @ %vector.ph
1626; CHECK-NEXT:    bic r5, r3, #3
1627; CHECK-NEXT:    movs r4, #1
1628; CHECK-NEXT:    subs r6, r5, #4
1629; CHECK-NEXT:    add.w r12, r0, r5, lsl #1
1630; CHECK-NEXT:    add.w lr, r4, r6, lsr #2
1631; CHECK-NEXT:    add.w r4, r2, r5, lsl #1
1632; CHECK-NEXT:    add.w r6, r1, r5, lsl #1
1633; CHECK-NEXT:  .LBB11_4: @ %vector.body
1634; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1635; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
1636; CHECK-NEXT:    vldrh.u32 q1, [r1], #8
1637; CHECK-NEXT:    vmul.i32 q0, q1, q0
1638; CHECK-NEXT:    vqshrnb.u32 q0, q0, #15
1639; CHECK-NEXT:    vstrh.32 q0, [r2], #8
1640; CHECK-NEXT:    le lr, .LBB11_4
1641; CHECK-NEXT:  @ %bb.5: @ %middle.block
1642; CHECK-NEXT:    cmp r5, r3
1643; CHECK-NEXT:    it eq
1644; CHECK-NEXT:    popeq {r4, r5, r6, pc}
1645; CHECK-NEXT:  .LBB11_6: @ %for.body.preheader21
1646; CHECK-NEXT:    sub.w lr, r3, r5
1647; CHECK-NEXT:    movw r0, #65535
1648; CHECK-NEXT:  .LBB11_7: @ %for.body
1649; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1650; CHECK-NEXT:    ldrh r1, [r12], #2
1651; CHECK-NEXT:    ldrh r2, [r6], #2
1652; CHECK-NEXT:    muls r1, r2, r1
1653; CHECK-NEXT:    lsrs r2, r1, #15
1654; CHECK-NEXT:    cmp r2, r0
1655; CHECK-NEXT:    movw r2, #65535
1656; CHECK-NEXT:    it lo
1657; CHECK-NEXT:    lsrlo r2, r1, #15
1658; CHECK-NEXT:    strh r2, [r4], #2
1659; CHECK-NEXT:    le lr, .LBB11_7
1660; CHECK-NEXT:  .LBB11_8: @ %for.cond.cleanup
1661; CHECK-NEXT:    pop {r4, r5, r6, pc}
1662entry:
  ; Nothing to do when N == 0.
1663  %cmp8 = icmp eq i32 %N, 0
1664  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1665
1666for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and go straight to the scalar loop.
1667  %min.iters.check = icmp ult i32 %N, 4
1668  br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1669
1670for.body.preheader21:                             ; preds = %middle.block, %for.body.preheader
  ; Starting index and pointers for the scalar loop: the original arguments
  ; when the vector loop was skipped, or the positions just past the
  ; vectorised elements when arriving from %middle.block.
1671  %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1672  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1673  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1674  %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1675  br label %for.body
1676
1677vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is N rounded down to a multiple of 4; the %ind.end* pointers mark
  ; where the scalar remainder loop resumes.
1678  %n.vec = and i32 %N, -4
1679  %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec
1680  %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec
1681  %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec
1682  br label %vector.body
1683
1684vector.body:                                      ; preds = %vector.body, %vector.ph
1685  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1686  %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
1687  %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
1688  %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
1689  %wide.load = load <4 x i16>, ptr %next.gep, align 2
1690  %0 = zext <4 x i16> %wide.load to <4 x i32>
1691  %wide.load20 = load <4 x i16>, ptr %next.gep18, align 2
1692  %1 = zext <4 x i16> %wide.load20 to <4 x i32>
  ; 32-bit unsigned product, logical shift right by 15, then an unsigned
  ; min against 65535 (only an upper clamp is needed) before truncating.
1693  %2 = mul nuw <4 x i32> %1, %0
1694  %3 = lshr <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15>
1695  %4 = icmp ult <4 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535>
1696  %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
1697  %6 = trunc <4 x i32> %5 to <4 x i16>
1698  store <4 x i16> %6, ptr %next.gep19, align 2
1699  %index.next = add i32 %index, 4
1700  %7 = icmp eq i32 %index.next, %n.vec
1701  br i1 %7, label %middle.block, label %vector.body
1702
1703middle.block:                                     ; preds = %vector.body
  ; If N was a multiple of 4 there is no remainder and the function is done.
1704  %cmp.n = icmp eq i32 %n.vec, %N
1705  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1706
1707for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1708  ret void
1709
1710for.body:                                         ; preds = %for.body.preheader21, %for.body
  ; Scalar remainder loop: one element per iteration, same arithmetic as the
  ; vector body.
1711  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1712  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1713  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1714  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1715  %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1
1716  %8 = load i16, ptr %pSrcA.addr.011, align 2
1717  %conv = zext i16 %8 to i32
1718  %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1
1719  %9 = load i16, ptr %pSrcB.addr.010, align 2
1720  %conv2 = zext i16 %9 to i32
1721  %mul = mul nuw i32 %conv2, %conv
1722  %shr = lshr i32 %mul, 15
  ; Unsigned min with 65535.
1723  %10 = icmp ult i32 %shr, 65535
1724  %retval.0.i = select i1 %10, i32 %shr, i32 65535
1725  %conv3 = trunc i32 %retval.0.i to i16
1726  %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1
1727  store i16 %conv3, ptr %pDst.addr.09, align 2
1728  %inc = add nuw i32 %i.012, 1
1729  %exitcond = icmp eq i32 %inc, %N
1730  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1731}
1732
; usatmul_8_q15: 8-lane version of the unsigned saturating i16 multiply, with
; an unpredicated vector loop plus a scalar remainder loop.
; Per element the IR computes trunc(min((zext(b) * zext(a)) >> 15, 65535)).
; Control flow: N == 0 returns immediately; N < 8 runs only the scalar loop;
; otherwise the vector loop handles N & -8 elements and the scalar loop
; handles whatever is left.
; The expected assembly below lowers the vector body to vldrh.u16 loads,
; vmullb/vmullt.u16, vqshrnb/vqshrnt.u32 #15 and a vstrb.8 store.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1733define arm_aapcs_vfpcc void @usatmul_8_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1734; CHECK-LABEL: usatmul_8_q15:
1735; CHECK:       @ %bb.0: @ %entry
1736; CHECK-NEXT:    .save {r4, r5, r6, lr}
1737; CHECK-NEXT:    push {r4, r5, r6, lr}
1738; CHECK-NEXT:    cmp r3, #0
1739; CHECK-NEXT:    beq .LBB12_8
1740; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1741; CHECK-NEXT:    cmp r3, #7
1742; CHECK-NEXT:    bhi .LBB12_3
1743; CHECK-NEXT:  @ %bb.2:
1744; CHECK-NEXT:    movs r5, #0
1745; CHECK-NEXT:    mov r12, r0
1746; CHECK-NEXT:    mov r6, r1
1747; CHECK-NEXT:    mov r4, r2
1748; CHECK-NEXT:    b .LBB12_6
1749; CHECK-NEXT:  .LBB12_3: @ %vector.ph
1750; CHECK-NEXT:    bic r5, r3, #7
1751; CHECK-NEXT:    movs r4, #1
1752; CHECK-NEXT:    sub.w r6, r5, #8
1753; CHECK-NEXT:    add.w r12, r0, r5, lsl #1
1754; CHECK-NEXT:    add.w lr, r4, r6, lsr #3
1755; CHECK-NEXT:    add.w r4, r2, r5, lsl #1
1756; CHECK-NEXT:    add.w r6, r1, r5, lsl #1
1757; CHECK-NEXT:  .LBB12_4: @ %vector.body
1758; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1759; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
1760; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
1761; CHECK-NEXT:    vmullt.u16 q2, q1, q0
1762; CHECK-NEXT:    vmullb.u16 q0, q1, q0
1763; CHECK-NEXT:    vqshrnb.u32 q0, q0, #15
1764; CHECK-NEXT:    vqshrnt.u32 q0, q2, #15
1765; CHECK-NEXT:    vstrb.8 q0, [r2], #16
1766; CHECK-NEXT:    le lr, .LBB12_4
1767; CHECK-NEXT:  @ %bb.5: @ %middle.block
1768; CHECK-NEXT:    cmp r5, r3
1769; CHECK-NEXT:    it eq
1770; CHECK-NEXT:    popeq {r4, r5, r6, pc}
1771; CHECK-NEXT:  .LBB12_6: @ %for.body.preheader21
1772; CHECK-NEXT:    sub.w lr, r3, r5
1773; CHECK-NEXT:    movw r0, #65535
1774; CHECK-NEXT:  .LBB12_7: @ %for.body
1775; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1776; CHECK-NEXT:    ldrh r1, [r12], #2
1777; CHECK-NEXT:    ldrh r2, [r6], #2
1778; CHECK-NEXT:    muls r1, r2, r1
1779; CHECK-NEXT:    lsrs r2, r1, #15
1780; CHECK-NEXT:    cmp r2, r0
1781; CHECK-NEXT:    movw r2, #65535
1782; CHECK-NEXT:    it lo
1783; CHECK-NEXT:    lsrlo r2, r1, #15
1784; CHECK-NEXT:    strh r2, [r4], #2
1785; CHECK-NEXT:    le lr, .LBB12_7
1786; CHECK-NEXT:  .LBB12_8: @ %for.cond.cleanup
1787; CHECK-NEXT:    pop {r4, r5, r6, pc}
1788entry:
  ; Nothing to do when N == 0.
1789  %cmp8 = icmp eq i32 %N, 0
1790  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1791
1792for.body.preheader:                               ; preds = %entry
  ; Fewer than 8 elements: bypass the vector loop and go straight to the scalar loop.
1793  %min.iters.check = icmp ult i32 %N, 8
1794  br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1795
1796for.body.preheader21:                             ; preds = %middle.block, %for.body.preheader
  ; Starting index and pointers for the scalar loop: the original arguments
  ; when the vector loop was skipped, or the positions just past the
  ; vectorised elements when arriving from %middle.block.
1797  %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1798  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1799  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1800  %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1801  br label %for.body
1802
1803vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is N rounded down to a multiple of 8; the %ind.end* pointers mark
  ; where the scalar remainder loop resumes.
1804  %n.vec = and i32 %N, -8
1805  %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec
1806  %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec
1807  %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec
1808  br label %vector.body
1809
1810vector.body:                                      ; preds = %vector.body, %vector.ph
1811  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1812  %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
1813  %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
1814  %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
1815  %wide.load = load <8 x i16>, ptr %next.gep, align 2
1816  %0 = zext <8 x i16> %wide.load to <8 x i32>
1817  %wide.load20 = load <8 x i16>, ptr %next.gep18, align 2
1818  %1 = zext <8 x i16> %wide.load20 to <8 x i32>
  ; 32-bit unsigned product, logical shift right by 15, then an unsigned
  ; min against 65535 (only an upper clamp is needed) before truncating.
1819  %2 = mul nuw <8 x i32> %1, %0
1820  %3 = lshr <8 x i32> %2, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
1821  %4 = icmp ult <8 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
1822  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
1823  %6 = trunc <8 x i32> %5 to <8 x i16>
1824  store <8 x i16> %6, ptr %next.gep19, align 2
1825  %index.next = add i32 %index, 8
1826  %7 = icmp eq i32 %index.next, %n.vec
1827  br i1 %7, label %middle.block, label %vector.body
1828
1829middle.block:                                     ; preds = %vector.body
  ; If N was a multiple of 8 there is no remainder and the function is done.
1830  %cmp.n = icmp eq i32 %n.vec, %N
1831  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1832
1833for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1834  ret void
1835
1836for.body:                                         ; preds = %for.body.preheader21, %for.body
  ; Scalar remainder loop: one element per iteration, same arithmetic as the
  ; vector body.
1837  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1838  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1839  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1840  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1841  %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1
1842  %8 = load i16, ptr %pSrcA.addr.011, align 2
1843  %conv = zext i16 %8 to i32
1844  %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1
1845  %9 = load i16, ptr %pSrcB.addr.010, align 2
1846  %conv2 = zext i16 %9 to i32
1847  %mul = mul nuw i32 %conv2, %conv
1848  %shr = lshr i32 %mul, 15
  ; Unsigned min with 65535.
1849  %10 = icmp ult i32 %shr, 65535
1850  %retval.0.i = select i1 %10, i32 %shr, i32 65535
1851  %conv3 = trunc i32 %retval.0.i to i16
1852  %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1
1853  store i16 %conv3, ptr %pDst.addr.09, align 2
1854  %inc = add nuw i32 %i.012, 1
1855  %exitcond = icmp eq i32 %inc, %N
1856  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1857}
1858
1859
1860; i8
1861
; ssatmul_4_q7: signed saturating multiply over i8 data, written as an
; unpredicated 4-lane vector loop plus a scalar remainder loop.
; Per element the IR computes
;   trunc(clamp((sext(b) * sext(a)) >> 7, -128, 127)).
; Control flow: N == 0 returns immediately; N < 4 runs only the scalar loop;
; otherwise the vector loop handles N & -4 elements and the scalar loop
; handles whatever is left.
; The expected assembly below keeps the vector clamp explicit: vldrb.s32
; loads, vmul.i32, vshr.s32 #7, then vmax.s32/vmin.s32 against splatted
; constants (vmvn.i32 #0x7f is -128, vmov.i32 #0x7f is 127) and a vstrb.32
; store. The scalar loop is expected to use ssat #8 with an asr #7 operand.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1862define arm_aapcs_vfpcc void @ssatmul_4_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1863; CHECK-LABEL: ssatmul_4_q7:
1864; CHECK:       @ %bb.0: @ %entry
1865; CHECK-NEXT:    .save {r4, r5, r6, lr}
1866; CHECK-NEXT:    push {r4, r5, r6, lr}
1867; CHECK-NEXT:    cmp r3, #0
1868; CHECK-NEXT:    beq .LBB13_8
1869; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1870; CHECK-NEXT:    cmp r3, #3
1871; CHECK-NEXT:    bhi .LBB13_3
1872; CHECK-NEXT:  @ %bb.2:
1873; CHECK-NEXT:    movs r5, #0
1874; CHECK-NEXT:    mov r12, r0
1875; CHECK-NEXT:    mov r6, r1
1876; CHECK-NEXT:    mov r4, r2
1877; CHECK-NEXT:    b .LBB13_6
1878; CHECK-NEXT:  .LBB13_3: @ %vector.ph
1879; CHECK-NEXT:    bic r5, r3, #3
1880; CHECK-NEXT:    movs r4, #1
1881; CHECK-NEXT:    subs r6, r5, #4
1882; CHECK-NEXT:    add.w r12, r0, r5
1883; CHECK-NEXT:    vmvn.i32 q0, #0x7f
1884; CHECK-NEXT:    vmov.i32 q1, #0x7f
1885; CHECK-NEXT:    add.w lr, r4, r6, lsr #2
1886; CHECK-NEXT:    adds r4, r2, r5
1887; CHECK-NEXT:    adds r6, r1, r5
1888; CHECK-NEXT:  .LBB13_4: @ %vector.body
1889; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1890; CHECK-NEXT:    vldrb.s32 q2, [r0], #4
1891; CHECK-NEXT:    vldrb.s32 q3, [r1], #4
1892; CHECK-NEXT:    vmul.i32 q2, q3, q2
1893; CHECK-NEXT:    vshr.s32 q2, q2, #7
1894; CHECK-NEXT:    vmax.s32 q2, q2, q0
1895; CHECK-NEXT:    vmin.s32 q2, q2, q1
1896; CHECK-NEXT:    vstrb.32 q2, [r2], #4
1897; CHECK-NEXT:    le lr, .LBB13_4
1898; CHECK-NEXT:  @ %bb.5: @ %middle.block
1899; CHECK-NEXT:    cmp r5, r3
1900; CHECK-NEXT:    it eq
1901; CHECK-NEXT:    popeq {r4, r5, r6, pc}
1902; CHECK-NEXT:  .LBB13_6: @ %for.body.preheader21
1903; CHECK-NEXT:    sub.w lr, r3, r5
1904; CHECK-NEXT:  .LBB13_7: @ %for.body
1905; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1906; CHECK-NEXT:    ldrsb r0, [r12], #1
1907; CHECK-NEXT:    ldrsb r1, [r6], #1
1908; CHECK-NEXT:    muls r0, r1, r0
1909; CHECK-NEXT:    ssat r0, #8, r0, asr #7
1910; CHECK-NEXT:    strb r0, [r4], #1
1911; CHECK-NEXT:    le lr, .LBB13_7
1912; CHECK-NEXT:  .LBB13_8: @ %for.cond.cleanup
1913; CHECK-NEXT:    pop {r4, r5, r6, pc}
1914entry:
  ; Nothing to do when N == 0.
1915  %cmp8 = icmp eq i32 %N, 0
1916  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1917
1918for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and go straight to the scalar loop.
1919  %min.iters.check = icmp ult i32 %N, 4
1920  br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1921
1922for.body.preheader21:                             ; preds = %middle.block, %for.body.preheader
  ; Starting index and pointers for the scalar loop: the original arguments
  ; when the vector loop was skipped, or the positions just past the
  ; vectorised elements when arriving from %middle.block.
1923  %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1924  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1925  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1926  %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1927  br label %for.body
1928
1929vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is N rounded down to a multiple of 4; the %ind.end* pointers mark
  ; where the scalar remainder loop resumes.
1930  %n.vec = and i32 %N, -4
1931  %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec
1932  %ind.end15 = getelementptr i8, ptr %pSrcB, i32 %n.vec
1933  %ind.end17 = getelementptr i8, ptr %pDst, i32 %n.vec
1934  br label %vector.body
1935
1936vector.body:                                      ; preds = %vector.body, %vector.ph
1937  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1938  %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
1939  %next.gep18 = getelementptr i8, ptr %pSrcB, i32 %index
1940  %next.gep19 = getelementptr i8, ptr %pDst, i32 %index
1941  %wide.load = load <4 x i8>, ptr %next.gep, align 1
1942  %0 = sext <4 x i8> %wide.load to <4 x i32>
1943  %wide.load20 = load <4 x i8>, ptr %next.gep18, align 1
1944  %1 = sext <4 x i8> %wide.load20 to <4 x i32>
  ; 32-bit signed product, arithmetic shift right by 7, then clamp to
  ; [-128, 127] (select-based max followed by min) before truncating to i8.
1945  %2 = mul nsw <4 x i32> %1, %0
1946  %3 = ashr <4 x i32> %2, <i32 7, i32 7, i32 7, i32 7>
1947  %4 = icmp sgt <4 x i32> %3, <i32 -128, i32 -128, i32 -128, i32 -128>
1948  %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 -128, i32 -128, i32 -128, i32 -128>
1949  %6 = icmp slt <4 x i32> %5, <i32 127, i32 127, i32 127, i32 127>
1950  %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 127, i32 127, i32 127, i32 127>
1951  %8 = trunc <4 x i32> %7 to <4 x i8>
1952  store <4 x i8> %8, ptr %next.gep19, align 1
1953  %index.next = add i32 %index, 4
1954  %9 = icmp eq i32 %index.next, %n.vec
1955  br i1 %9, label %middle.block, label %vector.body
1956
1957middle.block:                                     ; preds = %vector.body
  ; If N was a multiple of 4 there is no remainder and the function is done.
1958  %cmp.n = icmp eq i32 %n.vec, %N
1959  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1960
1961for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1962  ret void
1963
1964for.body:                                         ; preds = %for.body.preheader21, %for.body
  ; Scalar remainder loop: one element per iteration, same arithmetic as the
  ; vector body.
1965  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1966  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1967  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1968  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1969  %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
1970  %10 = load i8, ptr %pSrcA.addr.011, align 1
1971  %conv = sext i8 %10 to i32
1972  %incdec.ptr1 = getelementptr inbounds i8, ptr %pSrcB.addr.010, i32 1
1973  %11 = load i8, ptr %pSrcB.addr.010, align 1
1974  %conv2 = sext i8 %11 to i32
1975  %mul = mul nsw i32 %conv2, %conv
1976  %shr = ashr i32 %mul, 7
  ; Clamp to [-128, 127]: max with -128, then min with 127.
1977  %12 = icmp sgt i32 %shr, -128
1978  %.val.i = select i1 %12, i32 %shr, i32 -128
1979  %13 = icmp slt i32 %.val.i, 127
1980  %retval.0.i = select i1 %13, i32 %.val.i, i32 127
1981  %conv3 = trunc i32 %retval.0.i to i8
1982  %incdec.ptr4 = getelementptr inbounds i8, ptr %pDst.addr.09, i32 1
1983  store i8 %conv3, ptr %pDst.addr.09, align 1
1984  %inc = add nuw i32 %i.012, 1
1985  %exitcond = icmp eq i32 %inc, %N
1986  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1987}
1988
; ssatmul_8_q7: element-wise saturating multiply of two i8 arrays.
; Each output is trunc(clamp((sext(a) * sext(b)) >> 7, -128, 127)), with the
; arithmetic done in i16. %N == 0 returns immediately and %N < 8 bypasses the
; vector loop. The vector loop consumes 8 elements per iteration (%N & -8 in
; total); the scalar %for.body loop processes whatever remains.
; The autogenerated assertions below expect vmul.i16 followed by
; vqshrnb.s16 #7 in the vector loop and ssat #8 with asr #7 in the scalar loop.
1989define arm_aapcs_vfpcc void @ssatmul_8_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1990; CHECK-LABEL: ssatmul_8_q7:
1991; CHECK:       @ %bb.0: @ %entry
1992; CHECK-NEXT:    .save {r4, r5, r6, lr}
1993; CHECK-NEXT:    push {r4, r5, r6, lr}
1994; CHECK-NEXT:    cbz r3, .LBB14_8
1995; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1996; CHECK-NEXT:    cmp r3, #7
1997; CHECK-NEXT:    bhi .LBB14_3
1998; CHECK-NEXT:  @ %bb.2:
1999; CHECK-NEXT:    movs r5, #0
2000; CHECK-NEXT:    mov r12, r0
2001; CHECK-NEXT:    mov r6, r1
2002; CHECK-NEXT:    mov r4, r2
2003; CHECK-NEXT:    b .LBB14_6
2004; CHECK-NEXT:  .LBB14_3: @ %vector.ph
2005; CHECK-NEXT:    bic r5, r3, #7
2006; CHECK-NEXT:    movs r4, #1
2007; CHECK-NEXT:    sub.w r6, r5, #8
2008; CHECK-NEXT:    add.w r12, r0, r5
2009; CHECK-NEXT:    add.w lr, r4, r6, lsr #3
2010; CHECK-NEXT:    adds r4, r2, r5
2011; CHECK-NEXT:    adds r6, r1, r5
2012; CHECK-NEXT:  .LBB14_4: @ %vector.body
2013; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2014; CHECK-NEXT:    vldrb.s16 q0, [r0], #8
2015; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
2016; CHECK-NEXT:    vmul.i16 q0, q1, q0
2017; CHECK-NEXT:    vqshrnb.s16 q0, q0, #7
2018; CHECK-NEXT:    vstrb.16 q0, [r2], #8
2019; CHECK-NEXT:    le lr, .LBB14_4
2020; CHECK-NEXT:  @ %bb.5: @ %middle.block
2021; CHECK-NEXT:    cmp r5, r3
2022; CHECK-NEXT:    it eq
2023; CHECK-NEXT:    popeq {r4, r5, r6, pc}
2024; CHECK-NEXT:  .LBB14_6: @ %for.body.preheader23
2025; CHECK-NEXT:    sub.w lr, r3, r5
2026; CHECK-NEXT:  .LBB14_7: @ %for.body
2027; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2028; CHECK-NEXT:    ldrsb r0, [r12], #1
2029; CHECK-NEXT:    ldrsb r1, [r6], #1
2030; CHECK-NEXT:    muls r0, r1, r0
2031; CHECK-NEXT:    ssat r0, #8, r0, asr #7
2032; CHECK-NEXT:    strb r0, [r4], #1
2033; CHECK-NEXT:    le lr, .LBB14_7
2034; CHECK-NEXT:  .LBB14_8: @ %for.cond.cleanup
2035; CHECK-NEXT:    pop {r4, r5, r6, pc}
2036entry:
2037  %cmp10 = icmp eq i32 %N, 0
2038  br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
2039
2040for.body.preheader:                               ; preds = %entry
2041  %min.iters.check = icmp ult i32 %N, 8
2042  br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
2043
; Entry to the scalar loop: starts at element 0 when the vector loop was
; skipped, or at %n.vec (with the advanced pointers) when coming from it.
2044for.body.preheader23:                             ; preds = %middle.block, %for.body.preheader
2045  %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
2046  %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
2047  %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
2048  %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
2049  br label %for.body
2050
; %n.vec is %N rounded down to a multiple of 8; the %ind.end* pointers mark
; where the scalar loop resumes.
2051vector.ph:                                        ; preds = %for.body.preheader
2052  %n.vec = and i32 %N, -8
2053  %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec
2054  %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec
2055  %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec
2056  br label %vector.body
2057
; Vector loop: widen both <8 x i8> loads to i16, multiply, shift right by 7,
; clamp to [-128, 127] with two compare/select pairs, then narrow to i8.
2058vector.body:                                      ; preds = %vector.body, %vector.ph
2059  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2060  %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2061  %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2062  %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
2063  %wide.load = load <8 x i8>, ptr %next.gep, align 1
2064  %0 = sext <8 x i8> %wide.load to <8 x i16>
2065  %wide.load22 = load <8 x i8>, ptr %next.gep20, align 1
2066  %1 = sext <8 x i8> %wide.load22 to <8 x i16>
2067  %2 = mul nsw <8 x i16> %1, %0
2068  %3 = ashr <8 x i16> %2, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2069  %4 = icmp sgt <8 x i16> %3, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2070  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2071  %6 = icmp slt <8 x i16> %5, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2072  %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2073  %8 = trunc <8 x i16> %7 to <8 x i8>
2074  store <8 x i8> %8, ptr %next.gep21, align 1
2075  %index.next = add i32 %index, 8
2076  %9 = icmp eq i32 %index.next, %n.vec
2077  br i1 %9, label %middle.block, label %vector.body
2078
; Skip the scalar loop entirely when %N was already a multiple of 8.
2079middle.block:                                     ; preds = %vector.body
2080  %cmp.n = icmp eq i32 %n.vec, %N
2081  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
2082
2083for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
2084  ret void
2085
; Scalar loop: the same widen/multiply/shift/clamp/narrow sequence, one
; element per iteration, until %inc reaches %N.
2086for.body:                                         ; preds = %for.body.preheader23, %for.body
2087  %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
2088  %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
2089  %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
2090  %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
2091  %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1
2092  %10 = load i8, ptr %pSrcA.addr.013, align 1
2093  %conv1 = sext i8 %10 to i16
2094  %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1
2095  %11 = load i8, ptr %pSrcB.addr.012, align 1
2096  %conv3 = sext i8 %11 to i16
2097  %mul = mul nsw i16 %conv3, %conv1
2098  %shr = ashr i16 %mul, 7
2099  %12 = icmp sgt i16 %shr, -128
2100  %.val.i = select i1 %12, i16 %shr, i16 -128
2101  %13 = icmp slt i16 %.val.i, 127
2102  %retval.0.i = select i1 %13, i16 %.val.i, i16 127
2103  %conv5 = trunc i16 %retval.0.i to i8
2104  %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1
2105  store i8 %conv5, ptr %pDst.addr.011, align 1
2106  %inc = add nuw i32 %i.014, 1
2107  %exitcond = icmp eq i32 %inc, %N
2108  br i1 %exitcond, label %for.cond.cleanup, label %for.body
2109}
2110
; ssatmul_16_q7: element-wise saturating multiply of two i8 arrays, using a
; 16-lane vector loop. Each output is
; trunc(clamp((sext(a) * sext(b)) >> 7, -128, 127)) computed in i16.
; %N == 0 returns immediately and %N < 16 bypasses the vector loop; the
; vector loop covers %N & -16 elements and the scalar %for.body loop the rest.
; The autogenerated assertions below expect the <16 x i16> multiply to be
; emitted as vmullt.s8 / vmullb.s8 and the shift+clamp+narrow as
; vqshrnb.s16 / vqshrnt.s16 #7.
2111define arm_aapcs_vfpcc void @ssatmul_16_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
2112; CHECK-LABEL: ssatmul_16_q7:
2113; CHECK:       @ %bb.0: @ %entry
2114; CHECK-NEXT:    .save {r4, r5, r6, lr}
2115; CHECK-NEXT:    push {r4, r5, r6, lr}
2116; CHECK-NEXT:    cbz r3, .LBB15_8
2117; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
2118; CHECK-NEXT:    cmp r3, #15
2119; CHECK-NEXT:    bhi .LBB15_3
2120; CHECK-NEXT:  @ %bb.2:
2121; CHECK-NEXT:    movs r5, #0
2122; CHECK-NEXT:    mov r12, r0
2123; CHECK-NEXT:    mov r6, r1
2124; CHECK-NEXT:    mov r4, r2
2125; CHECK-NEXT:    b .LBB15_6
2126; CHECK-NEXT:  .LBB15_3: @ %vector.ph
2127; CHECK-NEXT:    bic r5, r3, #15
2128; CHECK-NEXT:    movs r4, #1
2129; CHECK-NEXT:    sub.w r6, r5, #16
2130; CHECK-NEXT:    add.w r12, r0, r5
2131; CHECK-NEXT:    add.w lr, r4, r6, lsr #4
2132; CHECK-NEXT:    adds r4, r2, r5
2133; CHECK-NEXT:    adds r6, r1, r5
2134; CHECK-NEXT:  .LBB15_4: @ %vector.body
2135; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2136; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
2137; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
2138; CHECK-NEXT:    vmullt.s8 q2, q1, q0
2139; CHECK-NEXT:    vmullb.s8 q0, q1, q0
2140; CHECK-NEXT:    vqshrnb.s16 q0, q0, #7
2141; CHECK-NEXT:    vqshrnt.s16 q0, q2, #7
2142; CHECK-NEXT:    vstrb.8 q0, [r2], #16
2143; CHECK-NEXT:    le lr, .LBB15_4
2144; CHECK-NEXT:  @ %bb.5: @ %middle.block
2145; CHECK-NEXT:    cmp r5, r3
2146; CHECK-NEXT:    it eq
2147; CHECK-NEXT:    popeq {r4, r5, r6, pc}
2148; CHECK-NEXT:  .LBB15_6: @ %for.body.preheader23
2149; CHECK-NEXT:    sub.w lr, r3, r5
2150; CHECK-NEXT:  .LBB15_7: @ %for.body
2151; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2152; CHECK-NEXT:    ldrsb r0, [r12], #1
2153; CHECK-NEXT:    ldrsb r1, [r6], #1
2154; CHECK-NEXT:    muls r0, r1, r0
2155; CHECK-NEXT:    ssat r0, #8, r0, asr #7
2156; CHECK-NEXT:    strb r0, [r4], #1
2157; CHECK-NEXT:    le lr, .LBB15_7
2158; CHECK-NEXT:  .LBB15_8: @ %for.cond.cleanup
2159; CHECK-NEXT:    pop {r4, r5, r6, pc}
2160entry:
2161  %cmp10 = icmp eq i32 %N, 0
2162  br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
2163
2164for.body.preheader:                               ; preds = %entry
2165  %min.iters.check = icmp ult i32 %N, 16
2166  br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
2167
; Entry to the scalar loop: starts at element 0 when the vector loop was
; skipped, or at %n.vec (with the advanced pointers) when coming from it.
2168for.body.preheader23:                             ; preds = %middle.block, %for.body.preheader
2169  %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
2170  %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
2171  %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
2172  %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
2173  br label %for.body
2174
; %n.vec is %N rounded down to a multiple of 16; the %ind.end* pointers mark
; where the scalar loop resumes.
2175vector.ph:                                        ; preds = %for.body.preheader
2176  %n.vec = and i32 %N, -16
2177  %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec
2178  %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec
2179  %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec
2180  br label %vector.body
2181
; Vector loop: widen both <16 x i8> loads to <16 x i16>, multiply, shift right
; by 7, clamp to [-128, 127] with two compare/select pairs, then narrow to i8.
2182vector.body:                                      ; preds = %vector.body, %vector.ph
2183  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2184  %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2185  %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2186  %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
2187  %wide.load = load <16 x i8>, ptr %next.gep, align 1
2188  %0 = sext <16 x i8> %wide.load to <16 x i16>
2189  %wide.load22 = load <16 x i8>, ptr %next.gep20, align 1
2190  %1 = sext <16 x i8> %wide.load22 to <16 x i16>
2191  %2 = mul nsw <16 x i16> %1, %0
2192  %3 = ashr <16 x i16> %2, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2193  %4 = icmp sgt <16 x i16> %3, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2194  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2195  %6 = icmp slt <16 x i16> %5, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2196  %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2197  %8 = trunc <16 x i16> %7 to <16 x i8>
2198  store <16 x i8> %8, ptr %next.gep21, align 1
2199  %index.next = add i32 %index, 16
2200  %9 = icmp eq i32 %index.next, %n.vec
2201  br i1 %9, label %middle.block, label %vector.body
2202
; Skip the scalar loop entirely when %N was already a multiple of 16.
2203middle.block:                                     ; preds = %vector.body
2204  %cmp.n = icmp eq i32 %n.vec, %N
2205  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
2206
2207for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
2208  ret void
2209
; Scalar loop: the same widen/multiply/shift/clamp/narrow sequence, one
; element per iteration, until %inc reaches %N.
2210for.body:                                         ; preds = %for.body.preheader23, %for.body
2211  %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
2212  %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
2213  %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
2214  %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
2215  %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1
2216  %10 = load i8, ptr %pSrcA.addr.013, align 1
2217  %conv1 = sext i8 %10 to i16
2218  %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1
2219  %11 = load i8, ptr %pSrcB.addr.012, align 1
2220  %conv3 = sext i8 %11 to i16
2221  %mul = mul nsw i16 %conv3, %conv1
2222  %shr = ashr i16 %mul, 7
2223  %12 = icmp sgt i16 %shr, -128
2224  %.val.i = select i1 %12, i16 %shr, i16 -128
2225  %13 = icmp slt i16 %.val.i, 127
2226  %retval.0.i = select i1 %13, i16 %.val.i, i16 127
2227  %conv5 = trunc i16 %retval.0.i to i8
2228  %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1
2229  store i8 %conv5, ptr %pDst.addr.011, align 1
2230  %inc = add nuw i32 %i.014, 1
2231  %exitcond = icmp eq i32 %inc, %N
2232  br i1 %exitcond, label %for.cond.cleanup, label %for.body
2233}
2234
; ssatmul_16i_q7: interleaved variant of the 16-lane saturating i8 multiply.
; The vector loop splits each <16 x i8> load into its even and odd lanes with
; shufflevector, runs the widen/multiply/shift-by-7/clamp sequence on each
; <8 x i16> half, and re-interleaves the two halves before truncating to i8.
; The vector multiplies here carry no nsw flag; the scalar remainder loop
; does. %N == 0 returns immediately and %N < 16 bypasses the vector loop.
; The autogenerated assertions below expect the vector loop to be emitted as
; vmullt.s8 / vmullb.s8 followed by vqshrnb.s16 / vqshrnt.s16 #7.
2235define arm_aapcs_vfpcc void @ssatmul_16i_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
2236; CHECK-LABEL: ssatmul_16i_q7:
2237; CHECK:       @ %bb.0: @ %entry
2238; CHECK-NEXT:    .save {r4, r5, r6, lr}
2239; CHECK-NEXT:    push {r4, r5, r6, lr}
2240; CHECK-NEXT:    cbz r3, .LBB16_8
2241; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
2242; CHECK-NEXT:    cmp r3, #15
2243; CHECK-NEXT:    bhi .LBB16_3
2244; CHECK-NEXT:  @ %bb.2:
2245; CHECK-NEXT:    movs r5, #0
2246; CHECK-NEXT:    mov r12, r0
2247; CHECK-NEXT:    mov r6, r1
2248; CHECK-NEXT:    mov r4, r2
2249; CHECK-NEXT:    b .LBB16_6
2250; CHECK-NEXT:  .LBB16_3: @ %vector.ph
2251; CHECK-NEXT:    bic r5, r3, #15
2252; CHECK-NEXT:    movs r4, #1
2253; CHECK-NEXT:    sub.w r6, r5, #16
2254; CHECK-NEXT:    add.w r12, r0, r5
2255; CHECK-NEXT:    add.w lr, r4, r6, lsr #4
2256; CHECK-NEXT:    adds r4, r2, r5
2257; CHECK-NEXT:    adds r6, r1, r5
2258; CHECK-NEXT:  .LBB16_4: @ %vector.body
2259; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2260; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
2261; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
2262; CHECK-NEXT:    vmullt.s8 q2, q1, q0
2263; CHECK-NEXT:    vmullb.s8 q0, q1, q0
2264; CHECK-NEXT:    vqshrnb.s16 q0, q0, #7
2265; CHECK-NEXT:    vqshrnt.s16 q0, q2, #7
2266; CHECK-NEXT:    vstrb.8 q0, [r2], #16
2267; CHECK-NEXT:    le lr, .LBB16_4
2268; CHECK-NEXT:  @ %bb.5: @ %middle.block
2269; CHECK-NEXT:    cmp r5, r3
2270; CHECK-NEXT:    it eq
2271; CHECK-NEXT:    popeq {r4, r5, r6, pc}
2272; CHECK-NEXT:  .LBB16_6: @ %for.body.preheader23
2273; CHECK-NEXT:    sub.w lr, r3, r5
2274; CHECK-NEXT:  .LBB16_7: @ %for.body
2275; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2276; CHECK-NEXT:    ldrsb r0, [r12], #1
2277; CHECK-NEXT:    ldrsb r1, [r6], #1
2278; CHECK-NEXT:    muls r0, r1, r0
2279; CHECK-NEXT:    ssat r0, #8, r0, asr #7
2280; CHECK-NEXT:    strb r0, [r4], #1
2281; CHECK-NEXT:    le lr, .LBB16_7
2282; CHECK-NEXT:  .LBB16_8: @ %for.cond.cleanup
2283; CHECK-NEXT:    pop {r4, r5, r6, pc}
2284entry:
2285  %cmp10 = icmp eq i32 %N, 0
2286  br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
2287
2288for.body.preheader:                               ; preds = %entry
2289  %min.iters.check = icmp ult i32 %N, 16
2290  br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
2291
; Entry to the scalar loop: starts at element 0 when the vector loop was
; skipped, or at %n.vec (with the advanced pointers) when coming from it.
2292for.body.preheader23:                             ; preds = %middle.block, %for.body.preheader
2293  %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
2294  %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
2295  %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
2296  %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
2297  br label %for.body
2298
; %n.vec is %N rounded down to a multiple of 16; the %ind.end* pointers mark
; where the scalar loop resumes.
2299vector.ph:                                        ; preds = %for.body.preheader
2300  %n.vec = and i32 %N, -16
2301  %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec
2302  %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec
2303  %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec
2304  br label %vector.body
2305
; Vector loop. Even-lane values (%0, %4) and odd-lane values (%1, %5) are
; processed as two independent <8 x i16> chains; the final shufflevector
; (%20) puts lane i of the even chain at 2*i and of the odd chain at 2*i+1.
2306vector.body:                                      ; preds = %vector.body, %vector.ph
2307  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2308  %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2309  %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2310  %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
2311  %wide.load = load <16 x i8>, ptr %next.gep, align 1
2312  %0 = shufflevector <16 x i8> %wide.load, <16 x i8> %wide.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
2313  %1 = shufflevector <16 x i8> %wide.load, <16 x i8> %wide.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2314  %2 = sext <8 x i8> %0 to <8 x i16>
2315  %3 = sext <8 x i8> %1 to <8 x i16>
2316  %wide.load22 = load <16 x i8>, ptr %next.gep20, align 1
2317  %4 = shufflevector <16 x i8> %wide.load22, <16 x i8> %wide.load22, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
2318  %5 = shufflevector <16 x i8> %wide.load22, <16 x i8> %wide.load22, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2319  %6 = sext <8 x i8> %4 to <8 x i16>
2320  %7 = sext <8 x i8> %5 to <8 x i16>
2321  %8 = mul <8 x i16> %6, %2
2322  %9 = mul <8 x i16> %7, %3
2323  %10 = ashr <8 x i16> %8, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2324  %11 = ashr <8 x i16> %9, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2325  %12 = icmp sgt <8 x i16> %10, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2326  %13 = icmp sgt <8 x i16> %11, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2327  %14 = select <8 x i1> %12, <8 x i16> %10, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2328  %15 = select <8 x i1> %13, <8 x i16> %11, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2329  %16 = icmp slt <8 x i16> %14, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2330  %17 = icmp slt <8 x i16> %15, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2331  %18 = select <8 x i1> %16, <8 x i16> %14, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2332  %19 = select <8 x i1> %17, <8 x i16> %15, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2333  %20 = shufflevector <8 x i16> %18, <8 x i16> %19, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
2334  %21 = trunc <16 x i16> %20 to <16 x i8>
2335  store <16 x i8> %21, ptr %next.gep21, align 1
2336  %index.next = add i32 %index, 16
2337  %22 = icmp eq i32 %index.next, %n.vec
2338  br i1 %22, label %middle.block, label %vector.body
2339
; Skip the scalar loop entirely when %N was already a multiple of 16.
2340middle.block:                                     ; preds = %vector.body
2341  %cmp.n = icmp eq i32 %n.vec, %N
2342  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
2343
2344for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
2345  ret void
2346
; Scalar loop: widen, multiply, shift right by 7, clamp to [-128, 127] and
; narrow, one element per iteration, until %inc reaches %N.
2347for.body:                                         ; preds = %for.body, %for.body.preheader23
2348  %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
2349  %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
2350  %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
2351  %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
2352  %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1
2353  %23 = load i8, ptr %pSrcA.addr.013, align 1
2354  %conv1 = sext i8 %23 to i16
2355  %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1
2356  %24 = load i8, ptr %pSrcB.addr.012, align 1
2357  %conv3 = sext i8 %24 to i16
2358  %mul = mul nsw i16 %conv3, %conv1
2359  %shr = ashr i16 %mul, 7
2360  %25 = icmp sgt i16 %shr, -128
2361  %.val.i = select i1 %25, i16 %shr, i16 -128
2362  %26 = icmp slt i16 %.val.i, 127
2363  %retval.0.i = select i1 %26, i16 %.val.i, i16 127
2364  %conv5 = trunc i16 %retval.0.i to i8
2365  %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1
2366  store i8 %conv5, ptr %pDst.addr.011, align 1
2367  %inc = add nuw i32 %i.014, 1
2368  %exitcond = icmp eq i32 %inc, %N
2369  br i1 %exitcond, label %for.cond.cleanup, label %for.body
2370}
2371
; ssatmul_8t_q7: masked (tail-folded) 8-lane saturating i8 multiply.
; There is no scalar remainder loop. The trip count is %N rounded up to a
; multiple of 8, and every iteration uses llvm.masked.load / llvm.masked.store
; with a per-lane mask that is true where (%index | lane) <= %N - 1.
; Each active lane computes trunc(clamp((sext(a) * sext(b)) >> 7, -128, 127))
; in i16. %N == 0 returns immediately.
; The autogenerated assertions below expect the <8 x i1> mask to be built from
; two vcmp.u32 / vpsel results stored to the stack and reloaded, followed by
; vptt-predicated loads, vmul.i16, vqshrnb.s16 #7 and a vpst-predicated store.
2372define arm_aapcs_vfpcc void @ssatmul_8t_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
2373; CHECK-LABEL: ssatmul_8t_q7:
2374; CHECK:       @ %bb.0: @ %entry
2375; CHECK-NEXT:    .save {r4, r5, r7, lr}
2376; CHECK-NEXT:    push {r4, r5, r7, lr}
2377; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
2378; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
2379; CHECK-NEXT:    .pad #16
2380; CHECK-NEXT:    sub sp, #16
2381; CHECK-NEXT:    cmp r3, #0
2382; CHECK-NEXT:    beq .LBB17_3
2383; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2384; CHECK-NEXT:    adds r4, r3, #7
2385; CHECK-NEXT:    vmov.i8 q2, #0x0
2386; CHECK-NEXT:    bic r4, r4, #7
2387; CHECK-NEXT:    vmov.i8 q3, #0xff
2388; CHECK-NEXT:    sub.w r12, r4, #8
2389; CHECK-NEXT:    movs r4, #1
2390; CHECK-NEXT:    mov r5, sp
2391; CHECK-NEXT:    add.w lr, r4, r12, lsr #3
2392; CHECK-NEXT:    adr r4, .LCPI17_0
2393; CHECK-NEXT:    vldrw.u32 q0, [r4]
2394; CHECK-NEXT:    adr r4, .LCPI17_1
2395; CHECK-NEXT:    sub.w r12, r3, #1
2396; CHECK-NEXT:    vldrw.u32 q4, [r4]
2397; CHECK-NEXT:    movs r3, #0
2398; CHECK-NEXT:    vdup.32 q1, r12
2399; CHECK-NEXT:  .LBB17_2: @ %vector.body
2400; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2401; CHECK-NEXT:    vdup.32 q5, r3
2402; CHECK-NEXT:    adds r3, #8
2403; CHECK-NEXT:    vorr q6, q5, q0
2404; CHECK-NEXT:    vorr q5, q5, q4
2405; CHECK-NEXT:    vcmp.u32 cs, q1, q6
2406; CHECK-NEXT:    vpsel q6, q3, q2
2407; CHECK-NEXT:    vcmp.u32 cs, q1, q5
2408; CHECK-NEXT:    vpsel q5, q3, q2
2409; CHECK-NEXT:    vstrh.32 q6, [r5, #8]
2410; CHECK-NEXT:    vstrh.32 q5, [r5]
2411; CHECK-NEXT:    vldrw.u32 q5, [r5]
2412; CHECK-NEXT:    vptt.i16 ne, q5, zr
2413; CHECK-NEXT:    vldrbt.s16 q5, [r0], #8
2414; CHECK-NEXT:    vldrbt.s16 q6, [r1], #8
2415; CHECK-NEXT:    vmul.i16 q5, q6, q5
2416; CHECK-NEXT:    vqshrnb.s16 q5, q5, #7
2417; CHECK-NEXT:    vpst
2418; CHECK-NEXT:    vstrbt.16 q5, [r2], #8
2419; CHECK-NEXT:    le lr, .LBB17_2
2420; CHECK-NEXT:  .LBB17_3: @ %for.cond.cleanup
2421; CHECK-NEXT:    add sp, #16
2422; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
2423; CHECK-NEXT:    pop {r4, r5, r7, pc}
2424; CHECK-NEXT:    .p2align 4
2425; CHECK-NEXT:  @ %bb.4:
2426; CHECK-NEXT:  .LCPI17_0:
2427; CHECK-NEXT:    .long 4 @ 0x4
2428; CHECK-NEXT:    .long 5 @ 0x5
2429; CHECK-NEXT:    .long 6 @ 0x6
2430; CHECK-NEXT:    .long 7 @ 0x7
2431; CHECK-NEXT:  .LCPI17_1:
2432; CHECK-NEXT:    .long 0 @ 0x0
2433; CHECK-NEXT:    .long 1 @ 0x1
2434; CHECK-NEXT:    .long 2 @ 0x2
2435; CHECK-NEXT:    .long 3 @ 0x3
2436entry:
2437  %cmp10 = icmp eq i32 %N, 0
2438  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
2439
; %n.vec = (%N + 7) & -8 is the loop bound; %broadcast.splat23 holds %N - 1 in
; every lane and is the upper bound the lane indices are compared against.
2440vector.ph:                                        ; preds = %entry
2441  %n.rnd.up = add i32 %N, 7
2442  %n.vec = and i32 %n.rnd.up, -8
2443  %trip.count.minus.1 = add i32 %N, -1
2444  %broadcast.splatinsert22 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
2445  %broadcast.splat23 = shufflevector <8 x i32> %broadcast.splatinsert22, <8 x i32> undef, <8 x i32> zeroinitializer
2446  br label %vector.body
2447
; Masked vector loop: %induction is the splat of %index OR'd with <0..7>, and
; mask %0 (%induction ule %N - 1) guards both loads and the store.
2448vector.body:                                      ; preds = %vector.body, %vector.ph
2449  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2450  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
2451  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
2452  %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2453  %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2454  %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2455  %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
2456  %0 = icmp ule <8 x i32> %induction, %broadcast.splat23
2457  %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %next.gep, i32 1, <8 x i1> %0, <8 x i8> undef)
2458  %1 = sext <8 x i8> %wide.masked.load to <8 x i16>
2459  %wide.masked.load24 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %next.gep20, i32 1, <8 x i1> %0, <8 x i8> undef)
2460  %2 = sext <8 x i8> %wide.masked.load24 to <8 x i16>
2461  %3 = mul nsw <8 x i16> %2, %1
2462  %4 = ashr <8 x i16> %3, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2463  %5 = icmp sgt <8 x i16> %4, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2464  %6 = select <8 x i1> %5, <8 x i16> %4, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2465  %7 = icmp slt <8 x i16> %6, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2466  %8 = select <8 x i1> %7, <8 x i16> %6, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2467  %9 = trunc <8 x i16> %8 to <8 x i8>
2468  call void @llvm.masked.store.v8i8.p0(<8 x i8> %9, ptr %next.gep21, i32 1, <8 x i1> %0)
2469  %index.next = add i32 %index, 8
2470  %10 = icmp eq i32 %index.next, %n.vec
2471  br i1 %10, label %for.cond.cleanup, label %vector.body
2472
2473for.cond.cleanup:                                 ; preds = %vector.body, %entry
2474  ret void
2475}
2476
; ssatmul_16t_q7: masked (tail-folded) 16-lane saturating i8 multiply.
; There is no scalar remainder loop. The trip count is %N rounded up to a
; multiple of 16, and every iteration uses llvm.masked.load /
; llvm.masked.store with a per-lane mask that is true where
; (%index | lane) <= %N - 1. Each active lane computes
; trunc(clamp((sext(a) * sext(b)) >> 7, -128, 127)) in i16.
; %N == 0 returns immediately.
; The autogenerated assertions below expect the <16 x i1> mask to be assembled
; from four vcmp.u32 / vpsel results that are narrowed through stack slots
; (vstrh.32, then vstrb.16) and reloaded, followed by vptt-predicated loads,
; vmullt.s8 / vmullb.s8, vqshrnb.s16 / vqshrnt.s16 #7 and a vpst-predicated
; store. Two of the lane-offset constants are spilled to the stack and
; reloaded inside the loop.
2477define arm_aapcs_vfpcc void @ssatmul_16t_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
2478; CHECK-LABEL: ssatmul_16t_q7:
2479; CHECK:       @ %bb.0: @ %entry
2480; CHECK-NEXT:    .save {r4, r5, r6, lr}
2481; CHECK-NEXT:    push {r4, r5, r6, lr}
2482; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2483; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2484; CHECK-NEXT:    .pad #80
2485; CHECK-NEXT:    sub sp, #80
2486; CHECK-NEXT:    cmp r3, #0
2487; CHECK-NEXT:    beq .LBB18_3
2488; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2489; CHECK-NEXT:    add.w r6, r3, #15
2490; CHECK-NEXT:    movs r5, #1
2491; CHECK-NEXT:    bic r6, r6, #15
2492; CHECK-NEXT:    add r4, sp, #48
2493; CHECK-NEXT:    subs r6, #16
2494; CHECK-NEXT:    vmov.i8 q2, #0x0
2495; CHECK-NEXT:    vmov.i8 q3, #0xff
2496; CHECK-NEXT:    add.w lr, r5, r6, lsr #4
2497; CHECK-NEXT:    adr r5, .LCPI18_0
2498; CHECK-NEXT:    subs r6, r3, #1
2499; CHECK-NEXT:    vldrw.u32 q0, [r5]
2500; CHECK-NEXT:    vdup.32 q1, r6
2501; CHECK-NEXT:    adr r6, .LCPI18_1
2502; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
2503; CHECK-NEXT:    vldrw.u32 q0, [r6]
2504; CHECK-NEXT:    adr r6, .LCPI18_2
2505; CHECK-NEXT:    vldrw.u32 q5, [r6]
2506; CHECK-NEXT:    adr r6, .LCPI18_3
2507; CHECK-NEXT:    vldrw.u32 q6, [r6]
2508; CHECK-NEXT:    add r5, sp, #32
2509; CHECK-NEXT:    add r6, sp, #64
2510; CHECK-NEXT:    movs r3, #0
2511; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
2512; CHECK-NEXT:  .LBB18_2: @ %vector.body
2513; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2514; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
2515; CHECK-NEXT:    vdup.32 q7, r3
2516; CHECK-NEXT:    adds r3, #16
2517; CHECK-NEXT:    vorr q0, q7, q0
2518; CHECK-NEXT:    vcmp.u32 cs, q1, q0
2519; CHECK-NEXT:    vpsel q0, q3, q2
2520; CHECK-NEXT:    vstrh.32 q0, [r4, #8]
2521; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
2522; CHECK-NEXT:    vorr q0, q7, q0
2523; CHECK-NEXT:    vcmp.u32 cs, q1, q0
2524; CHECK-NEXT:    vpsel q0, q3, q2
2525; CHECK-NEXT:    vstrh.32 q0, [r4]
2526; CHECK-NEXT:    vorr q0, q7, q5
2527; CHECK-NEXT:    vcmp.u32 cs, q1, q0
2528; CHECK-NEXT:    vpsel q0, q3, q2
2529; CHECK-NEXT:    vstrh.32 q0, [r5, #8]
2530; CHECK-NEXT:    vorr q0, q7, q6
2531; CHECK-NEXT:    vcmp.u32 cs, q1, q0
2532; CHECK-NEXT:    vpsel q0, q3, q2
2533; CHECK-NEXT:    vstrh.32 q0, [r5]
2534; CHECK-NEXT:    vldrw.u32 q0, [r4]
2535; CHECK-NEXT:    vcmp.i16 ne, q0, zr
2536; CHECK-NEXT:    vpsel q0, q3, q2
2537; CHECK-NEXT:    vstrb.16 q0, [r6, #8]
2538; CHECK-NEXT:    vldrw.u32 q0, [r5]
2539; CHECK-NEXT:    vcmp.i16 ne, q0, zr
2540; CHECK-NEXT:    vpsel q0, q3, q2
2541; CHECK-NEXT:    vstrb.16 q0, [r6]
2542; CHECK-NEXT:    vldrw.u32 q0, [r6]
2543; CHECK-NEXT:    vptt.i8 ne, q0, zr
2544; CHECK-NEXT:    vldrbt.u8 q0, [r0], #16
2545; CHECK-NEXT:    vldrbt.u8 q7, [r1], #16
2546; CHECK-NEXT:    vmullt.s8 q4, q7, q0
2547; CHECK-NEXT:    vmullb.s8 q0, q7, q0
2548; CHECK-NEXT:    vqshrnb.s16 q0, q0, #7
2549; CHECK-NEXT:    vqshrnt.s16 q0, q4, #7
2550; CHECK-NEXT:    vpst
2551; CHECK-NEXT:    vstrbt.8 q0, [r2], #16
2552; CHECK-NEXT:    le lr, .LBB18_2
2553; CHECK-NEXT:  .LBB18_3: @ %for.cond.cleanup
2554; CHECK-NEXT:    add sp, #80
2555; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2556; CHECK-NEXT:    pop {r4, r5, r6, pc}
2557; CHECK-NEXT:    .p2align 4
2558; CHECK-NEXT:  @ %bb.4:
2559; CHECK-NEXT:  .LCPI18_0:
2560; CHECK-NEXT:    .long 12 @ 0xc
2561; CHECK-NEXT:    .long 13 @ 0xd
2562; CHECK-NEXT:    .long 14 @ 0xe
2563; CHECK-NEXT:    .long 15 @ 0xf
2564; CHECK-NEXT:  .LCPI18_1:
2565; CHECK-NEXT:    .long 8 @ 0x8
2566; CHECK-NEXT:    .long 9 @ 0x9
2567; CHECK-NEXT:    .long 10 @ 0xa
2568; CHECK-NEXT:    .long 11 @ 0xb
2569; CHECK-NEXT:  .LCPI18_2:
2570; CHECK-NEXT:    .long 4 @ 0x4
2571; CHECK-NEXT:    .long 5 @ 0x5
2572; CHECK-NEXT:    .long 6 @ 0x6
2573; CHECK-NEXT:    .long 7 @ 0x7
2574; CHECK-NEXT:  .LCPI18_3:
2575; CHECK-NEXT:    .long 0 @ 0x0
2576; CHECK-NEXT:    .long 1 @ 0x1
2577; CHECK-NEXT:    .long 2 @ 0x2
2578; CHECK-NEXT:    .long 3 @ 0x3
2579entry:
2580  %cmp10 = icmp eq i32 %N, 0
2581  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
2582
; %n.vec = (%N + 15) & -16 is the loop bound; %broadcast.splat23 holds %N - 1
; in every lane and is the upper bound the lane indices are compared against.
2583vector.ph:                                        ; preds = %entry
2584  %n.rnd.up = add i32 %N, 15
2585  %n.vec = and i32 %n.rnd.up, -16
2586  %trip.count.minus.1 = add i32 %N, -1
2587  %broadcast.splatinsert22 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
2588  %broadcast.splat23 = shufflevector <16 x i32> %broadcast.splatinsert22, <16 x i32> undef, <16 x i32> zeroinitializer
2589  br label %vector.body
2590
; Masked vector loop: %induction is the splat of %index OR'd with <0..15>, and
; mask %0 (%induction ule %N - 1) guards both loads and the store.
2591vector.body:                                      ; preds = %vector.body, %vector.ph
2592  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2593  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
2594  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
2595  %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2596  %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2597  %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2598  %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
2599  %0 = icmp ule <16 x i32> %induction, %broadcast.splat23
2600  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %next.gep, i32 1, <16 x i1> %0, <16 x i8> undef)
2601  %1 = sext <16 x i8> %wide.masked.load to <16 x i16>
2602  %wide.masked.load24 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %next.gep20, i32 1, <16 x i1> %0, <16 x i8> undef)
2603  %2 = sext <16 x i8> %wide.masked.load24 to <16 x i16>
2604  %3 = mul nsw <16 x i16> %2, %1
2605  %4 = ashr <16 x i16> %3, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2606  %5 = icmp sgt <16 x i16> %4, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2607  %6 = select <16 x i1> %5, <16 x i16> %4, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2608  %7 = icmp slt <16 x i16> %6, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2609  %8 = select <16 x i1> %7, <16 x i16> %6, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2610  %9 = trunc <16 x i16> %8 to <16 x i8>
2611  call void @llvm.masked.store.v16i8.p0(<16 x i8> %9, ptr %next.gep21, i32 1, <16 x i1> %0)
2612  %index.next = add i32 %index, 16
2613  %10 = icmp eq i32 %index.next, %n.vec
2614  br i1 %10, label %for.cond.cleanup, label %vector.body
2615
2616for.cond.cleanup:                                 ; preds = %vector.body, %entry
2617  ret void
2618}
2619
; ssatmul_16ti_q7: signed saturating multiply of i8 (q7) elements, 16 lanes
; per loop iteration, with masked (tail-predicated) loads and stores and an
; explicit even/odd lane split.
;
; Per iteration the IR:
;   * masked-loads 16 x i8 from %pSrcA and %pSrcB at offset %index,
;   * splits each vector into even and odd lanes and sign-extends to i16,
;   * multiplies, arithmetic-shifts right by 7 and clamps to [-128, 127],
;   * re-interleaves the two halves, truncates to i8 and masked-stores to %pDst.
; The lane mask is (%index + lane) <= %N - 1, so there is no scalar remainder
; loop; the function returns immediately when %N == 0.
;
; The assertions below expect the multiply/shift/clamp to lower to
; vmullb.s8 / vmullt.s8 followed by vqshrnb.s16 / vqshrnt.s16 #7, with the
; loads and the store predicated (vldrbt.u8 / vstrbt.8). They are autogenerated
; (see the NOTE at the top of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2620define arm_aapcs_vfpcc void @ssatmul_16ti_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
2621; CHECK-LABEL: ssatmul_16ti_q7:
2622; CHECK:       @ %bb.0: @ %entry
2623; CHECK-NEXT:    .save {r4, r5, r6, lr}
2624; CHECK-NEXT:    push {r4, r5, r6, lr}
2625; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2626; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2627; CHECK-NEXT:    .pad #80
2628; CHECK-NEXT:    sub sp, #80
2629; CHECK-NEXT:    cmp r3, #0
2630; CHECK-NEXT:    beq .LBB19_3
2631; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2632; CHECK-NEXT:    add.w r6, r3, #15
2633; CHECK-NEXT:    movs r5, #1
2634; CHECK-NEXT:    bic r6, r6, #15
2635; CHECK-NEXT:    add r4, sp, #48
2636; CHECK-NEXT:    subs r6, #16
2637; CHECK-NEXT:    vmov.i8 q2, #0x0
2638; CHECK-NEXT:    vmov.i8 q3, #0xff
2639; CHECK-NEXT:    add.w lr, r5, r6, lsr #4
2640; CHECK-NEXT:    adr r5, .LCPI19_0
2641; CHECK-NEXT:    subs r6, r3, #1
2642; CHECK-NEXT:    vldrw.u32 q0, [r5]
2643; CHECK-NEXT:    vdup.32 q1, r6
2644; CHECK-NEXT:    adr r6, .LCPI19_1
2645; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
2646; CHECK-NEXT:    vldrw.u32 q0, [r6]
2647; CHECK-NEXT:    adr r6, .LCPI19_2
2648; CHECK-NEXT:    vldrw.u32 q5, [r6]
2649; CHECK-NEXT:    adr r6, .LCPI19_3
2650; CHECK-NEXT:    vldrw.u32 q6, [r6]
2651; CHECK-NEXT:    add r5, sp, #32
2652; CHECK-NEXT:    add r6, sp, #64
2653; CHECK-NEXT:    movs r3, #0
2654; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
2655; CHECK-NEXT:  .LBB19_2: @ %vector.body
2656; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2657; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
2658; CHECK-NEXT:    vdup.32 q7, r3
2659; CHECK-NEXT:    adds r3, #16
2660; CHECK-NEXT:    vorr q0, q7, q0
2661; CHECK-NEXT:    vcmp.u32 cs, q1, q0
2662; CHECK-NEXT:    vpsel q0, q3, q2
2663; CHECK-NEXT:    vstrh.32 q0, [r4, #8]
2664; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
2665; CHECK-NEXT:    vorr q0, q7, q0
2666; CHECK-NEXT:    vcmp.u32 cs, q1, q0
2667; CHECK-NEXT:    vpsel q0, q3, q2
2668; CHECK-NEXT:    vstrh.32 q0, [r4]
2669; CHECK-NEXT:    vorr q0, q7, q5
2670; CHECK-NEXT:    vcmp.u32 cs, q1, q0
2671; CHECK-NEXT:    vpsel q0, q3, q2
2672; CHECK-NEXT:    vstrh.32 q0, [r5, #8]
2673; CHECK-NEXT:    vorr q0, q7, q6
2674; CHECK-NEXT:    vcmp.u32 cs, q1, q0
2675; CHECK-NEXT:    vpsel q0, q3, q2
2676; CHECK-NEXT:    vstrh.32 q0, [r5]
2677; CHECK-NEXT:    vldrw.u32 q0, [r4]
2678; CHECK-NEXT:    vcmp.i16 ne, q0, zr
2679; CHECK-NEXT:    vpsel q0, q3, q2
2680; CHECK-NEXT:    vstrb.16 q0, [r6, #8]
2681; CHECK-NEXT:    vldrw.u32 q0, [r5]
2682; CHECK-NEXT:    vcmp.i16 ne, q0, zr
2683; CHECK-NEXT:    vpsel q0, q3, q2
2684; CHECK-NEXT:    vstrb.16 q0, [r6]
2685; CHECK-NEXT:    vldrw.u32 q0, [r6]
2686; CHECK-NEXT:    vptt.i8 ne, q0, zr
2687; CHECK-NEXT:    vldrbt.u8 q0, [r0], #16
2688; CHECK-NEXT:    vldrbt.u8 q7, [r1], #16
2689; CHECK-NEXT:    vmullt.s8 q4, q7, q0
2690; CHECK-NEXT:    vmullb.s8 q0, q7, q0
2691; CHECK-NEXT:    vqshrnb.s16 q0, q0, #7
2692; CHECK-NEXT:    vqshrnt.s16 q0, q4, #7
2693; CHECK-NEXT:    vpst
2694; CHECK-NEXT:    vstrbt.8 q0, [r2], #16
2695; CHECK-NEXT:    le lr, .LBB19_2
2696; CHECK-NEXT:  .LBB19_3: @ %for.cond.cleanup
2697; CHECK-NEXT:    add sp, #80
2698; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2699; CHECK-NEXT:    pop {r4, r5, r6, pc}
2700; CHECK-NEXT:    .p2align 4
2701; CHECK-NEXT:  @ %bb.4:
2702; CHECK-NEXT:  .LCPI19_0:
2703; CHECK-NEXT:    .long 12 @ 0xc
2704; CHECK-NEXT:    .long 13 @ 0xd
2705; CHECK-NEXT:    .long 14 @ 0xe
2706; CHECK-NEXT:    .long 15 @ 0xf
2707; CHECK-NEXT:  .LCPI19_1:
2708; CHECK-NEXT:    .long 8 @ 0x8
2709; CHECK-NEXT:    .long 9 @ 0x9
2710; CHECK-NEXT:    .long 10 @ 0xa
2711; CHECK-NEXT:    .long 11 @ 0xb
2712; CHECK-NEXT:  .LCPI19_2:
2713; CHECK-NEXT:    .long 4 @ 0x4
2714; CHECK-NEXT:    .long 5 @ 0x5
2715; CHECK-NEXT:    .long 6 @ 0x6
2716; CHECK-NEXT:    .long 7 @ 0x7
2717; CHECK-NEXT:  .LCPI19_3:
2718; CHECK-NEXT:    .long 0 @ 0x0
2719; CHECK-NEXT:    .long 1 @ 0x1
2720; CHECK-NEXT:    .long 2 @ 0x2
2721; CHECK-NEXT:    .long 3 @ 0x3
2722entry:
  ; Nothing to do for an empty input.
2723  %cmp10 = icmp eq i32 %N, 0
2724  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
2725
2726vector.ph:                                        ; preds = %entry
  ; %n.vec = %N rounded up to a multiple of 16 (loop bound);
  ; %broadcast.splat23 = splat(%N - 1), the last valid element index.
2727  %n.rnd.up = add i32 %N, 15
2728  %n.vec = and i32 %n.rnd.up, -16
2729  %trip.count.minus.1 = add i32 %N, -1
2730  %broadcast.splatinsert22 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
2731  %broadcast.splat23 = shufflevector <16 x i32> %broadcast.splatinsert22, <16 x i32> undef, <16 x i32> zeroinitializer
2732  br label %vector.body
2733
2734vector.body:                                      ; preds = %vector.body, %vector.ph
2735  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; %induction = per-lane element index: splat(%index) | <0..15>.
2736  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
2737  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
2738  %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2739  %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2740  %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2741  %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
  ; %0 = active-lane mask: lane index <= %N - 1 (unsigned).
2742  %0 = icmp ule <16 x i32> %induction, %broadcast.splat23
  ; Load A, then split into even lanes (%1) and odd lanes (%2) and sign-extend.
2743  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %next.gep, i32 1, <16 x i1> %0, <16 x i8> undef)
2744  %1 = shufflevector <16 x i8> %wide.masked.load, <16 x i8> %wide.masked.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
2745  %2 = shufflevector <16 x i8> %wide.masked.load, <16 x i8> %wide.masked.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2746  %3 = sext <8 x i8> %1 to <8 x i16>
2747  %4 = sext <8 x i8> %2 to <8 x i16>
  ; Same even/odd split and sign-extension for B.
2748  %wide.masked.load24 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %next.gep20, i32 1, <16 x i1> %0, <16 x i8> undef)
2749  %5 = shufflevector <16 x i8> %wide.masked.load24, <16 x i8> %wide.masked.load24, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
2750  %6 = shufflevector <16 x i8> %wide.masked.load24, <16 x i8> %wide.masked.load24, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2751  %7 = sext <8 x i8> %5 to <8 x i16>
2752  %8 = sext <8 x i8> %6 to <8 x i16>
  ; i16 products of the even (%9) and odd (%10) lanes, then >> 7 (arithmetic).
2753  %9 = mul <8 x i16> %7, %3
2754  %10 = mul <8 x i16> %8, %4
2755  %11 = ashr <8 x i16> %9, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2756  %12 = ashr <8 x i16> %10, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ; Clamp from below: max(x, -128) via icmp sgt + select.
2757  %13 = icmp sgt <8 x i16> %11, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2758  %14 = icmp sgt <8 x i16> %12, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2759  %15 = select <8 x i1> %13, <8 x i16> %11, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2760  %16 = select <8 x i1> %14, <8 x i16> %12, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
  ; Clamp from above: min(x, 127) via icmp slt + select.
2761  %17 = icmp slt <8 x i16> %15, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2762  %18 = icmp slt <8 x i16> %16, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2763  %19 = select <8 x i1> %17, <8 x i16> %15, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2764  %20 = select <8 x i1> %18, <8 x i16> %16, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
  ; Re-interleave even/odd results back into original lane order, narrow to i8
  ; and store under the same mask used for the loads.
2765  %21 = shufflevector <8 x i16> %19, <8 x i16> %20, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
2766  %22 = trunc <16 x i16> %21 to <16 x i8>
2767  call void @llvm.masked.store.v16i8.p0(<16 x i8> %22, ptr %next.gep21, i32 1, <16 x i1> %0)
  ; Advance by one full vector; exit once the rounded-up bound is reached.
2768  %index.next = add i32 %index, 16
2769  %23 = icmp eq i32 %index.next, %n.vec
2770  br i1 %23, label %for.cond.cleanup, label %vector.body
2771
2772for.cond.cleanup:                                 ; preds = %vector.body, %entry
2773  ret void
2774}
2775
; usatmul_8_q7: unsigned saturating multiply of i8 elements, 8 lanes per
; vector iteration, with a scalar loop for the remainder.
;
; Control flow in the IR:
;   * %N == 0          -> return.
;   * %N < 8           -> scalar loop only (for.body), starting at element 0.
;   * otherwise        -> vector.body processes %n.vec = %N & -8 elements using
;                         unmasked <8 x i8> loads/stores, then middle.block
;                         either returns (%n.vec == %N) or falls into the
;                         scalar loop for the remaining elements.
; Both loops compute min((zext(a) * zext(b)) >> 7, 255) in i16 and truncate the
; result to i8.
;
; The assertions below expect the vector body to lower to vldrb.u16 loads,
; vmul.i16, vqshrnb.u16 #7 and a vstrb.16 store. They are autogenerated (see
; the NOTE at the top of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
2776define arm_aapcs_vfpcc void @usatmul_8_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
2777; CHECK-LABEL: usatmul_8_q7:
2778; CHECK:       @ %bb.0: @ %entry
2779; CHECK-NEXT:    .save {r4, r5, r6, lr}
2780; CHECK-NEXT:    push {r4, r5, r6, lr}
2781; CHECK-NEXT:    cbz r3, .LBB20_8
2782; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
2783; CHECK-NEXT:    cmp r3, #7
2784; CHECK-NEXT:    bhi .LBB20_3
2785; CHECK-NEXT:  @ %bb.2:
2786; CHECK-NEXT:    movs r5, #0
2787; CHECK-NEXT:    mov r12, r0
2788; CHECK-NEXT:    mov r6, r1
2789; CHECK-NEXT:    mov r4, r2
2790; CHECK-NEXT:    b .LBB20_6
2791; CHECK-NEXT:  .LBB20_3: @ %vector.ph
2792; CHECK-NEXT:    bic r5, r3, #7
2793; CHECK-NEXT:    movs r4, #1
2794; CHECK-NEXT:    sub.w r6, r5, #8
2795; CHECK-NEXT:    add.w r12, r0, r5
2796; CHECK-NEXT:    add.w lr, r4, r6, lsr #3
2797; CHECK-NEXT:    adds r4, r2, r5
2798; CHECK-NEXT:    adds r6, r1, r5
2799; CHECK-NEXT:  .LBB20_4: @ %vector.body
2800; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2801; CHECK-NEXT:    vldrb.u16 q0, [r0], #8
2802; CHECK-NEXT:    vldrb.u16 q1, [r1], #8
2803; CHECK-NEXT:    vmul.i16 q0, q1, q0
2804; CHECK-NEXT:    vqshrnb.u16 q0, q0, #7
2805; CHECK-NEXT:    vstrb.16 q0, [r2], #8
2806; CHECK-NEXT:    le lr, .LBB20_4
2807; CHECK-NEXT:  @ %bb.5: @ %middle.block
2808; CHECK-NEXT:    cmp r5, r3
2809; CHECK-NEXT:    it eq
2810; CHECK-NEXT:    popeq {r4, r5, r6, pc}
2811; CHECK-NEXT:  .LBB20_6: @ %for.body.preheader23
2812; CHECK-NEXT:    sub.w lr, r3, r5
2813; CHECK-NEXT:  .LBB20_7: @ %for.body
2814; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2815; CHECK-NEXT:    ldrb r0, [r12], #1
2816; CHECK-NEXT:    ldrb r1, [r6], #1
2817; CHECK-NEXT:    muls r0, r1, r0
2818; CHECK-NEXT:    lsrs r1, r0, #7
2819; CHECK-NEXT:    cmp r1, #255
2820; CHECK-NEXT:    mov.w r1, #255
2821; CHECK-NEXT:    it lo
2822; CHECK-NEXT:    lsrlo r1, r0, #7
2823; CHECK-NEXT:    strb r1, [r4], #1
2824; CHECK-NEXT:    le lr, .LBB20_7
2825; CHECK-NEXT:  .LBB20_8: @ %for.cond.cleanup
2826; CHECK-NEXT:    pop {r4, r5, r6, pc}
2827entry:
  ; Nothing to do for an empty input.
2828  %cmp10 = icmp eq i32 %N, 0
2829  br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
2830
2831for.body.preheader:                               ; preds = %entry
  ; Fewer than 8 elements: skip the vector loop entirely.
2832  %min.iters.check = icmp ult i32 %N, 8
2833  br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
2834
2835for.body.preheader23:                             ; preds = %middle.block, %for.body.preheader
  ; Scalar-loop start state: element 0 and the original pointers when coming
  ; straight from the preheader, or %n.vec and the advanced pointers when
  ; coming from the vector loop.
2836  %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
2837  %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
2838  %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
2839  %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
2840  br label %for.body
2841
2842vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec = %N rounded down to a multiple of 8; %ind.end* are the pointers
  ; just past the vectorized portion.
2843  %n.vec = and i32 %N, -8
2844  %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec
2845  %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec
2846  %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec
2847  br label %vector.body
2848
2849vector.body:                                      ; preds = %vector.body, %vector.ph
2850  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2851  %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2852  %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2853  %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
  ; Load 8 bytes from each source and zero-extend to i16.
2854  %wide.load = load <8 x i8>, ptr %next.gep, align 1
2855  %0 = zext <8 x i8> %wide.load to <8 x i16>
2856  %wide.load22 = load <8 x i8>, ptr %next.gep20, align 1
2857  %1 = zext <8 x i8> %wide.load22 to <8 x i16>
  ; Product of two zero-extended bytes is at most 255*255, which fits in i16.
2858  %2 = mul nuw <8 x i16> %1, %0
2859  %3 = lshr <8 x i16> %2, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ; Unsigned saturation: min(x, 255) via icmp ult + select, then narrow to i8.
2860  %4 = icmp ult <8 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
2861  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
2862  %6 = trunc <8 x i16> %5 to <8 x i8>
2863  store <8 x i8> %6, ptr %next.gep21, align 1
2864  %index.next = add i32 %index, 8
2865  %7 = icmp eq i32 %index.next, %n.vec
2866  br i1 %7, label %middle.block, label %vector.body
2867
2868middle.block:                                     ; preds = %vector.body
  ; If %N was a multiple of 8 there is no remainder to process.
2869  %cmp.n = icmp eq i32 %n.vec, %N
2870  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
2871
2872for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
2873  ret void
2874
2875for.body:                                         ; preds = %for.body.preheader23, %for.body
  ; Scalar remainder loop: one element per iteration, same arithmetic as the
  ; vector body.
2876  %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
2877  %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
2878  %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
2879  %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
2880  %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1
2881  %8 = load i8, ptr %pSrcA.addr.013, align 1
2882  %conv1 = zext i8 %8 to i16
2883  %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1
2884  %9 = load i8, ptr %pSrcB.addr.012, align 1
2885  %conv3 = zext i8 %9 to i16
2886  %mul = mul nuw i16 %conv3, %conv1
2887  %10 = lshr i16 %mul, 7
  ; Saturate to 255 before narrowing back to i8.
2888  %11 = icmp ult i16 %10, 255
2889  %retval.0.i = select i1 %11, i16 %10, i16 255
2890  %conv5 = trunc i16 %retval.0.i to i8
2891  %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1
2892  store i8 %conv5, ptr %pDst.addr.011, align 1
2893  %inc = add nuw i32 %i.014, 1
2894  %exitcond = icmp eq i32 %inc, %N
2895  br i1 %exitcond, label %for.cond.cleanup, label %for.body
2896}
2897
; usatmul_16_q7: unsigned saturating multiply of i8 elements, 16 lanes per
; vector iteration, with a scalar loop for the remainder.
;
; Control flow in the IR:
;   * %N == 0          -> return.
;   * %N < 16          -> scalar loop only (for.body), starting at element 0.
;   * otherwise        -> vector.body processes %n.vec = %N & -16 elements
;                         using unmasked <16 x i8> loads/stores, then
;                         middle.block either returns (%n.vec == %N) or falls
;                         into the scalar loop for the remaining elements.
; Both loops compute min((zext(a) * zext(b)) >> 7, 255) in i16 and truncate the
; result to i8.
;
; The assertions below expect the vector body to lower to vldrb.u8 loads,
; vmullb.u8 / vmullt.u8, vqshrnb.u16 / vqshrnt.u16 #7 and a vstrb.8 store.
; They are autogenerated (see the NOTE at the top of this file); regenerate
; them with utils/update_llc_test_checks.py rather than editing them by hand.
2898define arm_aapcs_vfpcc void @usatmul_16_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
2899; CHECK-LABEL: usatmul_16_q7:
2900; CHECK:       @ %bb.0: @ %entry
2901; CHECK-NEXT:    .save {r4, r5, r6, lr}
2902; CHECK-NEXT:    push {r4, r5, r6, lr}
2903; CHECK-NEXT:    cmp r3, #0
2904; CHECK-NEXT:    beq .LBB21_8
2905; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
2906; CHECK-NEXT:    cmp r3, #15
2907; CHECK-NEXT:    bhi .LBB21_3
2908; CHECK-NEXT:  @ %bb.2:
2909; CHECK-NEXT:    movs r5, #0
2910; CHECK-NEXT:    mov r12, r0
2911; CHECK-NEXT:    mov r6, r1
2912; CHECK-NEXT:    mov r4, r2
2913; CHECK-NEXT:    b .LBB21_6
2914; CHECK-NEXT:  .LBB21_3: @ %vector.ph
2915; CHECK-NEXT:    bic r5, r3, #15
2916; CHECK-NEXT:    movs r4, #1
2917; CHECK-NEXT:    sub.w r6, r5, #16
2918; CHECK-NEXT:    add.w r12, r0, r5
2919; CHECK-NEXT:    add.w lr, r4, r6, lsr #4
2920; CHECK-NEXT:    adds r4, r2, r5
2921; CHECK-NEXT:    adds r6, r1, r5
2922; CHECK-NEXT:  .LBB21_4: @ %vector.body
2923; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2924; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
2925; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
2926; CHECK-NEXT:    vmullt.u8 q2, q1, q0
2927; CHECK-NEXT:    vmullb.u8 q0, q1, q0
2928; CHECK-NEXT:    vqshrnb.u16 q0, q0, #7
2929; CHECK-NEXT:    vqshrnt.u16 q0, q2, #7
2930; CHECK-NEXT:    vstrb.8 q0, [r2], #16
2931; CHECK-NEXT:    le lr, .LBB21_4
2932; CHECK-NEXT:  @ %bb.5: @ %middle.block
2933; CHECK-NEXT:    cmp r5, r3
2934; CHECK-NEXT:    it eq
2935; CHECK-NEXT:    popeq {r4, r5, r6, pc}
2936; CHECK-NEXT:  .LBB21_6: @ %for.body.preheader23
2937; CHECK-NEXT:    sub.w lr, r3, r5
2938; CHECK-NEXT:  .LBB21_7: @ %for.body
2939; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2940; CHECK-NEXT:    ldrb r0, [r12], #1
2941; CHECK-NEXT:    ldrb r1, [r6], #1
2942; CHECK-NEXT:    muls r0, r1, r0
2943; CHECK-NEXT:    lsrs r1, r0, #7
2944; CHECK-NEXT:    cmp r1, #255
2945; CHECK-NEXT:    mov.w r1, #255
2946; CHECK-NEXT:    it lo
2947; CHECK-NEXT:    lsrlo r1, r0, #7
2948; CHECK-NEXT:    strb r1, [r4], #1
2949; CHECK-NEXT:    le lr, .LBB21_7
2950; CHECK-NEXT:  .LBB21_8: @ %for.cond.cleanup
2951; CHECK-NEXT:    pop {r4, r5, r6, pc}
2952entry:
  ; Nothing to do for an empty input.
2953  %cmp10 = icmp eq i32 %N, 0
2954  br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
2955
2956for.body.preheader:                               ; preds = %entry
  ; Fewer than 16 elements: skip the vector loop entirely.
2957  %min.iters.check = icmp ult i32 %N, 16
2958  br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
2959
2960for.body.preheader23:                             ; preds = %middle.block, %for.body.preheader
  ; Scalar-loop start state: element 0 and the original pointers when coming
  ; straight from the preheader, or %n.vec and the advanced pointers when
  ; coming from the vector loop.
2961  %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
2962  %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
2963  %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
2964  %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
2965  br label %for.body
2966
2967vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec = %N rounded down to a multiple of 16; %ind.end* are the pointers
  ; just past the vectorized portion.
2968  %n.vec = and i32 %N, -16
2969  %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec
2970  %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec
2971  %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec
2972  br label %vector.body
2973
2974vector.body:                                      ; preds = %vector.body, %vector.ph
2975  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2976  %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2977  %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2978  %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
  ; Load 16 bytes from each source and zero-extend to i16.
2979  %wide.load = load <16 x i8>, ptr %next.gep, align 1
2980  %0 = zext <16 x i8> %wide.load to <16 x i16>
2981  %wide.load22 = load <16 x i8>, ptr %next.gep20, align 1
2982  %1 = zext <16 x i8> %wide.load22 to <16 x i16>
  ; Product of two zero-extended bytes is at most 255*255, which fits in i16.
2983  %2 = mul nuw <16 x i16> %1, %0
2984  %3 = lshr <16 x i16> %2, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ; Unsigned saturation: min(x, 255) via icmp ult + select, then narrow to i8.
2985  %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
2986  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
2987  %6 = trunc <16 x i16> %5 to <16 x i8>
2988  store <16 x i8> %6, ptr %next.gep21, align 1
2989  %index.next = add i32 %index, 16
2990  %7 = icmp eq i32 %index.next, %n.vec
2991  br i1 %7, label %middle.block, label %vector.body
2992
2993middle.block:                                     ; preds = %vector.body
  ; If %N was a multiple of 16 there is no remainder to process.
2994  %cmp.n = icmp eq i32 %n.vec, %N
2995  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
2996
2997for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
2998  ret void
2999
3000for.body:                                         ; preds = %for.body.preheader23, %for.body
  ; Scalar remainder loop: one element per iteration, same arithmetic as the
  ; vector body.
3001  %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
3002  %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
3003  %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
3004  %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
3005  %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1
3006  %8 = load i8, ptr %pSrcA.addr.013, align 1
3007  %conv1 = zext i8 %8 to i16
3008  %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1
3009  %9 = load i8, ptr %pSrcB.addr.012, align 1
3010  %conv3 = zext i8 %9 to i16
3011  %mul = mul nuw i16 %conv3, %conv1
3012  %10 = lshr i16 %mul, 7
  ; Saturate to 255 before narrowing back to i8.
3013  %11 = icmp ult i16 %10, 255
3014  %retval.0.i = select i1 %11, i16 %10, i16 255
3015  %conv5 = trunc i16 %retval.0.i to i8
3016  %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1
3017  store i8 %conv5, ptr %pDst.addr.011, align 1
3018  %inc = add nuw i32 %i.014, 1
3019  %exitcond = icmp eq i32 %inc, %N
3020  br i1 %exitcond, label %for.cond.cleanup, label %for.body
3021}
3022
; Declarations of the masked load/store intrinsics used by the tail-predicated
; tests in this file. As used by the calls in this file, the operands are
; (pointer, i32 alignment, lane mask, passthru) for loads and
; (value, pointer, i32 alignment, lane mask) for stores.
3023declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
3024declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
3025declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
3026declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)
3027declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
3028declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
3029declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>)
3030declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>)
3031declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>)
3032declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>)
3033