xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll (revision 23d45e55edb0ca4567f5876e7051ff4a649213df)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
3
; Add-reduction of a widened 16-lane multiply: both <16 x i8> arguments are
; zero-extended to i16, multiplied, arithmetically shifted right by 14 and
; summed into a single i16.
; The expected assembly forms the products with a vmullt.u8/vmullb.u8 pair,
; shifts each half with vshr.s16 and combines the halves with vaddv.u16
; followed by the accumulating vaddva.u16.
4define arm_aapcs_vfpcc i16 @reduce_v16i16_shift_mul(<16 x i8> %s0, <16 x i8> %s1) {
5; CHECK-LABEL: reduce_v16i16_shift_mul:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    vmullt.u8 q2, q0, q1
8; CHECK-NEXT:    vmullb.u8 q0, q0, q1
9; CHECK-NEXT:    vshr.s16 q2, q2, #14
10; CHECK-NEXT:    vshr.s16 q0, q0, #14
11; CHECK-NEXT:    vaddv.u16 r0, q2
12; CHECK-NEXT:    vaddva.u16 r0, q0
13; CHECK-NEXT:    bx lr
14entry:
15  %s0s = zext <16 x i8> %s0 to <16 x i16>
16  %s1s = zext <16 x i8> %s1 to <16 x i16>
17  %m = mul <16 x i16> %s0s, %s1s
18  %sh = ashr <16 x i16> %m, <i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14>
19  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %sh)
20  ret i16 %result
21}
22
; 8-lane variant of the widened multiply reduction: the <8 x i8> arguments are
; zero-extended to i16, multiplied, arithmetically shifted right by 14 and
; summed into a single i16.
; The expected assembly needs only one vmullb.u8, one vshr.s16 and one
; vaddv.u16.
23define arm_aapcs_vfpcc i16 @reduce_v8i16_shift_mul(<8 x i8> %s0, <8 x i8> %s1) {
24; CHECK-LABEL: reduce_v8i16_shift_mul:
25; CHECK:       @ %bb.0: @ %entry
26; CHECK-NEXT:    vmullb.u8 q0, q0, q1
27; CHECK-NEXT:    vshr.s16 q0, q0, #14
28; CHECK-NEXT:    vaddv.u16 r0, q0
29; CHECK-NEXT:    bx lr
30entry:
31  %s0s = zext <8 x i8> %s0 to <8 x i16>
32  %s1s = zext <8 x i8> %s1 to <8 x i16>
33  %m = mul <8 x i16> %s0s, %s1s
34  %sh = ashr <8 x i16> %m, <i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14>
35  %result = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %sh)
36  ret i16 %result
37}
38
; Same shape as the 16-lane multiply reduction, but the widened operands are
; subtracted instead of multiplied before the arithmetic shift right by 14 and
; the add-reduction to i16.
; The expected assembly widens each operand with vmovlt.u8/vmovlb.u8, subtracts
; the matching halves with vsub.i16, shifts them with vshr.s16 and sums them
; with vaddv.u16 followed by vaddva.u16.
39define arm_aapcs_vfpcc i16 @reduce_v16i16_shift_sub(<16 x i8> %s0, <16 x i8> %s1) {
40; CHECK-LABEL: reduce_v16i16_shift_sub:
41; CHECK:       @ %bb.0: @ %entry
42; CHECK-NEXT:    vmovlt.u8 q2, q1
43; CHECK-NEXT:    vmovlt.u8 q3, q0
44; CHECK-NEXT:    vsub.i16 q2, q3, q2
45; CHECK-NEXT:    vmovlb.u8 q1, q1
46; CHECK-NEXT:    vmovlb.u8 q0, q0
47; CHECK-NEXT:    vshr.s16 q2, q2, #14
48; CHECK-NEXT:    vsub.i16 q0, q0, q1
49; CHECK-NEXT:    vaddv.u16 r0, q2
50; CHECK-NEXT:    vshr.s16 q0, q0, #14
51; CHECK-NEXT:    vaddva.u16 r0, q0
52; CHECK-NEXT:    bx lr
53entry:
54  %s0s = zext <16 x i8> %s0 to <16 x i16>
55  %s1s = zext <16 x i8> %s1 to <16 x i16>
56  %m = sub <16 x i16> %s0s, %s1s
57  %sh = ashr <16 x i16> %m, <i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14>
58  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %sh)
59  ret i16 %result
60}
61
; Predicated multiply-accumulate reduction. A lane contributes zext(%x) *
; zext(%y) to the i32 sum only when the matching lanes of %a and %b are both
; zero; every other lane contributes zero.
; The expected assembly ORs %a and %b together (vorr), round-trips the widened
; compare result through two 16-byte stack buffers (32 bytes of stack in
; total) and finishes with a vmlavt.u16 predicated by a vpt.i16 block.
; NOTE(review): the function name mentions v4i32/v4i64, but the body works on
; <8 x i16> values extended to <8 x i32> - confirm the name is intentional.
62define arm_aapcs_vfpcc i32 @mlapred_v4i32_v4i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %a, <8 x i16> %b) {
63; CHECK-LABEL: mlapred_v4i32_v4i64_zext:
64; CHECK:       @ %bb.0: @ %entry
65; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
66; CHECK-NEXT:    vpush {d8, d9, d10, d11}
67; CHECK-NEXT:    .pad #32
68; CHECK-NEXT:    sub sp, #32
69; CHECK-NEXT:    vorr q2, q2, q3
70; CHECK-NEXT:    mov r0, sp
71; CHECK-NEXT:    vstrw.32 q2, [r0]
72; CHECK-NEXT:    vmov.i8 q3, #0xff
73; CHECK-NEXT:    vldrh.u32 q2, [r0, #8]
74; CHECK-NEXT:    vldrh.u32 q5, [r0]
75; CHECK-NEXT:    add r0, sp, #16
76; CHECK-NEXT:    vcmp.i32 eq, q2, zr
77; CHECK-NEXT:    vmov.i8 q2, #0x0
78; CHECK-NEXT:    vpsel q4, q3, q2
79; CHECK-NEXT:    vcmp.i32 eq, q5, zr
80; CHECK-NEXT:    vpsel q2, q3, q2
81; CHECK-NEXT:    vstrh.32 q4, [r0, #8]
82; CHECK-NEXT:    vstrh.32 q2, [r0]
83; CHECK-NEXT:    vldrw.u32 q2, [r0]
84; CHECK-NEXT:    vpt.i16 ne, q2, zr
85; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
86; CHECK-NEXT:    add sp, #32
87; CHECK-NEXT:    vpop {d8, d9, d10, d11}
88; CHECK-NEXT:    bx lr
89entry:
90  %aa = zext <8 x i16> %a to <8 x i32>
91  %bb = zext <8 x i16> %b to <8 x i32>
92  %c1 = icmp eq <8 x i32> %aa, zeroinitializer
93  %c2 = icmp eq <8 x i32> %bb, zeroinitializer
94  %c = and <8 x i1> %c1, %c2
95  %xx = zext <8 x i16> %x to <8 x i32>
96  %yy = zext <8 x i16> %y to <8 x i32>
97  %m = mul <8 x i32> %xx, %yy
98  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
99  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
100  ret i32 %z
101}
102
; Already-vectorized correlation loop nest. For every lag in [0, sext(%Ls)) it
; accumulates (ID[i + lag] * ID[i]) >> zext(%S) over i in [0, sext(%DS) - lag)
; and stores bits 16..31 of the 32-bit sum to ACD[lag].
; The inner loop is split into an 8-wide main vector loop and a 4-wide
; epilogue masked by llvm.get.active.lane.mask.
; The expected assembly lowers the main loop to a low-overhead "le" loop using
; vmullb.s16/vmullt.s16, a vshl.s32 by the negated shift amount and two
; vaddva.u32 accumulations; the epilogue becomes a tail-predicated
; dlstp.32/letp loop using vldrh.s32, vmul.i32, vshl.s32 and vaddva.u32.
103define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef writeonly %ACD, i16 noundef signext %DS, i16 noundef signext %Ls, i16 noundef signext %S) {
104; CHECK-LABEL: correlate:
105; CHECK:       @ %bb.0: @ %entry
106; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
107; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
108; CHECK-NEXT:    .pad #12
109; CHECK-NEXT:    sub sp, #12
110; CHECK-NEXT:    cmp r3, #1
111; CHECK-NEXT:    stm.w sp, {r0, r1, r3} @ 12-byte Folded Spill
112; CHECK-NEXT:    blt .LBB4_12
113; CHECK-NEXT:  @ %bb.1: @ %for.body.lr.ph
114; CHECK-NEXT:    ldr r1, [sp, #48]
115; CHECK-NEXT:    add.w r12, r2, #3
116; CHECK-NEXT:    ldr.w r11, [sp] @ 4-byte Reload
117; CHECK-NEXT:    mov.w r10, #0
118; CHECK-NEXT:    mov r8, r2
119; CHECK-NEXT:    mov r0, r2
120; CHECK-NEXT:    uxth r3, r1
121; CHECK-NEXT:    b .LBB4_4
122; CHECK-NEXT:  .LBB4_2: @ in Loop: Header=BB4_4 Depth=1
123; CHECK-NEXT:    movs r6, #0
124; CHECK-NEXT:  .LBB4_3: @ %for.end
125; CHECK-NEXT:    @ in Loop: Header=BB4_4 Depth=1
126; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
127; CHECK-NEXT:    lsrs r2, r6, #16
128; CHECK-NEXT:    sub.w r12, r12, #1
129; CHECK-NEXT:    add.w r11, r11, #2
130; CHECK-NEXT:    sub.w r8, r8, #1
131; CHECK-NEXT:    strh.w r2, [r7, r10, lsl #1]
132; CHECK-NEXT:    add.w r10, r10, #1
133; CHECK-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
134; CHECK-NEXT:    cmp r10, r2
135; CHECK-NEXT:    mov r2, r0
136; CHECK-NEXT:    beq .LBB4_12
137; CHECK-NEXT:  .LBB4_4: @ %for.body
138; CHECK-NEXT:    @ =>This Loop Header: Depth=1
139; CHECK-NEXT:    @ Child Loop BB4_8 Depth 2
140; CHECK-NEXT:    @ Child Loop BB4_11 Depth 2
141; CHECK-NEXT:    cmp r2, r10
142; CHECK-NEXT:    ble .LBB4_2
143; CHECK-NEXT:  @ %bb.5: @ %vector.main.loop.iter.check
144; CHECK-NEXT:    @ in Loop: Header=BB4_4 Depth=1
145; CHECK-NEXT:    sub.w r4, r2, r10
146; CHECK-NEXT:    cmp r4, #8
147; CHECK-NEXT:    bhs .LBB4_7
148; CHECK-NEXT:  @ %bb.6: @ in Loop: Header=BB4_4 Depth=1
149; CHECK-NEXT:    movs r6, #0
150; CHECK-NEXT:    mov.w r9, #0
151; CHECK-NEXT:    b .LBB4_10
152; CHECK-NEXT:  .LBB4_7: @ %vector.ph
153; CHECK-NEXT:    @ in Loop: Header=BB4_4 Depth=1
154; CHECK-NEXT:    bic r2, r8, #7
155; CHECK-NEXT:    movs r7, #1
156; CHECK-NEXT:    subs r2, #8
157; CHECK-NEXT:    bic r9, r4, #7
158; CHECK-NEXT:    movs r6, #0
159; CHECK-NEXT:    mov r5, r11
160; CHECK-NEXT:    add.w lr, r7, r2, lsr #3
161; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
162; CHECK-NEXT:  .LBB4_8: @ %vector.body
163; CHECK-NEXT:    @ Parent Loop BB4_4 Depth=1
164; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
165; CHECK-NEXT:    vldrh.u16 q0, [r2], #16
166; CHECK-NEXT:    vldrh.u16 q1, [r5], #16
167; CHECK-NEXT:    rsbs r7, r3, #0
168; CHECK-NEXT:    vmullb.s16 q2, q1, q0
169; CHECK-NEXT:    vmullt.s16 q0, q1, q0
170; CHECK-NEXT:    vshl.s32 q2, r7
171; CHECK-NEXT:    vshl.s32 q0, r7
172; CHECK-NEXT:    vaddva.u32 r6, q2
173; CHECK-NEXT:    vaddva.u32 r6, q0
174; CHECK-NEXT:    le lr, .LBB4_8
175; CHECK-NEXT:  @ %bb.9: @ %middle.block
176; CHECK-NEXT:    @ in Loop: Header=BB4_4 Depth=1
177; CHECK-NEXT:    cmp r4, r9
178; CHECK-NEXT:    beq .LBB4_3
179; CHECK-NEXT:  .LBB4_10: @ %vec.epilog.ph
180; CHECK-NEXT:    @ in Loop: Header=BB4_4 Depth=1
181; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
182; CHECK-NEXT:    add.w r2, r9, r10
183; CHECK-NEXT:    sub.w r5, r8, r9
184; CHECK-NEXT:    add.w r7, r1, r9, lsl #1
185; CHECK-NEXT:    add.w r2, r1, r2, lsl #1
186; CHECK-NEXT:    dlstp.32 lr, r5
187; CHECK-NEXT:  .LBB4_11: @ %vec.epilog.vector.body
188; CHECK-NEXT:    @ Parent Loop BB4_4 Depth=1
189; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
190; CHECK-NEXT:    rsbs r4, r3, #0
191; CHECK-NEXT:    vldrh.s32 q0, [r7], #8
192; CHECK-NEXT:    vldrh.s32 q1, [r2], #8
193; CHECK-NEXT:    vmul.i32 q0, q1, q0
194; CHECK-NEXT:    vshl.s32 q0, r4
195; CHECK-NEXT:    vaddva.u32 r6, q0
196; CHECK-NEXT:    letp lr, .LBB4_11
197; CHECK-NEXT:    b .LBB4_3
198; CHECK-NEXT:  .LBB4_12: @ %for.end17
199; CHECK-NEXT:    add sp, #12
200; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; Skip the whole loop nest unless %Ls is positive.
201entry:
202  %conv = sext i16 %Ls to i32
203  %cmp31 = icmp sgt i16 %Ls, 0
204  br i1 %cmp31, label %for.body.lr.ph, label %for.end17
205
; Loop-invariant setup: the shift amount zext(%S) is splatted once for the
; 8-wide main loop and once for the 4-wide epilogue.
206for.body.lr.ph:                                   ; preds = %entry
207  %conv2 = sext i16 %DS to i32
208  %conv1027 = zext i16 %S to i32
209  %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1027, i64 0
210  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
211  %broadcast.splatinsert40 = insertelement <4 x i32> poison, i32 %conv1027, i64 0
212  %broadcast.splat41 = shufflevector <4 x i32> %broadcast.splatinsert40, <4 x i32> poison, <4 x i32> zeroinitializer
213  br label %for.body
214
; Outer loop over the lag. %0 = sext(%DS) - lag is the inner trip count; when
; the lag is not below sext(%DS) the inner loops are skipped and 0 is stored.
215for.body:                                         ; preds = %for.body.lr.ph, %for.end
216  %lag.032 = phi i32 [ 0, %for.body.lr.ph ], [ %inc16, %for.end ]
217  %0 = sub i32 %conv2, %lag.032
218  %cmp428 = icmp slt i32 %lag.032, %conv2
219  br i1 %cmp428, label %vector.main.loop.iter.check, label %for.end
220
; Fewer than 8 iterations: go straight to the masked epilogue.
221vector.main.loop.iter.check:                      ; preds = %for.body
222  %min.iters.check = icmp ult i32 %0, 8
223  br i1 %min.iters.check, label %vec.epilog.ph, label %vector.ph
224
; %n.vec is the trip count rounded down to a multiple of 8.
225vector.ph:                                        ; preds = %vector.main.loop.iter.check
226  %n.vec = and i32 %0, -8
227  br label %vector.body
228
; Main vector loop: 8 elements per iteration. Both i16 loads are sign-extended
; to i32, multiplied, shifted right arithmetically by the splatted shift
; amount, reduced with reduce.add and added to the running sum %vec.phi.
229vector.body:                                      ; preds = %vector.body, %vector.ph
230  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
231  %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
232  %1 = getelementptr inbounds i16, ptr %ID, i32 %index
233  %wide.load = load <8 x i16>, ptr %1, align 2
234  %2 = sext <8 x i16> %wide.load to <8 x i32>
235  %3 = add nuw nsw i32 %index, %lag.032
236  %4 = getelementptr inbounds i16, ptr %ID, i32 %3
237  %wide.load34 = load <8 x i16>, ptr %4, align 2
238  %5 = sext <8 x i16> %wide.load34 to <8 x i32>
239  %6 = mul nsw <8 x i32> %5, %2
240  %7 = ashr <8 x i32> %6, %broadcast.splat
241  %8 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
242  %9 = add i32 %8, %vec.phi
243  %index.next = add nuw i32 %index, 8
244  %10 = icmp eq i32 %index.next, %n.vec
245  br i1 %10, label %middle.block, label %vector.body
246
; If the main loop consumed every element, skip the epilogue.
247middle.block:                                     ; preds = %vector.body
248  %cmp.n = icmp eq i32 %0, %n.vec
249  br i1 %cmp.n, label %for.end, label %vec.epilog.ph
250
; Epilogue setup: resume from the main loop's sum and index (both 0 when the
; main loop was skipped); %n.vec36 is the trip count rounded up to a multiple
; of 4.
251vec.epilog.ph:                                    ; preds = %middle.block, %vector.main.loop.iter.check
252  %bc.merge.rdx = phi i32 [ 0, %vector.main.loop.iter.check ], [ %9, %middle.block ]
253  %vec.epilog.resume.val = phi i32 [ 0, %vector.main.loop.iter.check ], [ %n.vec, %middle.block ]
254  %n.rnd.up = add i32 %0, 3
255  %n.vec36 = and i32 %n.rnd.up, -4
256  br label %vec.epilog.vector.body
257
; Epilogue loop: 4 elements per iteration. The active lane mask guards both
; masked loads and, through the select, zeroes inactive lanes before the
; reduction.
258vec.epilog.vector.body:                           ; preds = %vec.epilog.vector.body, %vec.epilog.ph
259  %index37 = phi i32 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next42, %vec.epilog.vector.body ]
260  %vec.phi38 = phi i32 [ %bc.merge.rdx, %vec.epilog.ph ], [ %20, %vec.epilog.vector.body ]
261  %active.lane.mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index37, i32 %0)
262  %11 = getelementptr inbounds i16, ptr %ID, i32 %index37
263  %wide.masked.load = tail call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %11, i32 2, <4 x i1> %active.lane.mask, <4 x i16> poison)
264  %12 = sext <4 x i16> %wide.masked.load to <4 x i32>
265  %13 = add nuw nsw i32 %index37, %lag.032
266  %14 = getelementptr inbounds i16, ptr %ID, i32 %13
267  %wide.masked.load39 = tail call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %14, i32 2, <4 x i1> %active.lane.mask, <4 x i16> poison)
268  %15 = sext <4 x i16> %wide.masked.load39 to <4 x i32>
269  %16 = mul nsw <4 x i32> %15, %12
270  %17 = ashr <4 x i32> %16, %broadcast.splat41
271  %18 = select <4 x i1> %active.lane.mask, <4 x i32> %17, <4 x i32> zeroinitializer
272  %19 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %18)
273  %20 = add i32 %19, %vec.phi38
274  %index.next42 = add i32 %index37, 4
275  %21 = icmp eq i32 %index.next42, %n.vec36
276  %br i1 %21, label %for.end, label %vec.epilog.vector.body
277
; Store the upper 16 bits of the accumulated sum to ACD[lag], then advance the
; lag until it reaches sext(%Ls).
278for.end:                                          ; preds = %vec.epilog.vector.body, %middle.block, %for.body
279  %Accumulator.0.lcssa = phi i32 [ 0, %for.body ], [ %9, %middle.block ], [ %20, %vec.epilog.vector.body ]
280  %22 = lshr i32 %Accumulator.0.lcssa, 16
281  %conv13 = trunc i32 %22 to i16
282  %arrayidx14 = getelementptr inbounds i16, ptr %ACD, i32 %lag.032
283  store i16 %conv13, ptr %arrayidx14, align 2
284  %inc16 = add nuw nsw i32 %lag.032, 1
285  %exitcond33.not = icmp eq i32 %inc16, %conv
286  br i1 %exitcond33.not, label %for.end17, label %for.body
287
288for.end17:                                        ; preds = %for.end, %entry
289  ret void
290}
291
; Declarations of the intrinsics called by the test functions in this file.
; Parameters are left unnamed throughout: a name on a declaration's parameter
; carries no meaning, so the stray "%sh" names on the two i16 reductions are
; dropped to match the other declarations.
292declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
293declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
294declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
295declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
296declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr nocapture, i32 immarg, <4 x i1>, <4 x i16>)
297declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
298