xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll (revision b5b663aac17415625340eb29c8010832bfc4c21c)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
3
; arm_cmplx_mag_squared_f16: for each of %numSamples pairs of consecutive half
; values read from %pSrc, stores the sum of the two squares to %pDst.
; The vector body loads <16 x half>, squares it with two separate (identical)
; fmul instructions and splits the even/odd lanes with shufflevector. The CHECK
; lines expect this to be selected as a vld20.16/vld21.16 pair followed by
; vmul.f16 + vfma.f16.
4define void @arm_cmplx_mag_squared_f16(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %numSamples) {
5; CHECK-LABEL: arm_cmplx_mag_squared_f16:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r4, r5, r7, lr}
8; CHECK-NEXT:    push {r4, r5, r7, lr}
9; CHECK-NEXT:    cmp r2, #0
10; CHECK-NEXT:    beq .LBB0_8
11; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
12; CHECK-NEXT:    cmp r2, #8
13; CHECK-NEXT:    blo .LBB0_9
14; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
15; CHECK-NEXT:    add.w r3, r0, r2, lsl #2
16; CHECK-NEXT:    cmp r3, r1
17; CHECK-NEXT:    itt hi
18; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #1
19; CHECK-NEXT:    cmphi r3, r0
20; CHECK-NEXT:    bhi .LBB0_9
21; CHECK-NEXT:  @ %bb.3: @ %vector.ph
22; CHECK-NEXT:    bic r4, r2, #7
23; CHECK-NEXT:    movs r5, #1
24; CHECK-NEXT:    sub.w r3, r4, #8
25; CHECK-NEXT:    add.w r12, r1, r4, lsl #1
26; CHECK-NEXT:    add.w lr, r5, r3, lsr #3
27; CHECK-NEXT:    add.w r3, r0, r4, lsl #2
28; CHECK-NEXT:    and r5, r2, #7
29; CHECK-NEXT:  .LBB0_4: @ %vector.body
30; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
31; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
32; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
33; CHECK-NEXT:    vmul.f16 q0, q0, q0
34; CHECK-NEXT:    vfma.f16 q0, q1, q1
35; CHECK-NEXT:    vstrb.8 q0, [r1], #16
36; CHECK-NEXT:    le lr, .LBB0_4
37; CHECK-NEXT:  @ %bb.5: @ %middle.block
38; CHECK-NEXT:    cmp r4, r2
39; CHECK-NEXT:    it eq
40; CHECK-NEXT:    popeq {r4, r5, r7, pc}
41; CHECK-NEXT:  .LBB0_6: @ %while.body.preheader26
42; CHECK-NEXT:    dls lr, r5
43; CHECK-NEXT:  .LBB0_7: @ %while.body
44; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
45; CHECK-NEXT:    vldr.16 s0, [r3]
46; CHECK-NEXT:    vldr.16 s2, [r3, #2]
47; CHECK-NEXT:    adds r3, #4
48; CHECK-NEXT:    vmul.f16 s0, s0, s0
49; CHECK-NEXT:    vfma.f16 s0, s2, s2
50; CHECK-NEXT:    vstr.16 s0, [r12]
51; CHECK-NEXT:    add.w r12, r12, #2
52; CHECK-NEXT:    le lr, .LBB0_7
53; CHECK-NEXT:  .LBB0_8: @ %while.end
54; CHECK-NEXT:    pop {r4, r5, r7, pc}
55; CHECK-NEXT:  .LBB0_9:
56; CHECK-NEXT:    mov r3, r0
57; CHECK-NEXT:    mov r12, r1
58; CHECK-NEXT:    mov r5, r2
59; CHECK-NEXT:    b .LBB0_6
60entry:
  ; Nothing to do when numSamples == 0.
61  %cmp.not11 = icmp eq i32 %numSamples, 0
62  br i1 %cmp.not11, label %while.end, label %while.body.preheader
63
64while.body.preheader:                             ; preds = %entry
  ; Fewer than 8 samples: skip the vector loop and go straight to the scalar loop.
65  %min.iters.check = icmp ult i32 %numSamples, 8
66  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck
67
68vector.memcheck:                                  ; preds = %while.body.preheader
  ; Runtime overlap check: the source range spans 2 * numSamples halves and the
  ; destination range numSamples halves; if each range ends above the start of
  ; the other, fall back to the scalar loop.
69  %scevgep = getelementptr half, ptr %pDst, i32 %numSamples
70  %0 = shl i32 %numSamples, 1
71  %scevgep18 = getelementptr half, ptr %pSrc, i32 %0
72  %bound0 = icmp ugt ptr %scevgep18, %pDst
73  %bound1 = icmp ugt ptr %scevgep, %pSrc
74  %found.conflict = and i1 %bound0, %bound1
75  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph
76
77vector.ph:                                        ; preds = %vector.memcheck
  ; %n.vec is numSamples rounded down to a multiple of 8. The %ind.end* values
  ; are the source pointer, destination pointer and remaining count that the
  ; scalar loop resumes from after the vector loop.
78  %n.vec = and i32 %numSamples, -8
79  %1 = shl i32 %n.vec, 1
80  %ind.end = getelementptr half, ptr %pSrc, i32 %1
81  %ind.end21 = getelementptr half, ptr %pDst, i32 %n.vec
82  %ind.end23 = and i32 %numSamples, 7
83  br label %vector.body
84
85vector.body:                                      ; preds = %vector.body, %vector.ph
86  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
87  %2 = shl i32 %index, 1
88  %next.gep = getelementptr half, ptr %pSrc, i32 %2
89  %next.gep24 = getelementptr half, ptr %pDst, i32 %index
90  %wide.vec = load <16 x half>, ptr %next.gep, align 2
  ; %3 and %5 are the same square of the wide load, computed twice; %4 takes the
  ; even lanes of the first and %6 the odd lanes of the second.
91  %3 = fmul fast <16 x half> %wide.vec, %wide.vec
92  %4 = shufflevector <16 x half> %3, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
93  %5 = fmul fast <16 x half> %wide.vec, %wide.vec
94  %6 = shufflevector <16 x half> %5, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
95  %7 = fadd fast <8 x half> %6, %4
96  store <8 x half> %7, ptr %next.gep24, align 2
97  %index.next = add i32 %index, 8
98  %8 = icmp eq i32 %index.next, %n.vec
99  br i1 %8, label %middle.block, label %vector.body
100
101middle.block:                                     ; preds = %vector.body
  ; Done if the vector loop covered every sample; otherwise run the scalar remainder.
102  %cmp.n = icmp eq i32 %n.vec, %numSamples
103  br i1 %cmp.n, label %while.end, label %while.body.preheader26
104
105while.body.preheader26:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
106  %pSrc.addr.014.ph = phi ptr [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
107  %pDst.addr.013.ph = phi ptr [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
108  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
109  br label %while.body
110
111while.body:                                       ; preds = %while.body.preheader26, %while.body
  ; Scalar loop: one pair per iteration. The source pointer advances by two
  ; halves, the destination by one, and the count is decremented until it is 0.
112  %pSrc.addr.014 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
113  %pDst.addr.013 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
114  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
115  %incdec.ptr = getelementptr inbounds half, ptr %pSrc.addr.014, i32 1
116  %9 = load half, ptr %pSrc.addr.014, align 2
117  %incdec.ptr1 = getelementptr inbounds half, ptr %pSrc.addr.014, i32 2
118  %10 = load half, ptr %incdec.ptr, align 2
119  %mul = fmul fast half %9, %9
120  %mul2 = fmul fast half %10, %10
121  %add = fadd fast half %mul2, %mul
122  %incdec.ptr3 = getelementptr inbounds half, ptr %pDst.addr.013, i32 1
123  store half %add, ptr %pDst.addr.013, align 2
124  %dec = add i32 %blkCnt.012, -1
125  %cmp.not = icmp eq i32 %dec, 0
126  br i1 %cmp.not, label %while.end, label %while.body
127
128while.end:                                        ; preds = %while.body, %middle.block, %entry
129  ret void
130}
131
; arm_cmplx_mag_squared_f32: for each of %numSamples pairs of consecutive float
; values read from %pSrc, stores the sum of the two squares to %pDst.
; The vector body loads <8 x float>, squares it with two separate (identical)
; fmul instructions and splits the even/odd lanes with shufflevector. The CHECK
; lines expect this to be selected as a vld20.32/vld21.32 pair followed by
; vmul.f32 + vfma.f32.
132define void @arm_cmplx_mag_squared_f32(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %numSamples) {
133; CHECK-LABEL: arm_cmplx_mag_squared_f32:
134; CHECK:       @ %bb.0: @ %entry
135; CHECK-NEXT:    .save {r4, r5, r7, lr}
136; CHECK-NEXT:    push {r4, r5, r7, lr}
137; CHECK-NEXT:    cbz r2, .LBB1_8
138; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
139; CHECK-NEXT:    cmp r2, #4
140; CHECK-NEXT:    blo .LBB1_9
141; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
142; CHECK-NEXT:    add.w r3, r0, r2, lsl #3
143; CHECK-NEXT:    cmp r3, r1
144; CHECK-NEXT:    itt hi
145; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #2
146; CHECK-NEXT:    cmphi r3, r0
147; CHECK-NEXT:    bhi .LBB1_9
148; CHECK-NEXT:  @ %bb.3: @ %vector.ph
149; CHECK-NEXT:    bic r4, r2, #3
150; CHECK-NEXT:    movs r5, #1
151; CHECK-NEXT:    subs r3, r4, #4
152; CHECK-NEXT:    add.w r12, r1, r4, lsl #2
153; CHECK-NEXT:    add.w lr, r5, r3, lsr #2
154; CHECK-NEXT:    add.w r3, r0, r4, lsl #3
155; CHECK-NEXT:    and r5, r2, #3
156; CHECK-NEXT:  .LBB1_4: @ %vector.body
157; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
158; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
159; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
160; CHECK-NEXT:    vmul.f32 q0, q0, q0
161; CHECK-NEXT:    vfma.f32 q0, q1, q1
162; CHECK-NEXT:    vstrb.8 q0, [r1], #16
163; CHECK-NEXT:    le lr, .LBB1_4
164; CHECK-NEXT:  @ %bb.5: @ %middle.block
165; CHECK-NEXT:    cmp r4, r2
166; CHECK-NEXT:    it eq
167; CHECK-NEXT:    popeq {r4, r5, r7, pc}
168; CHECK-NEXT:  .LBB1_6: @ %while.body.preheader26
169; CHECK-NEXT:    dls lr, r5
170; CHECK-NEXT:  .LBB1_7: @ %while.body
171; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
172; CHECK-NEXT:    vldr s0, [r3]
173; CHECK-NEXT:    vldr s2, [r3, #4]
174; CHECK-NEXT:    adds r3, #8
175; CHECK-NEXT:    vmul.f32 s0, s0, s0
176; CHECK-NEXT:    vfma.f32 s0, s2, s2
177; CHECK-NEXT:    vstmia r12!, {s0}
178; CHECK-NEXT:    le lr, .LBB1_7
179; CHECK-NEXT:  .LBB1_8: @ %while.end
180; CHECK-NEXT:    pop {r4, r5, r7, pc}
181; CHECK-NEXT:  .LBB1_9:
182; CHECK-NEXT:    mov r3, r0
183; CHECK-NEXT:    mov r12, r1
184; CHECK-NEXT:    mov r5, r2
185; CHECK-NEXT:    b .LBB1_6
186entry:
  ; Nothing to do when numSamples == 0.
187  %cmp.not11 = icmp eq i32 %numSamples, 0
188  br i1 %cmp.not11, label %while.end, label %while.body.preheader
189
190while.body.preheader:                             ; preds = %entry
  ; Fewer than 4 samples: skip the vector loop and go straight to the scalar loop.
191  %min.iters.check = icmp ult i32 %numSamples, 4
192  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck
193
194vector.memcheck:                                  ; preds = %while.body.preheader
  ; Runtime overlap check: the source range spans 2 * numSamples floats and the
  ; destination range numSamples floats; if each range ends above the start of
  ; the other, fall back to the scalar loop.
195  %scevgep = getelementptr float, ptr %pDst, i32 %numSamples
196  %0 = shl i32 %numSamples, 1
197  %scevgep18 = getelementptr float, ptr %pSrc, i32 %0
198  %bound0 = icmp ugt ptr %scevgep18, %pDst
199  %bound1 = icmp ugt ptr %scevgep, %pSrc
200  %found.conflict = and i1 %bound0, %bound1
201  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph
202
203vector.ph:                                        ; preds = %vector.memcheck
  ; %n.vec is numSamples rounded down to a multiple of 4. The %ind.end* values
  ; are the source pointer, destination pointer and remaining count that the
  ; scalar loop resumes from after the vector loop.
204  %n.vec = and i32 %numSamples, -4
205  %1 = shl i32 %n.vec, 1
206  %ind.end = getelementptr float, ptr %pSrc, i32 %1
207  %ind.end21 = getelementptr float, ptr %pDst, i32 %n.vec
208  %ind.end23 = and i32 %numSamples, 3
209  br label %vector.body
210
211vector.body:                                      ; preds = %vector.body, %vector.ph
212  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
213  %2 = shl i32 %index, 1
214  %next.gep = getelementptr float, ptr %pSrc, i32 %2
215  %next.gep24 = getelementptr float, ptr %pDst, i32 %index
216  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  ; %3 and %5 are the same square of the wide load, computed twice; %4 takes the
  ; even lanes of the first and %6 the odd lanes of the second.
217  %3 = fmul fast <8 x float> %wide.vec, %wide.vec
218  %4 = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
219  %5 = fmul fast <8 x float> %wide.vec, %wide.vec
220  %6 = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
221  %7 = fadd fast <4 x float> %6, %4
222  store <4 x float> %7, ptr %next.gep24, align 4
223  %index.next = add i32 %index, 4
224  %8 = icmp eq i32 %index.next, %n.vec
225  br i1 %8, label %middle.block, label %vector.body
226
227middle.block:                                     ; preds = %vector.body
  ; Done if the vector loop covered every sample; otherwise run the scalar remainder.
228  %cmp.n = icmp eq i32 %n.vec, %numSamples
229  br i1 %cmp.n, label %while.end, label %while.body.preheader26
230
231while.body.preheader26:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
232  %pSrc.addr.014.ph = phi ptr [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
233  %pDst.addr.013.ph = phi ptr [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
234  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
235  br label %while.body
236
237while.body:                                       ; preds = %while.body.preheader26, %while.body
  ; Scalar loop: one pair per iteration. The source pointer advances by two
  ; floats, the destination by one, and the count is decremented until it is 0.
238  %pSrc.addr.014 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
239  %pDst.addr.013 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
240  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
241  %incdec.ptr = getelementptr inbounds float, ptr %pSrc.addr.014, i32 1
242  %9 = load float, ptr %pSrc.addr.014, align 4
243  %incdec.ptr1 = getelementptr inbounds float, ptr %pSrc.addr.014, i32 2
244  %10 = load float, ptr %incdec.ptr, align 4
245  %mul = fmul fast float %9, %9
246  %mul2 = fmul fast float %10, %10
247  %add = fadd fast float %mul2, %mul
248  %incdec.ptr3 = getelementptr inbounds float, ptr %pDst.addr.013, i32 1
249  store float %add, ptr %pDst.addr.013, align 4
250  %dec = add i32 %blkCnt.012, -1
251  %cmp.not = icmp eq i32 %dec, 0
252  br i1 %cmp.not, label %while.end, label %while.body
253
254while.end:                                        ; preds = %while.body, %middle.block, %entry
255  ret void
256}
257
; arm_cmplx_mag_squared_f16_cse: for each of %numSamples pairs of consecutive
; half values read from %pSrc, stores the sum of the two squares to %pDst.
; In this variant the vector body has a single fmul of the <16 x half> load
; whose result feeds both the even-lane and odd-lane shufflevectors. The CHECK
; lines expect a vld20.16/vld21.16 pair followed by vmul.f16 + vfma.f16.
258define void @arm_cmplx_mag_squared_f16_cse(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %numSamples) {
259; CHECK-LABEL: arm_cmplx_mag_squared_f16_cse:
260; CHECK:       @ %bb.0: @ %entry
261; CHECK-NEXT:    .save {r4, r5, r7, lr}
262; CHECK-NEXT:    push {r4, r5, r7, lr}
263; CHECK-NEXT:    cmp r2, #0
264; CHECK-NEXT:    beq .LBB2_8
265; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
266; CHECK-NEXT:    cmp r2, #8
267; CHECK-NEXT:    blo .LBB2_9
268; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
269; CHECK-NEXT:    add.w r3, r0, r2, lsl #2
270; CHECK-NEXT:    cmp r3, r1
271; CHECK-NEXT:    itt hi
272; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #1
273; CHECK-NEXT:    cmphi r3, r0
274; CHECK-NEXT:    bhi .LBB2_9
275; CHECK-NEXT:  @ %bb.3: @ %vector.ph
276; CHECK-NEXT:    bic r4, r2, #7
277; CHECK-NEXT:    movs r5, #1
278; CHECK-NEXT:    sub.w r3, r4, #8
279; CHECK-NEXT:    add.w r12, r1, r4, lsl #1
280; CHECK-NEXT:    add.w lr, r5, r3, lsr #3
281; CHECK-NEXT:    add.w r3, r0, r4, lsl #2
282; CHECK-NEXT:    and r5, r2, #7
283; CHECK-NEXT:  .LBB2_4: @ %vector.body
284; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
285; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
286; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
287; CHECK-NEXT:    vmul.f16 q0, q0, q0
288; CHECK-NEXT:    vfma.f16 q0, q1, q1
289; CHECK-NEXT:    vstrb.8 q0, [r1], #16
290; CHECK-NEXT:    le lr, .LBB2_4
291; CHECK-NEXT:  @ %bb.5: @ %middle.block
292; CHECK-NEXT:    cmp r4, r2
293; CHECK-NEXT:    it eq
294; CHECK-NEXT:    popeq {r4, r5, r7, pc}
295; CHECK-NEXT:  .LBB2_6: @ %while.body.preheader26
296; CHECK-NEXT:    dls lr, r5
297; CHECK-NEXT:  .LBB2_7: @ %while.body
298; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
299; CHECK-NEXT:    vldr.16 s0, [r3]
300; CHECK-NEXT:    vldr.16 s2, [r3, #2]
301; CHECK-NEXT:    adds r3, #4
302; CHECK-NEXT:    vmul.f16 s0, s0, s0
303; CHECK-NEXT:    vfma.f16 s0, s2, s2
304; CHECK-NEXT:    vstr.16 s0, [r12]
305; CHECK-NEXT:    add.w r12, r12, #2
306; CHECK-NEXT:    le lr, .LBB2_7
307; CHECK-NEXT:  .LBB2_8: @ %while.end
308; CHECK-NEXT:    pop {r4, r5, r7, pc}
309; CHECK-NEXT:  .LBB2_9:
310; CHECK-NEXT:    mov r3, r0
311; CHECK-NEXT:    mov r12, r1
312; CHECK-NEXT:    mov r5, r2
313; CHECK-NEXT:    b .LBB2_6
314entry:
  ; Nothing to do when numSamples == 0.
315  %cmp.not11 = icmp eq i32 %numSamples, 0
316  br i1 %cmp.not11, label %while.end, label %while.body.preheader
317
318while.body.preheader:                             ; preds = %entry
  ; Fewer than 8 samples: skip the vector loop and go straight to the scalar loop.
319  %min.iters.check = icmp ult i32 %numSamples, 8
320  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck
321
322vector.memcheck:                                  ; preds = %while.body.preheader
  ; Runtime overlap check: the source range spans 2 * numSamples halves and the
  ; destination range numSamples halves; if each range ends above the start of
  ; the other, fall back to the scalar loop.
323  %scevgep = getelementptr half, ptr %pDst, i32 %numSamples
324  %0 = shl i32 %numSamples, 1
325  %scevgep18 = getelementptr half, ptr %pSrc, i32 %0
326  %bound0 = icmp ugt ptr %scevgep18, %pDst
327  %bound1 = icmp ugt ptr %scevgep, %pSrc
328  %found.conflict = and i1 %bound0, %bound1
329  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph
330
331vector.ph:                                        ; preds = %vector.memcheck
  ; %n.vec is numSamples rounded down to a multiple of 8. The %ind.end* values
  ; are the source pointer, destination pointer and remaining count that the
  ; scalar loop resumes from after the vector loop.
332  %n.vec = and i32 %numSamples, -8
333  %1 = shl i32 %n.vec, 1
334  %ind.end = getelementptr half, ptr %pSrc, i32 %1
335  %ind.end21 = getelementptr half, ptr %pDst, i32 %n.vec
336  %ind.end23 = and i32 %numSamples, 7
337  br label %vector.body
338
339vector.body:                                      ; preds = %vector.body, %vector.ph
340  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
341  %2 = shl i32 %index, 1
342  %next.gep = getelementptr half, ptr %pSrc, i32 %2
343  %next.gep24 = getelementptr half, ptr %pDst, i32 %index
344  %wide.vec = load <16 x half>, ptr %next.gep, align 2
  ; A single square of the wide load (%3) feeds both shuffles: %4 takes its
  ; even lanes and %5 its odd lanes.
345  %3 = fmul fast <16 x half> %wide.vec, %wide.vec
346  %4 = shufflevector <16 x half> %3, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
347  %5 = shufflevector <16 x half> %3, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
348  %6 = fadd fast <8 x half> %5, %4
349  store <8 x half> %6, ptr %next.gep24, align 2
350  %index.next = add i32 %index, 8
351  %7 = icmp eq i32 %index.next, %n.vec
352  br i1 %7, label %middle.block, label %vector.body
353
354middle.block:                                     ; preds = %vector.body
  ; Done if the vector loop covered every sample; otherwise run the scalar remainder.
355  %cmp.n = icmp eq i32 %n.vec, %numSamples
356  br i1 %cmp.n, label %while.end, label %while.body.preheader26
357
358while.body.preheader26:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
359  %pSrc.addr.014.ph = phi ptr [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
360  %pDst.addr.013.ph = phi ptr [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
361  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
362  br label %while.body
363
364while.body:                                       ; preds = %while.body, %while.body.preheader26
  ; Scalar loop: one pair per iteration. The source pointer advances by two
  ; halves, the destination by one, and the count is decremented until it is 0.
365  %pSrc.addr.014 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
366  %pDst.addr.013 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
367  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
368  %incdec.ptr = getelementptr inbounds half, ptr %pSrc.addr.014, i32 1
369  %8 = load half, ptr %pSrc.addr.014, align 2
370  %incdec.ptr1 = getelementptr inbounds half, ptr %pSrc.addr.014, i32 2
371  %9 = load half, ptr %incdec.ptr, align 2
372  %mul = fmul fast half %8, %8
373  %mul2 = fmul fast half %9, %9
374  %add = fadd fast half %mul2, %mul
375  %incdec.ptr3 = getelementptr inbounds half, ptr %pDst.addr.013, i32 1
376  store half %add, ptr %pDst.addr.013, align 2
377  %dec = add i32 %blkCnt.012, -1
378  %cmp.not = icmp eq i32 %dec, 0
379  br i1 %cmp.not, label %while.end, label %while.body
380
381while.end:                                        ; preds = %while.body, %middle.block, %entry
382  ret void
383}
384
; arm_cmplx_mag_squared_f32_cse: for each of %numSamples pairs of consecutive
; float values read from %pSrc, stores the sum of the two squares to %pDst.
; In this variant the vector body has a single fmul of the <8 x float> load
; whose result feeds both the even-lane and odd-lane shufflevectors. The CHECK
; lines expect a vld20.32/vld21.32 pair followed by vmul.f32 + vfma.f32.
385define void @arm_cmplx_mag_squared_f32_cse(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %numSamples) {
386; CHECK-LABEL: arm_cmplx_mag_squared_f32_cse:
387; CHECK:       @ %bb.0: @ %entry
388; CHECK-NEXT:    .save {r4, r5, r7, lr}
389; CHECK-NEXT:    push {r4, r5, r7, lr}
390; CHECK-NEXT:    cbz r2, .LBB3_8
391; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
392; CHECK-NEXT:    cmp r2, #4
393; CHECK-NEXT:    blo .LBB3_9
394; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
395; CHECK-NEXT:    add.w r3, r0, r2, lsl #3
396; CHECK-NEXT:    cmp r3, r1
397; CHECK-NEXT:    itt hi
398; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #2
399; CHECK-NEXT:    cmphi r3, r0
400; CHECK-NEXT:    bhi .LBB3_9
401; CHECK-NEXT:  @ %bb.3: @ %vector.ph
402; CHECK-NEXT:    bic r4, r2, #3
403; CHECK-NEXT:    movs r5, #1
404; CHECK-NEXT:    subs r3, r4, #4
405; CHECK-NEXT:    add.w r12, r1, r4, lsl #2
406; CHECK-NEXT:    add.w lr, r5, r3, lsr #2
407; CHECK-NEXT:    add.w r3, r0, r4, lsl #3
408; CHECK-NEXT:    and r5, r2, #3
409; CHECK-NEXT:  .LBB3_4: @ %vector.body
410; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
411; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
412; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
413; CHECK-NEXT:    vmul.f32 q0, q0, q0
414; CHECK-NEXT:    vfma.f32 q0, q1, q1
415; CHECK-NEXT:    vstrb.8 q0, [r1], #16
416; CHECK-NEXT:    le lr, .LBB3_4
417; CHECK-NEXT:  @ %bb.5: @ %middle.block
418; CHECK-NEXT:    cmp r4, r2
419; CHECK-NEXT:    it eq
420; CHECK-NEXT:    popeq {r4, r5, r7, pc}
421; CHECK-NEXT:  .LBB3_6: @ %while.body.preheader26
422; CHECK-NEXT:    dls lr, r5
423; CHECK-NEXT:  .LBB3_7: @ %while.body
424; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
425; CHECK-NEXT:    vldr s0, [r3]
426; CHECK-NEXT:    vldr s2, [r3, #4]
427; CHECK-NEXT:    adds r3, #8
428; CHECK-NEXT:    vmul.f32 s0, s0, s0
429; CHECK-NEXT:    vfma.f32 s0, s2, s2
430; CHECK-NEXT:    vstmia r12!, {s0}
431; CHECK-NEXT:    le lr, .LBB3_7
432; CHECK-NEXT:  .LBB3_8: @ %while.end
433; CHECK-NEXT:    pop {r4, r5, r7, pc}
434; CHECK-NEXT:  .LBB3_9:
435; CHECK-NEXT:    mov r3, r0
436; CHECK-NEXT:    mov r12, r1
437; CHECK-NEXT:    mov r5, r2
438; CHECK-NEXT:    b .LBB3_6
439entry:
  ; Nothing to do when numSamples == 0.
440  %cmp.not11 = icmp eq i32 %numSamples, 0
441  br i1 %cmp.not11, label %while.end, label %while.body.preheader
442
443while.body.preheader:                             ; preds = %entry
  ; Fewer than 4 samples: skip the vector loop and go straight to the scalar loop.
444  %min.iters.check = icmp ult i32 %numSamples, 4
445  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck
446
447vector.memcheck:                                  ; preds = %while.body.preheader
  ; Runtime overlap check: the source range spans 2 * numSamples floats and the
  ; destination range numSamples floats; if each range ends above the start of
  ; the other, fall back to the scalar loop.
448  %scevgep = getelementptr float, ptr %pDst, i32 %numSamples
449  %0 = shl i32 %numSamples, 1
450  %scevgep18 = getelementptr float, ptr %pSrc, i32 %0
451  %bound0 = icmp ugt ptr %scevgep18, %pDst
452  %bound1 = icmp ugt ptr %scevgep, %pSrc
453  %found.conflict = and i1 %bound0, %bound1
454  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph
455
456vector.ph:                                        ; preds = %vector.memcheck
  ; %n.vec is numSamples rounded down to a multiple of 4. The %ind.end* values
  ; are the source pointer, destination pointer and remaining count that the
  ; scalar loop resumes from after the vector loop.
457  %n.vec = and i32 %numSamples, -4
458  %1 = shl i32 %n.vec, 1
459  %ind.end = getelementptr float, ptr %pSrc, i32 %1
460  %ind.end21 = getelementptr float, ptr %pDst, i32 %n.vec
461  %ind.end23 = and i32 %numSamples, 3
462  br label %vector.body
463
464vector.body:                                      ; preds = %vector.body, %vector.ph
465  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
466  %2 = shl i32 %index, 1
467  %next.gep = getelementptr float, ptr %pSrc, i32 %2
468  %next.gep24 = getelementptr float, ptr %pDst, i32 %index
469  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  ; A single square of the wide load (%3) feeds both shuffles: %4 takes its
  ; even lanes and %5 its odd lanes.
470  %3 = fmul fast <8 x float> %wide.vec, %wide.vec
471  %4 = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
472  %5 = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
473  %6 = fadd fast <4 x float> %5, %4
474  store <4 x float> %6, ptr %next.gep24, align 4
475  %index.next = add i32 %index, 4
476  %7 = icmp eq i32 %index.next, %n.vec
477  br i1 %7, label %middle.block, label %vector.body
478
479middle.block:                                     ; preds = %vector.body
  ; Done if the vector loop covered every sample; otherwise run the scalar remainder.
480  %cmp.n = icmp eq i32 %n.vec, %numSamples
481  br i1 %cmp.n, label %while.end, label %while.body.preheader26
482
483while.body.preheader26:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
484  %pSrc.addr.014.ph = phi ptr [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
485  %pDst.addr.013.ph = phi ptr [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
486  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
487  br label %while.body
488
489while.body:                                       ; preds = %while.body, %while.body.preheader26
  ; Scalar loop: one pair per iteration. The source pointer advances by two
  ; floats, the destination by one, and the count is decremented until it is 0.
490  %pSrc.addr.014 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
491  %pDst.addr.013 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
492  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
493  %incdec.ptr = getelementptr inbounds float, ptr %pSrc.addr.014, i32 1
494  %8 = load float, ptr %pSrc.addr.014, align 4
495  %incdec.ptr1 = getelementptr inbounds float, ptr %pSrc.addr.014, i32 2
496  %9 = load float, ptr %incdec.ptr, align 4
497  %mul = fmul fast float %8, %8
498  %mul2 = fmul fast float %9, %9
499  %add = fadd fast float %mul2, %mul
500  %incdec.ptr3 = getelementptr inbounds float, ptr %pDst.addr.013, i32 1
501  store float %add, ptr %pDst.addr.013, align 4
502  %dec = add i32 %blkCnt.012, -1
503  %cmp.not = icmp eq i32 %dec, 0
504  br i1 %cmp.not, label %while.end, label %while.body
505
506while.end:                                        ; preds = %while.body, %middle.block, %entry
507  ret void
508}
509