xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll (revision b5b663aac17415625340eb29c8010832bfc4c21c)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s
3
; thres_i32: in-place thresholding loop over <4 x i32> vectors read from %data.
;   %data - base pointer of the i32 array; it is both loaded and stored.
;   %N    - zero-extended i16. The loop is skipped when N == 0 and otherwise
;           runs until the element index (stepping by 4) equals N * 4.
;   %T    - threshold. Lanes with x < T or x > (0 - T) are overwritten with 0.
; The expected assembly places both compares in one VPT block (vpte/vcmpt),
; issues the store from that block's else slot (vstrwe), and closes the loop
; with an `le lr` instruction.
4define arm_aapcs_vfpcc void @thres_i32(ptr %data, i16 zeroext %N, i32 %T) {
5; CHECK-LABEL: thres_i32:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r7, lr}
8; CHECK-NEXT:    push {r7, lr}
9; CHECK-NEXT:    cmp r1, #0
10; CHECK-NEXT:    it eq
11; CHECK-NEXT:    popeq {r7, pc}
12; CHECK-NEXT:  .LBB0_1: @ %vector.ph
13; CHECK-NEXT:    mvn r3, #3
14; CHECK-NEXT:    add.w r1, r3, r1, lsl #2
15; CHECK-NEXT:    movs r3, #1
16; CHECK-NEXT:    vmov.i32 q0, #0x0
17; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
18; CHECK-NEXT:    rsbs r1, r2, #0
19; CHECK-NEXT:  .LBB0_2: @ %vector.body
20; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
21; CHECK-NEXT:    vldrw.u32 q1, [r0]
22; CHECK-NEXT:    vpte.s32 ge, q1, r2
23; CHECK-NEXT:    vcmpt.s32 le, q1, r1
24; CHECK-NEXT:    vstrwe.32 q0, [r0], #16
25; CHECK-NEXT:    le lr, .LBB0_2
26; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
27; CHECK-NEXT:    pop {r7, pc}
28entry:
  ; %mul = N * 4 is the value %index must reach for the loop to exit.
29  %conv = zext i16 %N to i32
30  %mul = shl nuw nsw i32 %conv, 2
31  %cmp15 = icmp eq i16 %N, 0
32  br i1 %cmp15, label %for.cond.cleanup, label %vector.ph
33
34vector.ph:                                        ; preds = %entry
  ; Broadcast T and (0 - T) to all four lanes.
35  %sub = sub nsw i32 0, %T
36  %broadcast.splatinsert17 = insertelement <4 x i32> undef, i32 %T, i32 0
37  %broadcast.splat18 = shufflevector <4 x i32> %broadcast.splatinsert17, <4 x i32> undef, <4 x i32> zeroinitializer
38  %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %sub, i32 0
39  %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
40  br label %vector.body
41
42vector.body:                                      ; preds = %vector.body, %vector.ph
43  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
44  %0 = getelementptr inbounds i32, ptr %data, i32 %index
45  %wide.load = load <4 x i32>, ptr %0, align 4
  ; Mask = (x < T) | (x > 0 - T); zero is stored only to the masked lanes.
46  %1 = icmp slt <4 x i32> %wide.load, %broadcast.splat18
47  %2 = icmp sgt <4 x i32> %wide.load, %broadcast.splat20
48  %3 = or <4 x i1> %1, %2
49  call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr %0, i32 4, <4 x i1> %3)
50  %index.next = add i32 %index, 4
51  %4 = icmp eq i32 %index.next, %mul
52  br i1 %4, label %for.cond.cleanup, label %vector.body
53
54for.cond.cleanup:                                 ; preds = %vector.body, %entry
55  ret void
56}
57
; thresh_i16: in-place thresholding loop over <8 x i16> vectors read from %data.
;   %data - base pointer of the i16 array; it is both loaded and stored.
;   %N    - zero-extended i16. The loop is skipped when N == 0 and otherwise
;           runs until the element index (stepping by 8) equals N * 8.
;   %T    - sign-extended i16 threshold. Lanes with x < T or x > (0 - T) are
;           overwritten with 0.
; The expected assembly uses one VPT block (vpte.s16/vcmpt.s16) with the
; store in its else slot (vstrhe.16).
58define arm_aapcs_vfpcc void @thresh_i16(ptr %data, i16 zeroext %N, i16 signext %T) {
59; CHECK-LABEL: thresh_i16:
60; CHECK:       @ %bb.0: @ %entry
61; CHECK-NEXT:    .save {r7, lr}
62; CHECK-NEXT:    push {r7, lr}
63; CHECK-NEXT:    cmp r1, #0
64; CHECK-NEXT:    it eq
65; CHECK-NEXT:    popeq {r7, pc}
66; CHECK-NEXT:  .LBB1_1: @ %vector.ph
67; CHECK-NEXT:    mvn r3, #7
68; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
69; CHECK-NEXT:    movs r3, #1
70; CHECK-NEXT:    vmov.i32 q0, #0x0
71; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
72; CHECK-NEXT:    rsbs r1, r2, #0
73; CHECK-NEXT:  .LBB1_2: @ %vector.body
74; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
75; CHECK-NEXT:    vldrh.u16 q1, [r0]
76; CHECK-NEXT:    vpte.s16 ge, q1, r2
77; CHECK-NEXT:    vcmpt.s16 le, q1, r1
78; CHECK-NEXT:    vstrhe.16 q0, [r0], #16
79; CHECK-NEXT:    le lr, .LBB1_2
80; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
81; CHECK-NEXT:    pop {r7, pc}
82entry:
  ; %mul = N * 8 is the value %index must reach for the loop to exit.
83  %conv2 = zext i16 %N to i32
84  %mul = shl nuw nsw i32 %conv2, 3
85  %cmp22 = icmp eq i16 %N, 0
86  br i1 %cmp22, label %for.cond.cleanup, label %vector.ph
87
88vector.ph:                                        ; preds = %entry
  ; Broadcast T and (0 - T) to all eight lanes.
89  %sub = sub i16 0, %T
90  %broadcast.splatinsert24 = insertelement <8 x i16> undef, i16 %T, i32 0
91  %broadcast.splat25 = shufflevector <8 x i16> %broadcast.splatinsert24, <8 x i16> undef, <8 x i32> zeroinitializer
92  %broadcast.splatinsert26 = insertelement <8 x i16> undef, i16 %sub, i32 0
93  %broadcast.splat27 = shufflevector <8 x i16> %broadcast.splatinsert26, <8 x i16> undef, <8 x i32> zeroinitializer
94  br label %vector.body
95
96vector.body:                                      ; preds = %vector.body, %vector.ph
97  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
98  %0 = getelementptr inbounds i16, ptr %data, i32 %index
99  %wide.load = load <8 x i16>, ptr %0, align 2
  ; Mask = (x < T) | (x > 0 - T); zero is stored only to the masked lanes.
100  %1 = icmp slt <8 x i16> %wide.load, %broadcast.splat25
101  %2 = icmp sgt <8 x i16> %wide.load, %broadcast.splat27
102  %3 = or <8 x i1> %1, %2
103  call void @llvm.masked.store.v8i16.p0(<8 x i16> zeroinitializer, ptr %0, i32 2, <8 x i1> %3)
104  %index.next = add i32 %index, 8
105  %4 = icmp eq i32 %index.next, %mul
106  br i1 %4, label %for.cond.cleanup, label %vector.body
107
108for.cond.cleanup:                                 ; preds = %vector.body, %entry
109  ret void
110}
111
; thresh_i8: in-place thresholding loop over <16 x i8> vectors read from %data.
;   %data - base pointer of the i8 array; it is both loaded and stored.
;   %N    - zero-extended i16. The loop is skipped when N == 0 and otherwise
;           runs until the element index (stepping by 16) equals N * 16.
;   %T    - sign-extended i8 threshold. Lanes with x < T or x > (0 - T) are
;           overwritten with 0.
; The expected assembly uses one VPT block (vpte.s8/vcmpt.s8) with the
; store in its else slot (vstrbe.8).
112define arm_aapcs_vfpcc void @thresh_i8(ptr %data, i16 zeroext %N, i8 signext %T) {
113; CHECK-LABEL: thresh_i8:
114; CHECK:       @ %bb.0: @ %entry
115; CHECK-NEXT:    .save {r7, lr}
116; CHECK-NEXT:    push {r7, lr}
117; CHECK-NEXT:    cmp r1, #0
118; CHECK-NEXT:    it eq
119; CHECK-NEXT:    popeq {r7, pc}
120; CHECK-NEXT:  .LBB2_1: @ %vector.ph
121; CHECK-NEXT:    mvn r3, #15
122; CHECK-NEXT:    add.w r1, r3, r1, lsl #4
123; CHECK-NEXT:    movs r3, #1
124; CHECK-NEXT:    vmov.i32 q0, #0x0
125; CHECK-NEXT:    add.w lr, r3, r1, lsr #4
126; CHECK-NEXT:    rsbs r1, r2, #0
127; CHECK-NEXT:  .LBB2_2: @ %vector.body
128; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
129; CHECK-NEXT:    vldrb.u8 q1, [r0]
130; CHECK-NEXT:    vpte.s8 ge, q1, r2
131; CHECK-NEXT:    vcmpt.s8 le, q1, r1
132; CHECK-NEXT:    vstrbe.8 q0, [r0], #16
133; CHECK-NEXT:    le lr, .LBB2_2
134; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
135; CHECK-NEXT:    pop {r7, pc}
136entry:
  ; %mul = N * 16 is the value %index must reach for the loop to exit.
137  %conv2 = zext i16 %N to i32
138  %mul = shl nuw nsw i32 %conv2, 4
139  %cmp20 = icmp eq i16 %N, 0
140  br i1 %cmp20, label %for.cond.cleanup, label %vector.ph
141
142vector.ph:                                        ; preds = %entry
  ; Broadcast T and (0 - T) to all sixteen lanes.
143  %sub = sub i8 0, %T
144  %broadcast.splatinsert22 = insertelement <16 x i8> undef, i8 %T, i32 0
145  %broadcast.splat23 = shufflevector <16 x i8> %broadcast.splatinsert22, <16 x i8> undef, <16 x i32> zeroinitializer
146  %broadcast.splatinsert24 = insertelement <16 x i8> undef, i8 %sub, i32 0
147  %broadcast.splat25 = shufflevector <16 x i8> %broadcast.splatinsert24, <16 x i8> undef, <16 x i32> zeroinitializer
148  br label %vector.body
149
150vector.body:                                      ; preds = %vector.body, %vector.ph
151  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
152  %0 = getelementptr inbounds i8, ptr %data, i32 %index
153  %wide.load = load <16 x i8>, ptr %0, align 1
  ; Mask = (x < T) | (x > 0 - T); zero is stored only to the masked lanes.
154  %1 = icmp slt <16 x i8> %wide.load, %broadcast.splat23
155  %2 = icmp sgt <16 x i8> %wide.load, %broadcast.splat25
156  %3 = or <16 x i1> %1, %2
157  call void @llvm.masked.store.v16i8.p0(<16 x i8> zeroinitializer, ptr %0, i32 1, <16 x i1> %3)
158  %index.next = add i32 %index, 16
159  %4 = icmp eq i32 %index.next, %mul
160  br i1 %4, label %for.cond.cleanup, label %vector.body
161
162for.cond.cleanup:                                 ; preds = %vector.body, %entry
163  ret void
164}
165
; thresh_f32: in-place thresholding loop over <4 x float> vectors read from
; %data.
;   %data - base pointer of the float array; it is both loaded and stored.
;   %N    - zero-extended i16. The loop is skipped when N == 0 and otherwise
;           runs until the element index (stepping by 4) equals N * 4.
;   %T    - float threshold. Lanes with x < T or x > -T (ordered compares
;           carrying the `fast` flag) are overwritten with 0.0.
; In the expected assembly T is moved from s0 into r1 and -T is formed in r2
; by flipping the sign bit (`eor ..., #-2147483648`). Both compares sit in one
; VPT block (vpte.f32/vcmpt.f32) with the store in its else slot.
166define arm_aapcs_vfpcc void @thresh_f32(ptr %data, i16 zeroext %N, float %T) {
167; CHECK-LABEL: thresh_f32:
168; CHECK:       @ %bb.0: @ %entry
169; CHECK-NEXT:    .save {r7, lr}
170; CHECK-NEXT:    push {r7, lr}
171; CHECK-NEXT:    cmp r1, #0
172; CHECK-NEXT:    it eq
173; CHECK-NEXT:    popeq {r7, pc}
174; CHECK-NEXT:  .LBB3_1: @ %vector.ph
175; CHECK-NEXT:    mvn r2, #3
176; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
177; CHECK-NEXT:    movs r2, #1
178; CHECK-NEXT:    add.w lr, r2, r1, lsr #2
179; CHECK-NEXT:    vmov r1, s0
180; CHECK-NEXT:    vmov.i32 q0, #0x0
181; CHECK-NEXT:    eor r2, r1, #-2147483648
182; CHECK-NEXT:  .LBB3_2: @ %vector.body
183; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
184; CHECK-NEXT:    vldrw.u32 q1, [r0]
185; CHECK-NEXT:    vpte.f32 ge, q1, r1
186; CHECK-NEXT:    vcmpt.f32 le, q1, r2
187; CHECK-NEXT:    vstrwe.32 q0, [r0], #16
188; CHECK-NEXT:    le lr, .LBB3_2
189; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
190; CHECK-NEXT:    pop {r7, pc}
191entry:
  ; %mul = N * 4 is the value %index must reach for the loop to exit.
192  %conv = zext i16 %N to i32
193  %mul = shl nuw nsw i32 %conv, 2
194  %cmp15 = icmp eq i16 %N, 0
195  br i1 %cmp15, label %for.cond.cleanup, label %vector.ph
196
197vector.ph:                                        ; preds = %entry
  ; Broadcast T and -T to all four lanes.
198  %fneg = fneg fast float %T
199  %broadcast.splatinsert17 = insertelement <4 x float> undef, float %T, i32 0
200  %broadcast.splat18 = shufflevector <4 x float> %broadcast.splatinsert17, <4 x float> undef, <4 x i32> zeroinitializer
201  %broadcast.splatinsert19 = insertelement <4 x float> undef, float %fneg, i32 0
202  %broadcast.splat20 = shufflevector <4 x float> %broadcast.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
203  br label %vector.body
204
205vector.body:                                      ; preds = %vector.body, %vector.ph
206  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
207  %0 = getelementptr inbounds float, ptr %data, i32 %index
208  %wide.load = load <4 x float>, ptr %0, align 4
  ; Mask = (x olt T) | (x ogt -T); zero is stored only to the masked lanes.
209  %1 = fcmp fast olt <4 x float> %wide.load, %broadcast.splat18
210  %2 = fcmp fast ogt <4 x float> %wide.load, %broadcast.splat20
211  %3 = or <4 x i1> %1, %2
212  call void @llvm.masked.store.v4f32.p0(<4 x float> zeroinitializer, ptr %0, i32 4, <4 x i1> %3)
213  %index.next = add i32 %index, 4
214  %4 = icmp eq i32 %index.next, %mul
215  br i1 %4, label %for.cond.cleanup, label %vector.body
216
217for.cond.cleanup:                                 ; preds = %vector.body, %entry
218  ret void
219}
220
; thresh_f16: in-place thresholding loop over <8 x half> vectors read from
; %data.
;   %data     - base pointer of the half array; it is both loaded and stored.
;   %N        - zero-extended i16. The loop is skipped when N == 0 and
;               otherwise runs until the element index (stepping by 8)
;               equals N * 8.
;   %T.coerce - threshold carried in a float argument; its low 16 bits are
;               reinterpreted as half (bitcast, trunc, bitcast) in the entry
;               block. Lanes with x < T or x > -T (ordered compares carrying
;               the `fast` flag) are overwritten with 0.0.
; In the expected assembly T is moved to r2, negated with `vneg.f16 s0, s0`,
; and the negated value moved to r1 with `vmov.f16`. Both compares sit in one
; VPT block (vpte.f16/vcmpt.f16) with the store in its else slot.
221define arm_aapcs_vfpcc void @thresh_f16(ptr %data, i16 zeroext %N, float %T.coerce) {
222; CHECK-LABEL: thresh_f16:
223; CHECK:       @ %bb.0: @ %entry
224; CHECK-NEXT:    .save {r7, lr}
225; CHECK-NEXT:    push {r7, lr}
226; CHECK-NEXT:    cmp r1, #0
227; CHECK-NEXT:    it eq
228; CHECK-NEXT:    popeq {r7, pc}
229; CHECK-NEXT:  .LBB4_1: @ %vector.ph
230; CHECK-NEXT:    mvn r3, #7
231; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
232; CHECK-NEXT:    vmov r2, s0
233; CHECK-NEXT:    vneg.f16 s0, s0
234; CHECK-NEXT:    movs r3, #1
235; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
236; CHECK-NEXT:    vmov.f16 r1, s0
237; CHECK-NEXT:    vmov.i32 q0, #0x0
238; CHECK-NEXT:  .LBB4_2: @ %vector.body
239; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
240; CHECK-NEXT:    vldrh.u16 q1, [r0]
241; CHECK-NEXT:    vpte.f16 ge, q1, r2
242; CHECK-NEXT:    vcmpt.f16 le, q1, r1
243; CHECK-NEXT:    vstrhe.16 q0, [r0], #16
244; CHECK-NEXT:    le lr, .LBB4_2
245; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
246; CHECK-NEXT:    pop {r7, pc}
247entry:
  ; Recover the half threshold from the low 16 bits of the float argument.
248  %0 = bitcast float %T.coerce to i32
249  %tmp.0.extract.trunc = trunc i32 %0 to i16
250  %1 = bitcast i16 %tmp.0.extract.trunc to half
  ; %mul = N * 8 is the value %index must reach for the loop to exit.
251  %conv = zext i16 %N to i32
252  %mul = shl nuw nsw i32 %conv, 3
253  %cmp17 = icmp eq i16 %N, 0
254  br i1 %cmp17, label %for.cond.cleanup, label %vector.ph
255
256vector.ph:                                        ; preds = %entry
  ; Broadcast T and -T to all eight lanes.
257  %fneg = fneg fast half %1
258  %broadcast.splatinsert19 = insertelement <8 x half> undef, half %1, i32 0
259  %broadcast.splat20 = shufflevector <8 x half> %broadcast.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer
260  %broadcast.splatinsert21 = insertelement <8 x half> undef, half %fneg, i32 0
261  %broadcast.splat22 = shufflevector <8 x half> %broadcast.splatinsert21, <8 x half> undef, <8 x i32> zeroinitializer
262  br label %vector.body
263
264vector.body:                                      ; preds = %vector.body, %vector.ph
265  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
266  %2 = getelementptr inbounds half, ptr %data, i32 %index
267  %wide.load = load <8 x half>, ptr %2, align 2
  ; Mask = (x olt T) | (x ogt -T); zero is stored only to the masked lanes.
268  %3 = fcmp fast olt <8 x half> %wide.load, %broadcast.splat20
269  %4 = fcmp fast ogt <8 x half> %wide.load, %broadcast.splat22
270  %5 = or <8 x i1> %3, %4
271  call void @llvm.masked.store.v8f16.p0(<8 x half> zeroinitializer, ptr %2, i32 2, <8 x i1> %5)
272  %index.next = add i32 %index, 8
273  %6 = icmp eq i32 %index.next, %mul
274  br i1 %6, label %for.cond.cleanup, label %vector.body
275
276for.cond.cleanup:                                 ; preds = %vector.body, %entry
277  ret void
278}
279
280
281
; thres_rev_i32: the <4 x i32> thresholding loop with both compares written
; with reversed operands (splat on the left: T sgt x, (0 - T) slt x). This is
; the same mask as x < T or x > (0 - T).
;   %data - base pointer of the i32 array; it is both loaded and stored.
;   %N    - zero-extended i16. The loop is skipped when N == 0 and otherwise
;           runs until the element index (stepping by 4) equals N * 4.
;   %T    - threshold.
; The expected loop body is the same vpte.s32/vcmpt.s32/vstrwe.32 sequence
; that @thres_i32 expects, with the vector still compared against r2 and r1.
282define arm_aapcs_vfpcc void @thres_rev_i32(ptr %data, i16 zeroext %N, i32 %T) {
283; CHECK-LABEL: thres_rev_i32:
284; CHECK:       @ %bb.0: @ %entry
285; CHECK-NEXT:    .save {r7, lr}
286; CHECK-NEXT:    push {r7, lr}
287; CHECK-NEXT:    cmp r1, #0
288; CHECK-NEXT:    it eq
289; CHECK-NEXT:    popeq {r7, pc}
290; CHECK-NEXT:  .LBB5_1: @ %vector.ph
291; CHECK-NEXT:    mvn r3, #3
292; CHECK-NEXT:    add.w r1, r3, r1, lsl #2
293; CHECK-NEXT:    movs r3, #1
294; CHECK-NEXT:    vmov.i32 q0, #0x0
295; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
296; CHECK-NEXT:    rsbs r1, r2, #0
297; CHECK-NEXT:  .LBB5_2: @ %vector.body
298; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
299; CHECK-NEXT:    vldrw.u32 q1, [r0]
300; CHECK-NEXT:    vpte.s32 ge, q1, r2
301; CHECK-NEXT:    vcmpt.s32 le, q1, r1
302; CHECK-NEXT:    vstrwe.32 q0, [r0], #16
303; CHECK-NEXT:    le lr, .LBB5_2
304; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
305; CHECK-NEXT:    pop {r7, pc}
306entry:
  ; %mul = N * 4 is the value %index must reach for the loop to exit.
307  %conv = zext i16 %N to i32
308  %mul = shl nuw nsw i32 %conv, 2
309  %cmp15 = icmp eq i16 %N, 0
310  br i1 %cmp15, label %for.cond.cleanup, label %vector.ph
311
312vector.ph:                                        ; preds = %entry
  ; Broadcast T and (0 - T) to all four lanes.
313  %sub = sub nsw i32 0, %T
314  %broadcast.splatinsert17 = insertelement <4 x i32> undef, i32 %T, i32 0
315  %broadcast.splat18 = shufflevector <4 x i32> %broadcast.splatinsert17, <4 x i32> undef, <4 x i32> zeroinitializer
316  %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %sub, i32 0
317  %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
318  br label %vector.body
319
320vector.body:                                      ; preds = %vector.body, %vector.ph
321  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
322  %0 = getelementptr inbounds i32, ptr %data, i32 %index
323  %wide.load = load <4 x i32>, ptr %0, align 4
  ; Mask = (T > x) | (0 - T < x), with the splat as the left-hand operand.
324  %1 = icmp sgt <4 x i32> %broadcast.splat18, %wide.load
325  %2 = icmp slt <4 x i32> %broadcast.splat20, %wide.load
326  %3 = or <4 x i1> %1, %2
327  call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr %0, i32 4, <4 x i1> %3)
328  %index.next = add i32 %index, 4
329  %4 = icmp eq i32 %index.next, %mul
330  br i1 %4, label %for.cond.cleanup, label %vector.body
331
332for.cond.cleanup:                                 ; preds = %vector.body, %entry
333  ret void
334}
335
; thresh_rev_i16: the <8 x i16> thresholding loop with both compares written
; with reversed operands (splat on the left: T sgt x, (0 - T) slt x). This is
; the same mask as x < T or x > (0 - T).
;   %data - base pointer of the i16 array; it is both loaded and stored.
;   %N    - zero-extended i16. The loop is skipped when N == 0 and otherwise
;           runs until the element index (stepping by 8) equals N * 8.
;   %T    - sign-extended i16 threshold.
; The expected loop body is the same vpte.s16/vcmpt.s16/vstrhe.16 sequence
; that @thresh_i16 expects.
336define arm_aapcs_vfpcc void @thresh_rev_i16(ptr %data, i16 zeroext %N, i16 signext %T) {
337; CHECK-LABEL: thresh_rev_i16:
338; CHECK:       @ %bb.0: @ %entry
339; CHECK-NEXT:    .save {r7, lr}
340; CHECK-NEXT:    push {r7, lr}
341; CHECK-NEXT:    cmp r1, #0
342; CHECK-NEXT:    it eq
343; CHECK-NEXT:    popeq {r7, pc}
344; CHECK-NEXT:  .LBB6_1: @ %vector.ph
345; CHECK-NEXT:    mvn r3, #7
346; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
347; CHECK-NEXT:    movs r3, #1
348; CHECK-NEXT:    vmov.i32 q0, #0x0
349; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
350; CHECK-NEXT:    rsbs r1, r2, #0
351; CHECK-NEXT:  .LBB6_2: @ %vector.body
352; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
353; CHECK-NEXT:    vldrh.u16 q1, [r0]
354; CHECK-NEXT:    vpte.s16 ge, q1, r2
355; CHECK-NEXT:    vcmpt.s16 le, q1, r1
356; CHECK-NEXT:    vstrhe.16 q0, [r0], #16
357; CHECK-NEXT:    le lr, .LBB6_2
358; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
359; CHECK-NEXT:    pop {r7, pc}
360entry:
  ; %mul = N * 8 is the value %index must reach for the loop to exit.
361  %conv2 = zext i16 %N to i32
362  %mul = shl nuw nsw i32 %conv2, 3
363  %cmp22 = icmp eq i16 %N, 0
364  br i1 %cmp22, label %for.cond.cleanup, label %vector.ph
365
366vector.ph:                                        ; preds = %entry
  ; Broadcast T and (0 - T) to all eight lanes.
367  %sub = sub i16 0, %T
368  %broadcast.splatinsert24 = insertelement <8 x i16> undef, i16 %T, i32 0
369  %broadcast.splat25 = shufflevector <8 x i16> %broadcast.splatinsert24, <8 x i16> undef, <8 x i32> zeroinitializer
370  %broadcast.splatinsert26 = insertelement <8 x i16> undef, i16 %sub, i32 0
371  %broadcast.splat27 = shufflevector <8 x i16> %broadcast.splatinsert26, <8 x i16> undef, <8 x i32> zeroinitializer
372  br label %vector.body
373
374vector.body:                                      ; preds = %vector.body, %vector.ph
375  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
376  %0 = getelementptr inbounds i16, ptr %data, i32 %index
377  %wide.load = load <8 x i16>, ptr %0, align 2
  ; Mask = (T > x) | (0 - T < x), with the splat as the left-hand operand.
378  %1 = icmp sgt <8 x i16> %broadcast.splat25, %wide.load
379  %2 = icmp slt <8 x i16> %broadcast.splat27, %wide.load
380  %3 = or <8 x i1> %1, %2
381  call void @llvm.masked.store.v8i16.p0(<8 x i16> zeroinitializer, ptr %0, i32 2, <8 x i1> %3)
382  %index.next = add i32 %index, 8
383  %4 = icmp eq i32 %index.next, %mul
384  br i1 %4, label %for.cond.cleanup, label %vector.body
385
386for.cond.cleanup:                                 ; preds = %vector.body, %entry
387  ret void
388}
389
; thresh_rev_i8: the <16 x i8> thresholding loop with both compares written
; with reversed operands (splat on the left: T sgt x, (0 - T) slt x). This is
; the same mask as x < T or x > (0 - T).
;   %data - base pointer of the i8 array; it is both loaded and stored.
;   %N    - zero-extended i16. The loop is skipped when N == 0 and otherwise
;           runs until the element index (stepping by 16) equals N * 16.
;   %T    - sign-extended i8 threshold.
; The expected loop body is the same vpte.s8/vcmpt.s8/vstrbe.8 sequence that
; @thresh_i8 expects.
390define arm_aapcs_vfpcc void @thresh_rev_i8(ptr %data, i16 zeroext %N, i8 signext %T) {
391; CHECK-LABEL: thresh_rev_i8:
392; CHECK:       @ %bb.0: @ %entry
393; CHECK-NEXT:    .save {r7, lr}
394; CHECK-NEXT:    push {r7, lr}
395; CHECK-NEXT:    cmp r1, #0
396; CHECK-NEXT:    it eq
397; CHECK-NEXT:    popeq {r7, pc}
398; CHECK-NEXT:  .LBB7_1: @ %vector.ph
399; CHECK-NEXT:    mvn r3, #15
400; CHECK-NEXT:    add.w r1, r3, r1, lsl #4
401; CHECK-NEXT:    movs r3, #1
402; CHECK-NEXT:    vmov.i32 q0, #0x0
403; CHECK-NEXT:    add.w lr, r3, r1, lsr #4
404; CHECK-NEXT:    rsbs r1, r2, #0
405; CHECK-NEXT:  .LBB7_2: @ %vector.body
406; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
407; CHECK-NEXT:    vldrb.u8 q1, [r0]
408; CHECK-NEXT:    vpte.s8 ge, q1, r2
409; CHECK-NEXT:    vcmpt.s8 le, q1, r1
410; CHECK-NEXT:    vstrbe.8 q0, [r0], #16
411; CHECK-NEXT:    le lr, .LBB7_2
412; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
413; CHECK-NEXT:    pop {r7, pc}
414entry:
  ; %mul = N * 16 is the value %index must reach for the loop to exit.
415  %conv2 = zext i16 %N to i32
416  %mul = shl nuw nsw i32 %conv2, 4
417  %cmp20 = icmp eq i16 %N, 0
418  br i1 %cmp20, label %for.cond.cleanup, label %vector.ph
419
420vector.ph:                                        ; preds = %entry
  ; Broadcast T and (0 - T) to all sixteen lanes.
421  %sub = sub i8 0, %T
422  %broadcast.splatinsert22 = insertelement <16 x i8> undef, i8 %T, i32 0
423  %broadcast.splat23 = shufflevector <16 x i8> %broadcast.splatinsert22, <16 x i8> undef, <16 x i32> zeroinitializer
424  %broadcast.splatinsert24 = insertelement <16 x i8> undef, i8 %sub, i32 0
425  %broadcast.splat25 = shufflevector <16 x i8> %broadcast.splatinsert24, <16 x i8> undef, <16 x i32> zeroinitializer
426  br label %vector.body
427
428vector.body:                                      ; preds = %vector.body, %vector.ph
429  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
430  %0 = getelementptr inbounds i8, ptr %data, i32 %index
431  %wide.load = load <16 x i8>, ptr %0, align 1
  ; Mask = (T > x) | (0 - T < x), with the splat as the left-hand operand.
432  %1 = icmp sgt <16 x i8> %broadcast.splat23, %wide.load
433  %2 = icmp slt <16 x i8> %broadcast.splat25, %wide.load
434  %3 = or <16 x i1> %1, %2
435  call void @llvm.masked.store.v16i8.p0(<16 x i8> zeroinitializer, ptr %0, i32 1, <16 x i1> %3)
436  %index.next = add i32 %index, 16
437  %4 = icmp eq i32 %index.next, %mul
438  br i1 %4, label %for.cond.cleanup, label %vector.body
439
440for.cond.cleanup:                                 ; preds = %vector.body, %entry
441  ret void
442}
443
; thresh_rev_f32: the <4 x float> thresholding loop with both compares written
; with reversed operands (splat on the left: T ogt x, -T olt x). This is the
; same mask as x < T or x > -T.
;   %data - base pointer of the float array; it is both loaded and stored.
;   %N    - zero-extended i16. The loop is skipped when N == 0 and otherwise
;           runs until the element index (stepping by 4) equals N * 4.
;   %T    - float threshold.
; The expected assembly is the same sequence that @thresh_f32 expects: -T is
; formed with `eor ..., #-2147483648`, and vpte.f32/vcmpt.f32 guard the store.
444define arm_aapcs_vfpcc void @thresh_rev_f32(ptr %data, i16 zeroext %N, float %T) {
445; CHECK-LABEL: thresh_rev_f32:
446; CHECK:       @ %bb.0: @ %entry
447; CHECK-NEXT:    .save {r7, lr}
448; CHECK-NEXT:    push {r7, lr}
449; CHECK-NEXT:    cmp r1, #0
450; CHECK-NEXT:    it eq
451; CHECK-NEXT:    popeq {r7, pc}
452; CHECK-NEXT:  .LBB8_1: @ %vector.ph
453; CHECK-NEXT:    mvn r2, #3
454; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
455; CHECK-NEXT:    movs r2, #1
456; CHECK-NEXT:    add.w lr, r2, r1, lsr #2
457; CHECK-NEXT:    vmov r1, s0
458; CHECK-NEXT:    vmov.i32 q0, #0x0
459; CHECK-NEXT:    eor r2, r1, #-2147483648
460; CHECK-NEXT:  .LBB8_2: @ %vector.body
461; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
462; CHECK-NEXT:    vldrw.u32 q1, [r0]
463; CHECK-NEXT:    vpte.f32 ge, q1, r1
464; CHECK-NEXT:    vcmpt.f32 le, q1, r2
465; CHECK-NEXT:    vstrwe.32 q0, [r0], #16
466; CHECK-NEXT:    le lr, .LBB8_2
467; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
468; CHECK-NEXT:    pop {r7, pc}
469entry:
  ; %mul = N * 4 is the value %index must reach for the loop to exit.
470  %conv = zext i16 %N to i32
471  %mul = shl nuw nsw i32 %conv, 2
472  %cmp15 = icmp eq i16 %N, 0
473  br i1 %cmp15, label %for.cond.cleanup, label %vector.ph
474
475vector.ph:                                        ; preds = %entry
  ; Broadcast T and -T to all four lanes.
476  %fneg = fneg fast float %T
477  %broadcast.splatinsert17 = insertelement <4 x float> undef, float %T, i32 0
478  %broadcast.splat18 = shufflevector <4 x float> %broadcast.splatinsert17, <4 x float> undef, <4 x i32> zeroinitializer
479  %broadcast.splatinsert19 = insertelement <4 x float> undef, float %fneg, i32 0
480  %broadcast.splat20 = shufflevector <4 x float> %broadcast.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
481  br label %vector.body
482
483vector.body:                                      ; preds = %vector.body, %vector.ph
484  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
485  %0 = getelementptr inbounds float, ptr %data, i32 %index
486  %wide.load = load <4 x float>, ptr %0, align 4
  ; Mask = (T ogt x) | (-T olt x), with the splat as the left-hand operand.
487  %1 = fcmp fast ogt <4 x float> %broadcast.splat18, %wide.load
488  %2 = fcmp fast olt <4 x float> %broadcast.splat20, %wide.load
489  %3 = or <4 x i1> %1, %2
490  call void @llvm.masked.store.v4f32.p0(<4 x float> zeroinitializer, ptr %0, i32 4, <4 x i1> %3)
491  %index.next = add i32 %index, 4
492  %4 = icmp eq i32 %index.next, %mul
493  br i1 %4, label %for.cond.cleanup, label %vector.body
494
495for.cond.cleanup:                                 ; preds = %vector.body, %entry
496  ret void
497}
498
; thresh_rev_f16: the <8 x half> thresholding loop with both compares written
; with reversed operands (splat on the left: T ogt x, -T olt x). This is the
; same mask as x < T or x > -T.
;   %data     - base pointer of the half array; it is both loaded and stored.
;   %N        - zero-extended i16. The loop is skipped when N == 0 and
;               otherwise runs until the element index (stepping by 8)
;               equals N * 8.
;   %T.coerce - threshold carried in a float argument; its low 16 bits are
;               reinterpreted as half in the entry block.
; The expected assembly is the same sequence that @thresh_f16 expects:
; `vneg.f16` forms -T, and vpte.f16/vcmpt.f16 guard the store.
499define arm_aapcs_vfpcc void @thresh_rev_f16(ptr %data, i16 zeroext %N, float %T.coerce) {
500; CHECK-LABEL: thresh_rev_f16:
501; CHECK:       @ %bb.0: @ %entry
502; CHECK-NEXT:    .save {r7, lr}
503; CHECK-NEXT:    push {r7, lr}
504; CHECK-NEXT:    cmp r1, #0
505; CHECK-NEXT:    it eq
506; CHECK-NEXT:    popeq {r7, pc}
507; CHECK-NEXT:  .LBB9_1: @ %vector.ph
508; CHECK-NEXT:    mvn r3, #7
509; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
510; CHECK-NEXT:    vmov r2, s0
511; CHECK-NEXT:    vneg.f16 s0, s0
512; CHECK-NEXT:    movs r3, #1
513; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
514; CHECK-NEXT:    vmov.f16 r1, s0
515; CHECK-NEXT:    vmov.i32 q0, #0x0
516; CHECK-NEXT:  .LBB9_2: @ %vector.body
517; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
518; CHECK-NEXT:    vldrh.u16 q1, [r0]
519; CHECK-NEXT:    vpte.f16 ge, q1, r2
520; CHECK-NEXT:    vcmpt.f16 le, q1, r1
521; CHECK-NEXT:    vstrhe.16 q0, [r0], #16
522; CHECK-NEXT:    le lr, .LBB9_2
523; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
524; CHECK-NEXT:    pop {r7, pc}
525entry:
  ; Recover the half threshold from the low 16 bits of the float argument.
526  %0 = bitcast float %T.coerce to i32
527  %tmp.0.extract.trunc = trunc i32 %0 to i16
528  %1 = bitcast i16 %tmp.0.extract.trunc to half
  ; %mul = N * 8 is the value %index must reach for the loop to exit.
529  %conv = zext i16 %N to i32
530  %mul = shl nuw nsw i32 %conv, 3
531  %cmp17 = icmp eq i16 %N, 0
532  br i1 %cmp17, label %for.cond.cleanup, label %vector.ph
533
534vector.ph:                                        ; preds = %entry
  ; Broadcast T and -T to all eight lanes.
535  %fneg = fneg fast half %1
536  %broadcast.splatinsert19 = insertelement <8 x half> undef, half %1, i32 0
537  %broadcast.splat20 = shufflevector <8 x half> %broadcast.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer
538  %broadcast.splatinsert21 = insertelement <8 x half> undef, half %fneg, i32 0
539  %broadcast.splat22 = shufflevector <8 x half> %broadcast.splatinsert21, <8 x half> undef, <8 x i32> zeroinitializer
540  br label %vector.body
541
542vector.body:                                      ; preds = %vector.body, %vector.ph
543  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
544  %2 = getelementptr inbounds half, ptr %data, i32 %index
545  %wide.load = load <8 x half>, ptr %2, align 2
  ; Mask = (T ogt x) | (-T olt x), with the splat as the left-hand operand.
546  %3 = fcmp fast ogt <8 x half> %broadcast.splat20, %wide.load
547  %4 = fcmp fast olt <8 x half> %broadcast.splat22, %wide.load
548  %5 = or <8 x i1> %3, %4
549  call void @llvm.masked.store.v8f16.p0(<8 x half> zeroinitializer, ptr %2, i32 2, <8 x i1> %5)
550  %index.next = add i32 %index, 8
551  %6 = icmp eq i32 %index.next, %mul
552  br i1 %6, label %for.cond.cleanup, label %vector.body
553
554for.cond.cleanup:                                 ; preds = %vector.body, %entry
555  ret void
556}
557
558
559
560
; Masked-store intrinsics called by the functions in this file, one per vector
; type. Operands in order: the vector value to store, the destination pointer,
; the alignment as an immediate, and the per-lane <N x i1> mask.
561declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
562declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>)
563declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>)
564declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>)
565declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32 immarg, <8 x i1>)
566