xref: /llvm-project/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll (revision b31fffbc7f1e0491bf599e82b7195e320d26e140)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -tail-predication=enabled --verify-machineinstrs %s -o - | FileCheck %s
3
; mul_reduce_add: returns 0 when %N is 0. Otherwise it walks %a and %b four
; i32 lanes per iteration, multiplies the two masked loads lane-wise, adds the
; products into the vector accumulator %vec.phi, and returns the add-reduction
; of that accumulator.
; The lane mask %1 comes from @llvm.get.active.lane.mask(%index, %N); it
; predicates both loads and is read again in middle.block.
; The autogenerated assertions below expect a dls/le loop that keeps an
; explicit vctp.32 + vpstt inside the body and a vpsel after the loop, i.e.
; not the dlstp/letp form expected for the store loops later in this file.
; NOTE(review): presumably the predicate stays explicit because the mask is
; still needed after the loop - confirm against the low-overhead-loop pass.
4define dso_local i32 @mul_reduce_add(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
5; CHECK-LABEL: mul_reduce_add:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    cmp r2, #0
8; CHECK-NEXT:    itt eq
9; CHECK-NEXT:    moveq r0, #0
10; CHECK-NEXT:    bxeq lr
11; CHECK-NEXT:  .LBB0_1: @ %vector.ph
12; CHECK-NEXT:    push {r7, lr}
13; CHECK-NEXT:    adds r3, r2, #3
14; CHECK-NEXT:    vmov.i32 q1, #0x0
15; CHECK-NEXT:    bic r3, r3, #3
16; CHECK-NEXT:    sub.w r12, r3, #4
17; CHECK-NEXT:    movs r3, #1
18; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
19; CHECK-NEXT:    dls lr, r3
20; CHECK-NEXT:  .LBB0_2: @ %vector.body
21; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
22; CHECK-NEXT:    vctp.32 r2
23; CHECK-NEXT:    vmov q0, q1
24; CHECK-NEXT:    vpstt
25; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
26; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
27; CHECK-NEXT:    subs r2, #4
28; CHECK-NEXT:    vmul.i32 q1, q2, q1
29; CHECK-NEXT:    vadd.i32 q1, q1, q0
30; CHECK-NEXT:    le lr, .LBB0_2
31; CHECK-NEXT:  @ %bb.3: @ %middle.block
32; CHECK-NEXT:    vpsel q0, q1, q0
33; CHECK-NEXT:    vaddv.u32 r0, q0
34; CHECK-NEXT:    pop {r7, pc}
35entry:
  ; Skip the loop entirely for an empty input; for.cond.cleanup then yields 0.
36  %cmp8 = icmp eq i32 %N, 0
37  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
38
39vector.ph:                                        ; preds = %entry
  ; %n.vec = (%N + 3) & -4: %N rounded up to a multiple of the 4-lane width.
40  %n.rnd.up = add i32 %N, 3
41  %n.vec = and i32 %n.rnd.up, -4
42  br label %vector.body
43
44vector.body:                                      ; preds = %vector.body, %vector.ph
45  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Running per-lane sum, zero on entry to the loop.
46  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
47  %0 = getelementptr inbounds i32, ptr %a, i32 %index
  ; Mask for this iteration, computed from the element index and %N.
48  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  ; Same-type cast (ptr to ptr); it does not change the address.
49  %2 = bitcast ptr %0 to ptr
50  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x i32> undef)
51  %3 = getelementptr inbounds i32, ptr %b, i32 %index
52  %4 = bitcast ptr %3 to ptr
53  %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %4, i32 4, <4 x i1> %1, <4 x i32> undef)
54  %5 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
55  %6 = add nsw <4 x i32> %5, %vec.phi
56  %index.next = add i32 %index, 4
  ; Leave the loop once the index reaches the rounded-up element count.
57  %7 = icmp eq i32 %index.next, %n.vec
58  br i1 %7, label %middle.block, label %vector.body
59
60middle.block:                                     ; preds = %vector.body
  ; Per lane: take the updated sum %6 where the final mask %1 is set,
  ; otherwise keep the previous sum %vec.phi; then reduce across lanes.
61  %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi
62  %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8)
63  br label %for.cond.cleanup
64
65for.cond.cleanup:                                 ; preds = %middle.block, %entry
66  %res.0.lcssa = phi i32 [ 0, %entry ], [ %9, %middle.block ]
67  ret i32 %res.0.lcssa
68}
69
; mul_reduce_add_const: returns 0 when %N is 0. Otherwise it sums the i32
; elements of %a, four masked lanes per iteration, into %vec.phi and returns
; the add-reduction of that accumulator.
; The autogenerated assertions below expect a dls/le loop with an explicit
; vctp.32 + vpst around the single load and a vpsel after the loop.
; NOTE(review): despite the name, the body shown here contains no multiply and
; never reads %b; its instructions are the same as @add_reduce_add_const.
; Presumably a splat/multiply by %b was dropped at some point - confirm
; whether this duplication is intended.
70define dso_local i32 @mul_reduce_add_const(ptr noalias nocapture readonly %a, i32 %b, i32 %N) {
71; CHECK-LABEL: mul_reduce_add_const:
72; CHECK:       @ %bb.0: @ %entry
73; CHECK-NEXT:    cmp r2, #0
74; CHECK-NEXT:    itt eq
75; CHECK-NEXT:    moveq r0, #0
76; CHECK-NEXT:    bxeq lr
77; CHECK-NEXT:  .LBB1_1: @ %vector.ph
78; CHECK-NEXT:    push {r7, lr}
79; CHECK-NEXT:    adds r1, r2, #3
80; CHECK-NEXT:    movs r3, #1
81; CHECK-NEXT:    bic r1, r1, #3
82; CHECK-NEXT:    vmov.i32 q0, #0x0
83; CHECK-NEXT:    subs r1, #4
84; CHECK-NEXT:    add.w r1, r3, r1, lsr #2
85; CHECK-NEXT:    dls lr, r1
86; CHECK-NEXT:  .LBB1_2: @ %vector.body
87; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
88; CHECK-NEXT:    vctp.32 r2
89; CHECK-NEXT:    vmov q1, q0
90; CHECK-NEXT:    vpst
91; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
92; CHECK-NEXT:    subs r2, #4
93; CHECK-NEXT:    vadd.i32 q0, q0, q1
94; CHECK-NEXT:    le lr, .LBB1_2
95; CHECK-NEXT:  @ %bb.3: @ %middle.block
96; CHECK-NEXT:    vpsel q0, q0, q1
97; CHECK-NEXT:    vaddv.u32 r0, q0
98; CHECK-NEXT:    pop {r7, pc}
99entry:
  ; Skip the loop entirely for an empty input; for.cond.cleanup then yields 0.
100  %cmp6 = icmp eq i32 %N, 0
101  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
102
103vector.ph:                                        ; preds = %entry
  ; %n.vec = (%N + 3) & -4: %N rounded up to a multiple of the 4-lane width.
104  %n.rnd.up = add i32 %N, 3
105  %n.vec = and i32 %n.rnd.up, -4
106  br label %vector.body
107
108vector.body:                                      ; preds = %vector.body, %vector.ph
109  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Running per-lane sum, zero on entry to the loop.
110  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
111  %0 = getelementptr inbounds i32, ptr %a, i32 %index
  ; Mask for this iteration, computed from the element index and %N.
112  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  ; Same-type cast (ptr to ptr); it does not change the address.
113  %2 = bitcast ptr %0 to ptr
114  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x i32> undef)
115  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
116  %index.next = add i32 %index, 4
  ; Leave the loop once the index reaches the rounded-up element count.
117  %4 = icmp eq i32 %index.next, %n.vec
118  br i1 %4, label %middle.block, label %vector.body
119
120middle.block:                                     ; preds = %vector.body
  ; Per lane: take the updated sum %3 where the final mask %1 is set,
  ; otherwise keep the previous sum %vec.phi; then reduce across lanes.
121  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
122  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
123  br label %for.cond.cleanup
124
125for.cond.cleanup:                                 ; preds = %middle.block, %entry
126  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
127  ret i32 %res.0.lcssa
128}
129
; add_reduce_add_const: returns 0 when %N is 0. Otherwise it sums the i32
; elements of %a, four masked lanes per iteration, into %vec.phi and returns
; the add-reduction of that accumulator.
; The autogenerated assertions below expect a dls/le loop with an explicit
; vctp.32 + vpst around the single load and a vpsel after the loop.
; NOTE(review): the %b parameter is not read anywhere in the body shown here -
; confirm whether a splat of %b was meant to take part in the add.
130define dso_local i32 @add_reduce_add_const(ptr noalias nocapture readonly %a, i32 %b, i32 %N) {
131; CHECK-LABEL: add_reduce_add_const:
132; CHECK:       @ %bb.0: @ %entry
133; CHECK-NEXT:    cmp r2, #0
134; CHECK-NEXT:    itt eq
135; CHECK-NEXT:    moveq r0, #0
136; CHECK-NEXT:    bxeq lr
137; CHECK-NEXT:  .LBB2_1: @ %vector.ph
138; CHECK-NEXT:    push {r7, lr}
139; CHECK-NEXT:    adds r1, r2, #3
140; CHECK-NEXT:    movs r3, #1
141; CHECK-NEXT:    bic r1, r1, #3
142; CHECK-NEXT:    vmov.i32 q0, #0x0
143; CHECK-NEXT:    subs r1, #4
144; CHECK-NEXT:    add.w r1, r3, r1, lsr #2
145; CHECK-NEXT:    dls lr, r1
146; CHECK-NEXT:  .LBB2_2: @ %vector.body
147; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
148; CHECK-NEXT:    vctp.32 r2
149; CHECK-NEXT:    vmov q1, q0
150; CHECK-NEXT:    vpst
151; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
152; CHECK-NEXT:    subs r2, #4
153; CHECK-NEXT:    vadd.i32 q0, q0, q1
154; CHECK-NEXT:    le lr, .LBB2_2
155; CHECK-NEXT:  @ %bb.3: @ %middle.block
156; CHECK-NEXT:    vpsel q0, q0, q1
157; CHECK-NEXT:    vaddv.u32 r0, q0
158; CHECK-NEXT:    pop {r7, pc}
159entry:
  ; Skip the loop entirely for an empty input; for.cond.cleanup then yields 0.
160  %cmp6 = icmp eq i32 %N, 0
161  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
162
163vector.ph:                                        ; preds = %entry
  ; %n.vec = (%N + 3) & -4: %N rounded up to a multiple of the 4-lane width.
164  %n.rnd.up = add i32 %N, 3
165  %n.vec = and i32 %n.rnd.up, -4
166  br label %vector.body
167
168vector.body:                                      ; preds = %vector.body, %vector.ph
169  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Running per-lane sum, zero on entry to the loop.
170  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
171  %0 = getelementptr inbounds i32, ptr %a, i32 %index
  ; Mask for this iteration, computed from the element index and %N.
172  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  ; Same-type cast (ptr to ptr); it does not change the address.
173  %2 = bitcast ptr %0 to ptr
174  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x i32> undef)
175  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
176  %index.next = add i32 %index, 4
  ; Leave the loop once the index reaches the rounded-up element count.
177  %4 = icmp eq i32 %index.next, %n.vec
178  br i1 %4, label %middle.block, label %vector.body
179
180middle.block:                                     ; preds = %vector.body
  ; Per lane: take the updated sum %3 where the final mask %1 is set,
  ; otherwise keep the previous sum %vec.phi; then reduce across lanes.
181  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
182  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
183  br label %for.cond.cleanup
184
185for.cond.cleanup:                                 ; preds = %middle.block, %entry
186  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
187  ret i32 %res.0.lcssa
188}
189
; vector_mul_const: does nothing when %N is 0. Otherwise, four i32 lanes per
; iteration, it loads from %b under the active-lane mask, multiplies by a
; splat of %c, and stores the product to the same index of %a under the same
; mask.
; The autogenerated assertions below expect the dlstp.32/letp loop form with
; no vctp/vpst in the body, and r2 used directly as the vmul operand.
190define dso_local void @vector_mul_const(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %c, i32 %N) {
191; CHECK-LABEL: vector_mul_const:
192; CHECK:       @ %bb.0: @ %entry
193; CHECK-NEXT:    push {r7, lr}
194; CHECK-NEXT:    cmp r3, #0
195; CHECK-NEXT:    it eq
196; CHECK-NEXT:    popeq {r7, pc}
197; CHECK-NEXT:  .LBB3_1: @ %vector.ph
198; CHECK-NEXT:    dlstp.32 lr, r3
199; CHECK-NEXT:  .LBB3_2: @ %vector.body
200; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
201; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
202; CHECK-NEXT:    vmul.i32 q0, q0, r2
203; CHECK-NEXT:    vstrw.32 q0, [r0], #16
204; CHECK-NEXT:    letp lr, .LBB3_2
205; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
206; CHECK-NEXT:    pop {r7, pc}
207entry:
  ; Skip the loop entirely for an empty input.
208  %cmp6 = icmp eq i32 %N, 0
209  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
210
211vector.ph:                                        ; preds = %entry
  ; %n.vec = (%N + 3) & -4: %N rounded up to a multiple of the 4-lane width.
212  %n.rnd.up = add i32 %N, 3
213  %n.vec = and i32 %n.rnd.up, -4
  ; Insert %c into lane 0, then shuffle with an all-zero index vector so that
  ; every lane of %broadcast.splat11 holds %c.
214  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
215  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
216  br label %vector.body
217
218vector.body:                                      ; preds = %vector.body, %vector.ph
219  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
220  %0 = getelementptr inbounds i32, ptr %b, i32 %index
  ; Mask for this iteration, computed from the element index and %N; it
  ; predicates both the load and the store below.
221  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  ; Same-type cast (ptr to ptr); it does not change the address.
222  %2 = bitcast ptr %0 to ptr
223  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x i32> undef)
224  %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
225  %4 = getelementptr inbounds i32, ptr %a, i32 %index
226  %5 = bitcast ptr %4 to ptr
227  call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %5, i32 4, <4 x i1> %1)
228  %index.next = add i32 %index, 4
  ; Leave the loop once the index reaches the rounded-up element count.
229  %6 = icmp eq i32 %index.next, %n.vec
230  br i1 %6, label %for.cond.cleanup, label %vector.body
231
232for.cond.cleanup:                                 ; preds = %vector.body, %entry
233  ret void
234}
235
; vector_add_const: does nothing when %N is 0. Otherwise, four i32 lanes per
; iteration, it loads from %b under the active-lane mask, adds a splat of %c,
; and stores the sum to the same index of %a under the same mask.
; The autogenerated assertions below expect the dlstp.32/letp loop form with
; no vctp/vpst in the body, and r2 used directly as the vadd operand.
236define dso_local void @vector_add_const(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %c, i32 %N) {
237; CHECK-LABEL: vector_add_const:
238; CHECK:       @ %bb.0: @ %entry
239; CHECK-NEXT:    push {r7, lr}
240; CHECK-NEXT:    cmp r3, #0
241; CHECK-NEXT:    it eq
242; CHECK-NEXT:    popeq {r7, pc}
243; CHECK-NEXT:  .LBB4_1: @ %vector.ph
244; CHECK-NEXT:    dlstp.32 lr, r3
245; CHECK-NEXT:  .LBB4_2: @ %vector.body
246; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
247; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
248; CHECK-NEXT:    vadd.i32 q0, q0, r2
249; CHECK-NEXT:    vstrw.32 q0, [r0], #16
250; CHECK-NEXT:    letp lr, .LBB4_2
251; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
252; CHECK-NEXT:    pop {r7, pc}
253entry:
  ; Skip the loop entirely for an empty input.
254  %cmp6 = icmp eq i32 %N, 0
255  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
256
257vector.ph:                                        ; preds = %entry
  ; %n.vec = (%N + 3) & -4: %N rounded up to a multiple of the 4-lane width.
258  %n.rnd.up = add i32 %N, 3
259  %n.vec = and i32 %n.rnd.up, -4
  ; Insert %c into lane 0, then shuffle with an all-zero index vector so that
  ; every lane of %broadcast.splat11 holds %c.
260  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
261  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
262  br label %vector.body
263
264vector.body:                                      ; preds = %vector.body, %vector.ph
265  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
266  %0 = getelementptr inbounds i32, ptr %b, i32 %index
  ; Mask for this iteration, computed from the element index and %N; it
  ; predicates both the load and the store below.
267  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  ; Same-type cast (ptr to ptr); it does not change the address.
268  %2 = bitcast ptr %0 to ptr
269  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x i32> undef)
270  %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
271  %4 = getelementptr inbounds i32, ptr %a, i32 %index
272  %5 = bitcast ptr %4 to ptr
273  call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %5, i32 4, <4 x i1> %1)
274  %index.next = add i32 %index, 4
  ; Leave the loop once the index reaches the rounded-up element count.
275  %6 = icmp eq i32 %index.next, %n.vec
276  br i1 %6, label %for.cond.cleanup, label %vector.body
277
278for.cond.cleanup:                                 ; preds = %vector.body, %entry
279  ret void
280}
281
; vector_mul_vector_i8: does nothing when %N is 0. Otherwise, sixteen i8 lanes
; per iteration, it loads from %b and %c under the active-lane mask,
; multiplies them lane-wise, and stores the product to the same index of %a
; under the same mask. Declared with the arm_aapcs_vfpcc calling convention.
; The autogenerated assertions below expect the dlstp.8/letp loop form with no
; vctp/vpst in the body.
282define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %c, i32 %N) {
283; CHECK-LABEL: vector_mul_vector_i8:
284; CHECK:       @ %bb.0: @ %entry
285; CHECK-NEXT:    push {r7, lr}
286; CHECK-NEXT:    cmp r3, #0
287; CHECK-NEXT:    it eq
288; CHECK-NEXT:    popeq {r7, pc}
289; CHECK-NEXT:  .LBB5_1: @ %vector.ph
290; CHECK-NEXT:    dlstp.8 lr, r3
291; CHECK-NEXT:  .LBB5_2: @ %vector.body
292; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
293; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
294; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
295; CHECK-NEXT:    vmul.i8 q0, q1, q0
296; CHECK-NEXT:    vstrb.8 q0, [r0], #16
297; CHECK-NEXT:    letp lr, .LBB5_2
298; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
299; CHECK-NEXT:    pop {r7, pc}
300entry:
  ; Skip the loop entirely for an empty input.
301  %cmp10 = icmp eq i32 %N, 0
302  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
303
304vector.ph:                                        ; preds = %entry
  ; %n.vec = (%N + 15) & -16: %N rounded up to a multiple of the 16-lane width.
305  %n.rnd.up = add i32 %N, 15
306  %n.vec = and i32 %n.rnd.up, -16
307  br label %vector.body
308
309vector.body:                                      ; preds = %vector.body, %vector.ph
310  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
311  %0 = getelementptr inbounds i8, ptr %b, i32 %index
  ; Mask for this iteration, computed from the element index and %N; it
  ; predicates both loads and the store below.
312  %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
  ; Same-type cast (ptr to ptr); it does not change the address.
313  %2 = bitcast ptr %0 to ptr
314  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %2, i32 1, <16 x i1> %1, <16 x i8> undef)
315  %3 = getelementptr inbounds i8, ptr %c, i32 %index
316  %4 = bitcast ptr %3 to ptr
317  %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %4, i32 1, <16 x i1> %1, <16 x i8> undef)
318  %5 = mul <16 x i8> %wide.masked.load14, %wide.masked.load
319  %6 = getelementptr inbounds i8, ptr %a, i32 %index
320  %7 = bitcast ptr %6 to ptr
321  call void @llvm.masked.store.v16i8.p0(<16 x i8> %5, ptr %7, i32 1, <16 x i1> %1)
322  %index.next = add i32 %index, 16
  ; Leave the loop once the index reaches the rounded-up element count.
323  %8 = icmp eq i32 %index.next, %n.vec
324  br i1 %8, label %for.cond.cleanup, label %vector.body
325
326for.cond.cleanup:                                 ; preds = %vector.body, %entry
327  ret void
328}
329
330; Function Attrs: nofree norecurse nounwind
; vector_mul_vector_i16: does nothing when %N is 0. Otherwise, eight i16 lanes
; per iteration, it loads from %b and %c under the active-lane mask,
; multiplies them lane-wise, and stores the product to the same index of %a
; under the same mask. Declared with the arm_aapcs_vfpcc calling convention.
; The autogenerated assertions below expect the dlstp.16/letp loop form with
; no vctp/vpst in the body.
; NOTE(review): the definition references attribute group #0, but no
; `attributes #0` line is visible in this part of the file - confirm it is
; defined elsewhere or drop the reference.
331define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %c, i32 %N) local_unnamed_addr #0 {
332; CHECK-LABEL: vector_mul_vector_i16:
333; CHECK:       @ %bb.0: @ %entry
334; CHECK-NEXT:    push {r7, lr}
335; CHECK-NEXT:    cmp r3, #0
336; CHECK-NEXT:    it eq
337; CHECK-NEXT:    popeq {r7, pc}
338; CHECK-NEXT:  .LBB6_1: @ %vector.ph
339; CHECK-NEXT:    dlstp.16 lr, r3
340; CHECK-NEXT:  .LBB6_2: @ %vector.body
341; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
342; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
343; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
344; CHECK-NEXT:    vmul.i16 q0, q1, q0
345; CHECK-NEXT:    vstrh.16 q0, [r0], #16
346; CHECK-NEXT:    letp lr, .LBB6_2
347; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
348; CHECK-NEXT:    pop {r7, pc}
349entry:
  ; Skip the loop entirely for an empty input.
350  %cmp10 = icmp eq i32 %N, 0
351  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
352
353vector.ph:                                        ; preds = %entry
  ; %n.vec = (%N + 7) & -8: %N rounded up to a multiple of the 8-lane width.
354  %n.rnd.up = add i32 %N, 7
355  %n.vec = and i32 %n.rnd.up, -8
356  br label %vector.body
357
358vector.body:                                      ; preds = %vector.body, %vector.ph
359  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
360  %0 = getelementptr inbounds i16, ptr %b, i32 %index
  ; Mask for this iteration, computed from the element index and %N; it
  ; predicates both loads and the store below.
361  %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
  ; Same-type cast (ptr to ptr); it does not change the address.
362  %2 = bitcast ptr %0 to ptr
363  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %2, i32 2, <8 x i1> %1, <8 x i16> undef)
364  %3 = getelementptr inbounds i16, ptr %c, i32 %index
365  %4 = bitcast ptr %3 to ptr
366  %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %4, i32 2, <8 x i1> %1, <8 x i16> undef)
367  %5 = mul <8 x i16> %wide.masked.load14, %wide.masked.load
368  %6 = getelementptr inbounds i16, ptr %a, i32 %index
369  %7 = bitcast ptr %6 to ptr
370  call void @llvm.masked.store.v8i16.p0(<8 x i16> %5, ptr %7, i32 2, <8 x i1> %1)
371  %index.next = add i32 %index, 8
  ; Leave the loop once the index reaches the rounded-up element count.
372  %8 = icmp eq i32 %index.next, %n.vec
373  br i1 %8, label %for.cond.cleanup, label %vector.body
374
375for.cond.cleanup:                                 ; preds = %vector.body, %entry
376  ret void
377}
378
; Declarations of the intrinsics called by the functions in this file:
; masked loads and stores for the 16 x i8, 8 x i16 and 4 x i32 vector types,
; the 4 x i32 add-reduction, and the active-lane-mask generators for 4, 8 and
; 16 lanes.
379declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>)
380declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)
381declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
382declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>)
383declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>)
384declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
385declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
386declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
387declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
388declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
389
389