xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll (revision b31fffbc7f1e0491bf599e82b7195e320d26e140)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst -enable-mem-access-versioning=false -tail-predication=force-enabled %s -o - | FileCheck %s
3
; mve_gather_qi_wb: reduction loop that multiplies a contiguous masked load
; from %A by a masked gather from %B and accumulates the products; the final
; sum is stored to %C[%n + 4]. %m and %l are unused.
; The gather indices are %vec.ind * 5 + 3, so each iteration the addresses
; advance by 20 i32 elements (80 bytes). The expected assembly below folds that
; stride into a vector-base gather with writeback ("[q1, #80]!") inside a
; dlstp.32/letp loop; the constant pool holds the first-iteration byte offsets
; (12, 32, 52, 72) pre-decremented by 80.
4define dso_local void @mve_gather_qi_wb(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %n, i32 %m, i32 %l) {
5; CHECK-LABEL: mve_gather_qi_wb:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r4, lr}
8; CHECK-NEXT:    push {r4, lr}
9; CHECK-NEXT:    add.w r4, r0, r3, lsl #2
10; CHECK-NEXT:    adr r0, .LCPI0_0
11; CHECK-NEXT:    vldrw.u32 q1, [r0]
12; CHECK-NEXT:    vmov.i32 q0, #0x0
13; CHECK-NEXT:    vadd.i32 q1, q1, r1
14; CHECK-NEXT:    adds r1, r3, #4
15; CHECK-NEXT:    dlstp.32 lr, r3
16; CHECK-NEXT:  .LBB0_1: @ %vector.body
17; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
18; CHECK-NEXT:    vldrw.u32 q2, [r4], #16
19; CHECK-NEXT:    vldrw.u32 q3, [q1, #80]!
20; CHECK-NEXT:    vmul.i32 q2, q3, q2
21; CHECK-NEXT:    vadd.i32 q0, q0, q2
22; CHECK-NEXT:    letp lr, .LBB0_1
23; CHECK-NEXT:  @ %bb.2: @ %middle.block
24; CHECK-NEXT:    vaddv.u32 r0, q0
25; CHECK-NEXT:    str.w r0, [r2, r1, lsl #2]
26; CHECK-NEXT:    pop {r4, pc}
27; CHECK-NEXT:    .p2align 4
28; CHECK-NEXT:  @ %bb.3:
29; CHECK-NEXT:  .LCPI0_0:
30; CHECK-NEXT:    .long 4294967228 @ 0xffffffbc
31; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
32; CHECK-NEXT:    .long 4294967268 @ 0xffffffe4
33; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
34entry:                                  ; function entry block (no predecessors)
  ; Address of the result slot: &%C[%n + 4].
35  %add.us.us = add i32 4, %n
36  %arrayidx.us.us = getelementptr inbounds i32, ptr %C, i32 %add.us.us
37  br label %vector.body
38vector.body:                                      ; preds = %vector.body, %entry
  ; %index is the scalar induction variable (step 4), %vec.phi the running
  ; vector accumulator, %vec.ind the per-lane induction vector (step 4).
39  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
40  %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %7, %vector.body ]
41  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %vec.ind.next, %vector.body ]
  ; Contiguous operand: four i32 values starting at %A[%index + %n].
42  %0 = add i32 %index, %n
43  %1 = getelementptr inbounds i32, ptr %A, i32 %0
  ; Lane i is active while %index + i < %n; the same mask predicates the load
  ; and the gather.
44  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  ; ptr-to-ptr bitcast: a no-op on the address.
45  %2 = bitcast ptr %1 to ptr
46  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  ; Gather addresses: &%B[%vec.ind * 5 + 3] per lane.
47  %3 = mul <4 x i32> %vec.ind, <i32 5, i32 5, i32 5, i32 5>
48  %4 = add <4 x i32> %3, <i32 3, i32 3, i32 3, i32 3>
49  %5 = getelementptr inbounds i32, ptr %B, <4 x i32> %4
50  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %5, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  ; Multiply-accumulate into the running sum.
51  %6 = mul nsw <4 x i32> %wide.masked.gather, %wide.masked.load
52  %7 = add <4 x i32> %vec.phi, %6
53  %index.next = add i32 %index, 4
54  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  ; Fixed exit bound: the loop leaves once %index.next reaches 5000.
55  %8 = icmp eq i32 %index.next, 5000
56  br i1 %8, label %middle.block, label %vector.body
57middle.block:                                     ; preds = %vector.body
  ; %9 and %exitcond81.not have no users: the reduction consumes %7 directly
  ; and the branch to %end is unconditional.
58  %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi
59  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
60  store i32 %10, ptr %arrayidx.us.us, align 4
61  %inc21.us.us = add nuw i32 4, 1
62  %exitcond81.not = icmp eq i32 %inc21.us.us, %n
63  br label %end
64end:                                 ; preds = %middle.block
65  ret void
66}
67
; mve_gatherscatter_offset: same multiply-accumulate reduction as a gather-only
; loop, but the product is also scattered back to the addresses it was gathered
; from (&%B[%vec.ind * 5 + 3]). The final sum is stored to %C[%n + 4].
; %m and %l are unused.
; The expected assembly below keeps the element offsets in a vector register
; (initially 3, 8, 13, 18 from the constant pool, incremented by 0x14 = 20 per
; iteration) and uses the scalar-base + vector-offset form
; "[r1, q2, uxtw #2]" for both the gather and the scatter, inside a
; dlstp.32/letp loop.
68define dso_local void @mve_gatherscatter_offset(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %n, i32 %m, i32 %l) {
69; CHECK-LABEL: mve_gatherscatter_offset:
70; CHECK:       @ %bb.0: @ %entry
71; CHECK-NEXT:    .save {r4, lr}
72; CHECK-NEXT:    push {r4, lr}
73; CHECK-NEXT:    .vsave {d8, d9}
74; CHECK-NEXT:    vpush {d8, d9}
75; CHECK-NEXT:    add.w r4, r0, r3, lsl #2
76; CHECK-NEXT:    adr r0, .LCPI1_0
77; CHECK-NEXT:    vldrw.u32 q2, [r0]
78; CHECK-NEXT:    add.w r12, r3, #4
79; CHECK-NEXT:    vmov.i32 q0, #0x0
80; CHECK-NEXT:    vmov.i32 q1, #0x14
81; CHECK-NEXT:    dlstp.32 lr, r3
82; CHECK-NEXT:  .LBB1_1: @ %vector.body
83; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
84; CHECK-NEXT:    vldrw.u32 q3, [r1, q2, uxtw #2]
85; CHECK-NEXT:    vldrw.u32 q4, [r4], #16
86; CHECK-NEXT:    vmul.i32 q3, q3, q4
87; CHECK-NEXT:    vstrw.32 q3, [r1, q2, uxtw #2]
88; CHECK-NEXT:    vadd.i32 q2, q2, q1
89; CHECK-NEXT:    vadd.i32 q0, q0, q3
90; CHECK-NEXT:    letp lr, .LBB1_1
91; CHECK-NEXT:  @ %bb.2: @ %middle.block
92; CHECK-NEXT:    vaddv.u32 r0, q0
93; CHECK-NEXT:    str.w r0, [r2, r12, lsl #2]
94; CHECK-NEXT:    vpop {d8, d9}
95; CHECK-NEXT:    pop {r4, pc}
96; CHECK-NEXT:    .p2align 4
97; CHECK-NEXT:  @ %bb.3:
98; CHECK-NEXT:  .LCPI1_0:
99; CHECK-NEXT:    .long 3 @ 0x3
100; CHECK-NEXT:    .long 8 @ 0x8
101; CHECK-NEXT:    .long 13 @ 0xd
102; CHECK-NEXT:    .long 18 @ 0x12
103entry:                                  ; function entry block (no predecessors)
  ; Address of the result slot: &%C[%n + 4].
104  %add.us.us = add i32 4, %n
105  %arrayidx.us.us = getelementptr inbounds i32, ptr %C, i32 %add.us.us
106  br label %vector.body
107vector.body:                                      ; preds = %vector.body, %entry
  ; %index is the scalar induction variable (step 4), %vec.phi the running
  ; vector accumulator, %vec.ind the per-lane induction vector (step 4).
108  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
109  %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %7, %vector.body ]
110  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %vec.ind.next, %vector.body ]
  ; Contiguous operand: four i32 values starting at %A[%index + %n].
111  %0 = add i32 %index, %n
112  %1 = getelementptr inbounds i32, ptr %A, i32 %0
  ; Lane i is active while %index + i < %n; the same mask predicates the load,
  ; the gather and the scatter.
113  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  ; ptr-to-ptr bitcast: a no-op on the address.
114  %2 = bitcast ptr %1 to ptr
115  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  ; Shared gather/scatter addresses: &%B[%vec.ind * 5 + 3] per lane.
116  %3 = mul <4 x i32> %vec.ind, <i32 5, i32 5, i32 5, i32 5>
117  %4 = add <4 x i32> %3, <i32 3, i32 3, i32 3, i32 3>
118  %5 = getelementptr inbounds i32, ptr %B, <4 x i32> %4
119  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %5, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
120  %6 = mul nsw <4 x i32> %wide.masked.gather, %wide.masked.load
  ; Write the product back to the same %B locations it was gathered from.
121  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %6, <4 x ptr> %5, i32 4, <4 x i1> %active.lane.mask)
122  %7 = add <4 x i32> %vec.phi, %6
123  %index.next = add i32 %index, 4
124  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  ; Fixed exit bound: the loop leaves once %index.next reaches 5000.
125  %8 = icmp eq i32 %index.next, 5000
126  br i1 %8, label %middle.block, label %vector.body
127middle.block:                                     ; preds = %vector.body
  ; %9 and %exitcond81.not have no users: the reduction consumes %7 directly
  ; and the branch to %end is unconditional.
128  %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi
129  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
130  store i32 %10, ptr %arrayidx.us.us, align 4
131  %inc21.us.us = add nuw i32 4, 1
132  %exitcond81.not = icmp eq i32 %inc21.us.us, %n
133  br label %end
134end:                                 ; preds = %middle.block
135  ret void
136}
137
; mve_scatter_qi: scatter-only variant. Each iteration loads four i32 values
; from %A, multiplies them by 3, scatters the products to
; &%B[%vec.ind * 5 + 3] and accumulates them; the final sum is stored to
; %C[%n + 4]. %m and %l are unused.
; Unlike the dlstp/letp loops expected for the gather tests in this file, the
; expected assembly here is an "le"-terminated loop with a constant trip count
; of 1250 (5000 / 4) and explicit vctp.32 + vpst predication of the load and of
; the vector-base writeback scatter ("[q1, #80]!").
138define dso_local void @mve_scatter_qi(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %n, i32 %m, i32 %l) {
139; CHECK-LABEL: mve_scatter_qi:
140; CHECK:       @ %bb.0: @ %entry
141; CHECK-NEXT:    .save {r4, lr}
142; CHECK-NEXT:    push {r4, lr}
143; CHECK-NEXT:    add.w r4, r0, r3, lsl #2
144; CHECK-NEXT:    adr r0, .LCPI2_0
145; CHECK-NEXT:    vldrw.u32 q1, [r0]
146; CHECK-NEXT:    add.w r12, r3, #4
147; CHECK-NEXT:    vmov.i32 q0, #0x0
148; CHECK-NEXT:    movw lr, #1250
149; CHECK-NEXT:    vadd.i32 q1, q1, r1
150; CHECK-NEXT:    movs r1, #3
151; CHECK-NEXT:  .LBB2_1: @ %vector.body
152; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
153; CHECK-NEXT:    vctp.32 r3
154; CHECK-NEXT:    subs r3, #4
155; CHECK-NEXT:    vpst
156; CHECK-NEXT:    vldrwt.u32 q2, [r4], #16
157; CHECK-NEXT:    vmul.i32 q3, q2, r1
158; CHECK-NEXT:    vmla.i32 q0, q2, r1
159; CHECK-NEXT:    vpst
160; CHECK-NEXT:    vstrwt.32 q3, [q1, #80]!
161; CHECK-NEXT:    le lr, .LBB2_1
162; CHECK-NEXT:  @ %bb.2: @ %middle.block
163; CHECK-NEXT:    vaddv.u32 r0, q0
164; CHECK-NEXT:    str.w r0, [r2, r12, lsl #2]
165; CHECK-NEXT:    pop {r4, pc}
166; CHECK-NEXT:    .p2align 4
167; CHECK-NEXT:  @ %bb.3:
168; CHECK-NEXT:  .LCPI2_0:
169; CHECK-NEXT:    .long 4294967228 @ 0xffffffbc
170; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
171; CHECK-NEXT:    .long 4294967268 @ 0xffffffe4
172; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
173entry:                                  ; function entry block (no predecessors)
  ; Address of the result slot: &%C[%n + 4].
174  %add.us.us = add i32 4, %n
175  %arrayidx.us.us = getelementptr inbounds i32, ptr %C, i32 %add.us.us
176  br label %vector.body
177vector.body:                                      ; preds = %vector.body, %entry
  ; %index is the scalar induction variable (step 4), %vec.phi the running
  ; vector accumulator, %vec.ind the per-lane induction vector (step 4).
178  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
179  %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %7, %vector.body ]
180  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %vec.ind.next, %vector.body ]
  ; Contiguous operand: four i32 values starting at %A[%index + %n].
181  %0 = add i32 %index, %n
182  %1 = getelementptr inbounds i32, ptr %A, i32 %0
  ; Lane i is active while %index + i < %n; the same mask predicates the load
  ; and the scatter.
183  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  ; ptr-to-ptr bitcast: a no-op on the address.
184  %2 = bitcast ptr %1 to ptr
185  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  ; Scatter addresses: &%B[%vec.ind * 5 + 3] per lane.
186  %3 = mul <4 x i32> %vec.ind, <i32 5, i32 5, i32 5, i32 5>
187  %4 = add <4 x i32> %3, <i32 3, i32 3, i32 3, i32 3>
188  %5 = getelementptr inbounds i32, ptr %B, <4 x i32> %4
  ; Value to store and accumulate: 3 * loaded element.
189  %6 = mul nsw <4 x i32> <i32 3, i32 3, i32 3, i32 3>, %wide.masked.load
190  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %6, <4 x ptr> %5, i32 4, <4 x i1> %active.lane.mask)
191  %7 = add <4 x i32> %vec.phi, %6
192  %index.next = add i32 %index, 4
193  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  ; Fixed exit bound: the loop leaves once %index.next reaches 5000.
194  %8 = icmp eq i32 %index.next, 5000
195  br i1 %8, label %middle.block, label %vector.body
196middle.block:                                     ; preds = %vector.body
  ; %9 and %exitcond81.not have no users: the reduction consumes %7 directly
  ; and the branch to %end is unconditional.
197  %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi
198  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
199  store i32 %10, ptr %arrayidx.us.us, align 4
200  %inc21.us.us = add nuw i32 4, 1
201  %exitcond81.not = icmp eq i32 %inc21.us.us, %n
202  br label %end
203end:                                 ; preds = %middle.block
204  ret void
205}
206
; justoffsets: byte gathers and scatters whose per-lane offsets are constants
; relative to scalar pointers that advance by 12 bytes per iteration.
; Each iteration gathers three byte vectors from %r at offsets {0,3,6,9},
; {1,4,7,10} and {2,5,8,11}, zero-extends them, forms three weighted sums
; (each plus 32768, then logically shifted right by 16 and truncated to i8) and
; scatters the results to %w at the same three offset sets.
; NOTE(review): the coefficients resemble a fixed-point colour-space
; conversion over 3-byte interleaved pixels - confirm against the original
; source of this test.
; The expected assembly below loads the three offset vectors once from the
; constant pool, uses "vldrb.u32/vstrb.32 [rN, qM]" offset addressing with
; "adds rN, #12" pointer bumps, and runs as a dlstp.32/letp loop on %N.
207define void @justoffsets(ptr noalias nocapture readonly %r, ptr noalias nocapture %w, i32 %N) {
208; CHECK-LABEL: justoffsets:
209; CHECK:       @ %bb.0: @ %entry
210; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
211; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
212; CHECK-NEXT:    .pad #4
213; CHECK-NEXT:    sub sp, #4
214; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
215; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
216; CHECK-NEXT:    cmp r2, #0
217; CHECK-NEXT:    beq .LBB3_3
218; CHECK-NEXT:  @ %bb.1: @ %vector.ph
219; CHECK-NEXT:    adr r5, .LCPI3_1
220; CHECK-NEXT:    adr r4, .LCPI3_0
221; CHECK-NEXT:    vldrw.u32 q0, [r5]
222; CHECK-NEXT:    adr r5, .LCPI3_2
223; CHECK-NEXT:    movw r9, #47888
224; CHECK-NEXT:    movw r10, #50417
225; CHECK-NEXT:    vldrw.u32 q1, [r5]
226; CHECK-NEXT:    vldrw.u32 q2, [r4]
227; CHECK-NEXT:    movw r4, #32769
228; CHECK-NEXT:    movw r12, #7471
229; CHECK-NEXT:    mov.w r3, #32768
230; CHECK-NEXT:    movw r11, #38470
231; CHECK-NEXT:    movw r8, #19595
232; CHECK-NEXT:    movt r9, #65535
233; CHECK-NEXT:    movt r10, #65535
234; CHECK-NEXT:    movw r7, #32767
235; CHECK-NEXT:    movt r4, #65535
236; CHECK-NEXT:    movw r5, #13282
237; CHECK-NEXT:    movw r6, #19485
238; CHECK-NEXT:    dlstp.32 lr, r2
239; CHECK-NEXT:  .LBB3_2: @ %vector.body
240; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
241; CHECK-NEXT:    vldrb.u32 q4, [r0, q0]
242; CHECK-NEXT:    vldrb.u32 q3, [r0, q1]
243; CHECK-NEXT:    vldrb.u32 q5, [r0, q2]
244; CHECK-NEXT:    adds r0, #12
245; CHECK-NEXT:    vmul.i32 q6, q4, r11
246; CHECK-NEXT:    vmla.i32 q6, q3, r8
247; CHECK-NEXT:    vmla.i32 q6, q5, r12
248; CHECK-NEXT:    vadd.i32 q6, q6, r3
249; CHECK-NEXT:    vshr.u32 q6, q6, #16
250; CHECK-NEXT:    vstrb.32 q6, [r1, q1]
251; CHECK-NEXT:    vmul.i32 q6, q4, r4
252; CHECK-NEXT:    vmul.i32 q4, q4, r10
253; CHECK-NEXT:    vmla.i32 q6, q3, r5
254; CHECK-NEXT:    vmla.i32 q4, q3, r7
255; CHECK-NEXT:    vmla.i32 q6, q5, r6
256; CHECK-NEXT:    vmla.i32 q4, q5, r9
257; CHECK-NEXT:    vadd.i32 q6, q6, r3
258; CHECK-NEXT:    vadd.i32 q3, q4, r3
259; CHECK-NEXT:    vshr.u32 q6, q6, #16
260; CHECK-NEXT:    vshr.u32 q3, q3, #16
261; CHECK-NEXT:    vstrb.32 q3, [r1, q0]
262; CHECK-NEXT:    vstrb.32 q6, [r1, q2]
263; CHECK-NEXT:    adds r1, #12
264; CHECK-NEXT:    letp lr, .LBB3_2
265; CHECK-NEXT:  .LBB3_3: @ %for.cond.cleanup
266; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
267; CHECK-NEXT:    add sp, #4
268; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
269; CHECK-NEXT:    .p2align 4
270; CHECK-NEXT:  @ %bb.4:
271; CHECK-NEXT:  .LCPI3_0:
272; CHECK-NEXT:    .long 2 @ 0x2
273; CHECK-NEXT:    .long 5 @ 0x5
274; CHECK-NEXT:    .long 8 @ 0x8
275; CHECK-NEXT:    .long 11 @ 0xb
276; CHECK-NEXT:  .LCPI3_1:
277; CHECK-NEXT:    .long 1 @ 0x1
278; CHECK-NEXT:    .long 4 @ 0x4
279; CHECK-NEXT:    .long 7 @ 0x7
280; CHECK-NEXT:    .long 10 @ 0xa
281; CHECK-NEXT:  .LCPI3_2:
282; CHECK-NEXT:    .long 0 @ 0x0
283; CHECK-NEXT:    .long 3 @ 0x3
284; CHECK-NEXT:    .long 6 @ 0x6
285; CHECK-NEXT:    .long 9 @ 0x9
286entry:
  ; Skip the loop entirely when %N is zero.
287  %cmp47.not = icmp eq i32 %N, 0
288  br i1 %cmp47.not, label %for.cond.cleanup, label %vector.ph
289
290vector.ph:                                        ; preds = %entry
  ; Round %N up to the next multiple of 4 (the vector width) for the exit test.
291  %n.rnd.up = add i32 %N, 3
292  %n.vec = and i32 %n.rnd.up, -4
293  br label %vector.body
294
295vector.body:                                      ; preds = %vector.body, %vector.ph
  ; Read pointer, write pointer and scalar induction variable.
296  %pointer.phi = phi ptr [ %r, %vector.ph ], [ %ptr.ind, %vector.body ]
297  %pointer.phi55 = phi ptr [ %w, %vector.ph ], [ %ptr.ind56, %vector.body ]
298  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Per-lane byte addresses at offsets 0, 3, 6, 9 from the read (%l1) and
  ; write (%l2) pointers; %l3/%l4 are %l1 + 1 and %l1 + 2.
299  %l1 = getelementptr i8, ptr %pointer.phi, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
300  %l2 = getelementptr i8, ptr %pointer.phi55, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
301  %l3 = getelementptr inbounds i8, <4 x ptr> %l1, i32 1
  ; Lane i is active while %index + i < %N; this mask predicates every gather
  ; and scatter in the loop.
302  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
303  %wide.masked.gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %l1, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef)
304  %l4 = getelementptr inbounds i8, <4 x ptr> %l1, i32 2
305  %wide.masked.gather57 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %l3, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef)
306  %wide.masked.gather58 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %l4, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef)
  ; First output: (19595*%l5 + 38470*%l7 + 7471*%l9 + 32768) >> 16, where
  ; %l5/%l7/%l9 are the zero-extended bytes from offsets +0/+1/+2.
307  %l5 = zext <4 x i8> %wide.masked.gather to <4 x i32>
308  %l6 = mul nuw nsw <4 x i32> %l5, <i32 19595, i32 19595, i32 19595, i32 19595>
309  %l7 = zext <4 x i8> %wide.masked.gather57 to <4 x i32>
310  %l8 = mul nuw nsw <4 x i32> %l7, <i32 38470, i32 38470, i32 38470, i32 38470>
311  %l9 = zext <4 x i8> %wide.masked.gather58 to <4 x i32>
312  %l10 = mul nuw nsw <4 x i32> %l9, <i32 7471, i32 7471, i32 7471, i32 7471>
313  %l11 = add nuw nsw <4 x i32> %l6, <i32 32768, i32 32768, i32 32768, i32 32768>
314  %l12 = add nuw nsw <4 x i32> %l11, %l8
315  %l13 = add nuw nsw <4 x i32> %l12, %l10
316  %l14 = lshr <4 x i32> %l13, <i32 16, i32 16, i32 16, i32 16>
317  %l15 = trunc <4 x i32> %l14 to <4 x i8>
  ; Second output: (32767*%l5 - 15119*%l7 - 17648*%l9 + 32768) >> 16.
318  %l16 = mul nuw nsw <4 x i32> %l5, <i32 32767, i32 32767, i32 32767, i32 32767>
319  %l17 = mul nsw <4 x i32> %l7, <i32 -15119, i32 -15119, i32 -15119, i32 -15119>
320  %l18 = mul nsw <4 x i32> %l9, <i32 -17648, i32 -17648, i32 -17648, i32 -17648>
321  %l19 = add nuw nsw <4 x i32> %l16, <i32 32768, i32 32768, i32 32768, i32 32768>
322  %l20 = add nsw <4 x i32> %l19, %l17
323  %l21 = add nsw <4 x i32> %l20, %l18
324  %l22 = lshr <4 x i32> %l21, <i32 16, i32 16, i32 16, i32 16>
325  %l23 = trunc <4 x i32> %l22 to <4 x i8>
  ; Third output: (13282*%l5 - 32767*%l7 + 19485*%l9 + 32768) >> 16.
326  %l24 = mul nuw nsw <4 x i32> %l5, <i32 13282, i32 13282, i32 13282, i32 13282>
327  %l25 = mul nsw <4 x i32> %l7, <i32 -32767, i32 -32767, i32 -32767, i32 -32767>
328  %l26 = mul nuw nsw <4 x i32> %l9, <i32 19485, i32 19485, i32 19485, i32 19485>
329  %l27 = add nuw nsw <4 x i32> %l24, <i32 32768, i32 32768, i32 32768, i32 32768>
330  %l28 = add nsw <4 x i32> %l27, %l25
331  %l29 = add nsw <4 x i32> %l28, %l26
332  %l30 = lshr <4 x i32> %l29, <i32 16, i32 16, i32 16, i32 16>
333  %l31 = trunc <4 x i32> %l30 to <4 x i8>
  ; Scatter the three outputs to the write pointer at offsets +0, +1 and +2
  ; from each lane's base address.
334  %l32 = getelementptr inbounds i8, <4 x ptr> %l2, i32 1
335  call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %l15, <4 x ptr> %l2, i32 1, <4 x i1> %active.lane.mask)
336  %l33 = getelementptr inbounds i8, <4 x ptr> %l2, i32 2
337  call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %l23, <4 x ptr> %l32, i32 1, <4 x i1> %active.lane.mask)
338  call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %l31, <4 x ptr> %l33, i32 1, <4 x i1> %active.lane.mask)
  ; Advance the index by 4 lanes and both pointers by 12 bytes (4 lanes times
  ; the 3-byte lane stride), then loop until the rounded-up bound.
339  %index.next = add i32 %index, 4
340  %l34 = icmp eq i32 %index.next, %n.vec
341  %ptr.ind = getelementptr i8, ptr %pointer.phi, i32 12
342  %ptr.ind56 = getelementptr i8, ptr %pointer.phi55, i32 12
343  br i1 %l34, label %for.cond.cleanup, label %vector.body
344
345for.cond.cleanup:                                 ; preds = %vector.body, %entry
346  ret void
347}
348
349declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
350declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
351declare <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i8>)
352declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
353declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
354declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
355declare void @llvm.masked.scatter.v4i8.v4p0(<4 x i8>, <4 x ptr>, i32, <4 x i1>)
356