xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll (revision 7b3bbd83c0c24087072ec5b22a76799ab31f87d5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s
3
4
; Scatter of <4 x i32> %data through %dst indexed by (%offs + 4), using a
; constant mask that enables only lanes 0 and 2.
; The autogenerated assertions below expect the constant add to remain a
; separate vadd.i32, the mask to be materialised as the immediate 3855 (0x0F0F)
; and moved into p0, and the store to be one predicated vstrwt.32 whose index
; is scaled by 4 (uxtw #2).
5define arm_aapcs_vfpcc void @scatter_inc_minipred_4i32(<4 x i32> %data, ptr %dst, <4 x i32> %offs) {
6; CHECK-LABEL: scatter_inc_minipred_4i32:
7; CHECK:       @ %bb.0:
8; CHECK-NEXT:    movs r1, #4
9; CHECK-NEXT:    movw r2, #3855
10; CHECK-NEXT:    vadd.i32 q1, q1, r1
11; CHECK-NEXT:    vmsr p0, r2
12; CHECK-NEXT:    vpst
13; CHECK-NEXT:    vstrwt.32 q0, [r0, q1, uxtw #2]
14; CHECK-NEXT:    bx lr
15  %1 = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4> ; bump every element index by 4
16  %2 = getelementptr inbounds i32, ptr %dst, <4 x i32> %1 ; one i32 address per lane
17  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %2, i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>) ; only lanes 0 and 2 are written
18  ret void
19}
20
; Scatter of <8 x i16> %data through %dst indexed by (%offs + 8), where the
; offsets are 32-bit (<8 x i32>) and the mask is all-true.
; The autogenerated assertions below expect no vector scatter instruction:
; each half of the offsets is shifted left by 1, added to the base in r0 and to
; the constant 16, the resulting addresses are moved to core registers, and
; every lane is extracted with vmov.u16 and written with an individual strh.
21define arm_aapcs_vfpcc void @scatter_inc_mini_8i16(<8 x i16> %data, ptr %dst, <8 x i32> %offs) {
22; CHECK-LABEL: scatter_inc_mini_8i16:
23; CHECK:       @ %bb.0:
24; CHECK-NEXT:    .save {r4, r5, r6, lr}
25; CHECK-NEXT:    push {r4, r5, r6, lr}
26; CHECK-NEXT:    vshl.i32 q1, q1, #1
27; CHECK-NEXT:    mov.w r12, #16
28; CHECK-NEXT:    vadd.i32 q1, q1, r0
29; CHECK-NEXT:    vmov.u16 r6, q0[0]
30; CHECK-NEXT:    vadd.i32 q1, q1, r12
31; CHECK-NEXT:    vmov r2, r3, d2
32; CHECK-NEXT:    vmov r1, lr, d3
33; CHECK-NEXT:    vshl.i32 q1, q2, #1
34; CHECK-NEXT:    vadd.i32 q1, q1, r0
35; CHECK-NEXT:    vadd.i32 q1, q1, r12
36; CHECK-NEXT:    vmov r0, r12, d2
37; CHECK-NEXT:    vmov r4, r5, d3
38; CHECK-NEXT:    strh r6, [r2]
39; CHECK-NEXT:    vmov.u16 r2, q0[1]
40; CHECK-NEXT:    strh r2, [r3]
41; CHECK-NEXT:    vmov.u16 r2, q0[2]
42; CHECK-NEXT:    strh r2, [r1]
43; CHECK-NEXT:    vmov.u16 r1, q0[3]
44; CHECK-NEXT:    strh.w r1, [lr]
45; CHECK-NEXT:    vmov.u16 r1, q0[4]
46; CHECK-NEXT:    strh r1, [r0]
47; CHECK-NEXT:    vmov.u16 r0, q0[5]
48; CHECK-NEXT:    strh.w r0, [r12]
49; CHECK-NEXT:    vmov.u16 r0, q0[6]
50; CHECK-NEXT:    strh r0, [r4]
51; CHECK-NEXT:    vmov.u16 r0, q0[7]
52; CHECK-NEXT:    strh r0, [r5]
53; CHECK-NEXT:    pop {r4, r5, r6, pc}
54  %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> ; bump every element index by 8
55  %2 = getelementptr inbounds i16, ptr %dst, <8 x i32> %1 ; one i16 address per lane
56  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %data, <8 x ptr> %2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) ; all eight lanes are written
57  ret void
58}
59
; Scatter of <16 x i8> %data through %dst indexed by (%offs + 16), where the
; offsets are 32-bit (<16 x i32>) and the mask is all-true. Unlike the other
; scatters in this file, the alignment argument passed here is 2.
; The autogenerated assertions below expect no vector scatter instruction:
; three 4-lane offset groups arrive in q1-q3 and a fourth is loaded from the
; stack (sp + 32), each group has the base in r0 and the constant 16 added,
; the addresses are moved to core registers, and every lane is extracted with
; vmov.u8 and written with an individual strb.
60define arm_aapcs_vfpcc void @scatter_inc_mini_16i8(<16 x i8> %data, ptr %dst, <16 x i32> %offs) {
61; CHECK-LABEL: scatter_inc_mini_16i8:
62; CHECK:       @ %bb.0:
63; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
64; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
65; CHECK-NEXT:    .pad #4
66; CHECK-NEXT:    sub sp, #4
67; CHECK-NEXT:    movs r1, #16
68; CHECK-NEXT:    vadd.i32 q1, q1, r0
69; CHECK-NEXT:    vadd.i32 q1, q1, r1
70; CHECK-NEXT:    add.w r12, sp, #32
71; CHECK-NEXT:    vmov r2, r3, d2
72; CHECK-NEXT:    vadd.i32 q3, q3, r0
73; CHECK-NEXT:    vmov lr, r5, d3
74; CHECK-NEXT:    vadd.i32 q1, q2, r0
75; CHECK-NEXT:    vadd.i32 q2, q1, r1
76; CHECK-NEXT:    vldrw.u32 q1, [r12]
77; CHECK-NEXT:    vmov r4, r12, d4
78; CHECK-NEXT:    vmov.u8 r6, q0[0]
79; CHECK-NEXT:    vadd.i32 q1, q1, r0
80; CHECK-NEXT:    vmov r0, r8, d5
81; CHECK-NEXT:    vadd.i32 q3, q3, r1
82; CHECK-NEXT:    vadd.i32 q1, q1, r1
83; CHECK-NEXT:    vmov.u8 r1, q0[4]
84; CHECK-NEXT:    vmov.u8 r7, q0[6]
85; CHECK-NEXT:    strb r6, [r2]
86; CHECK-NEXT:    vmov.u8 r2, q0[1]
87; CHECK-NEXT:    strb r2, [r3]
88; CHECK-NEXT:    vmov.u8 r6, q0[2]
89; CHECK-NEXT:    vmov r2, r9, d6
90; CHECK-NEXT:    strb.w r6, [lr]
91; CHECK-NEXT:    vmov.u8 r6, q0[3]
92; CHECK-NEXT:    vmov.u8 r3, q0[8]
93; CHECK-NEXT:    strb r6, [r5]
94; CHECK-NEXT:    vmov r6, r5, d7
95; CHECK-NEXT:    strb r1, [r4]
96; CHECK-NEXT:    vmov.u8 r1, q0[5]
97; CHECK-NEXT:    strb.w r1, [r12]
98; CHECK-NEXT:    vmov r1, r4, d2
99; CHECK-NEXT:    strb r7, [r0]
100; CHECK-NEXT:    vmov.u8 r0, q0[7]
101; CHECK-NEXT:    strb.w r0, [r8]
102; CHECK-NEXT:    vmov r0, r7, d3
103; CHECK-NEXT:    strb r3, [r2]
104; CHECK-NEXT:    vmov.u8 r2, q0[9]
105; CHECK-NEXT:    strb.w r2, [r9]
106; CHECK-NEXT:    vmov.u8 r2, q0[10]
107; CHECK-NEXT:    strb r2, [r6]
108; CHECK-NEXT:    vmov.u8 r2, q0[11]
109; CHECK-NEXT:    strb r2, [r5]
110; CHECK-NEXT:    vmov.u8 r2, q0[12]
111; CHECK-NEXT:    strb r2, [r1]
112; CHECK-NEXT:    vmov.u8 r1, q0[13]
113; CHECK-NEXT:    strb r1, [r4]
114; CHECK-NEXT:    vmov.u8 r1, q0[14]
115; CHECK-NEXT:    strb r1, [r0]
116; CHECK-NEXT:    vmov.u8 r0, q0[15]
117; CHECK-NEXT:    strb r0, [r7]
118; CHECK-NEXT:    add sp, #4
119; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
120  %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> ; bump every element index by 16
121  %2 = getelementptr inbounds i8, ptr %dst, <16 x i32> %1 ; one i8 address per lane
122  call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %data, <16 x ptr> %2, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) ; all sixteen lanes are written
123  ret void
124}
125
; Loop nest performing three unmasked (all-true) <4 x i32> scatters per
; iteration. The indices are 3*i, 3*i + 1 and 3*i + 2, with i taken from the
; vector induction variable %vec.ind, which starts at <0,1,2,3> and advances
; by 4 each iteration.
; The autogenerated assertions below expect the inner loop to be a dls/le
; loop containing exactly three vstrw.32 stores that use vector-base
; addressing with a #48 writeback, one base vector per scatter (q5, q6, q7).
; The three base vectors are initialised from the constant pools .LCPI3_0,
; .LCPI3_1 and .LCPI3_2 plus the destination pointer in r0; one of them is
; spilled to the stack and reloaded on each pass of the outer loop.
126define arm_aapcs_vfpcc void @scatter_inc_v4i32_complex(<4 x i32> %data1, <4 x i32> %data2, <4 x i32> %data3, ptr %dst, i32 %n) {
127; CHECK-LABEL: scatter_inc_v4i32_complex:
128; CHECK:       @ %bb.0: @ %entry
129; CHECK-NEXT:    cmp r1, #1
130; CHECK-NEXT:    it lt
131; CHECK-NEXT:    bxlt lr
132; CHECK-NEXT:  .LBB3_1: @ %vector.ph.preheader
133; CHECK-NEXT:    .save {r4, lr}
134; CHECK-NEXT:    push {r4, lr}
135; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
136; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
137; CHECK-NEXT:    .pad #16
138; CHECK-NEXT:    sub sp, #16
139; CHECK-NEXT:    adr r4, .LCPI3_2
140; CHECK-NEXT:    bic r2, r1, #3
141; CHECK-NEXT:    vldrw.u32 q3, [r4]
142; CHECK-NEXT:    sub.w r12, r2, #4
143; CHECK-NEXT:    adr.w lr, .LCPI3_1
144; CHECK-NEXT:    movs r3, #1
145; CHECK-NEXT:    vadd.i32 q3, q3, r0
146; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
147; CHECK-NEXT:    vstrw.32 q3, [sp] @ 16-byte Spill
148; CHECK-NEXT:    vldrw.u32 q3, [lr]
149; CHECK-NEXT:    adr.w r12, .LCPI3_0
150; CHECK-NEXT:    vadd.i32 q4, q3, r0
151; CHECK-NEXT:    vldrw.u32 q3, [r12]
152; CHECK-NEXT:    vadd.i32 q3, q3, r0
153; CHECK-NEXT:  .LBB3_2: @ %vector.ph
154; CHECK-NEXT:    @ =>This Loop Header: Depth=1
155; CHECK-NEXT:    @ Child Loop BB3_3 Depth 2
156; CHECK-NEXT:    dls lr, r3
157; CHECK-NEXT:    vmov q6, q4
158; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
159; CHECK-NEXT:    vmov q5, q3
160; CHECK-NEXT:  .LBB3_3: @ %vector.body
161; CHECK-NEXT:    @ Parent Loop BB3_2 Depth=1
162; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
163; CHECK-NEXT:    vstrw.32 q0, [q5, #48]!
164; CHECK-NEXT:    vstrw.32 q1, [q6, #48]!
165; CHECK-NEXT:    vstrw.32 q2, [q7, #48]!
166; CHECK-NEXT:    le lr, .LBB3_3
167; CHECK-NEXT:  @ %bb.4: @ %middle.block
168; CHECK-NEXT:    @ in Loop: Header=BB3_2 Depth=1
169; CHECK-NEXT:    cmp r2, r1
170; CHECK-NEXT:    bne .LBB3_2
171; CHECK-NEXT:  @ %bb.5:
172; CHECK-NEXT:    add sp, #16
173; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
174; CHECK-NEXT:    pop.w {r4, lr}
175; CHECK-NEXT:    bx lr
176; CHECK-NEXT:    .p2align 4
177; CHECK-NEXT:  @ %bb.6:
178; CHECK-NEXT:  .LCPI3_0:
179; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
180; CHECK-NEXT:    .long 4294967260 @ 0xffffffdc
181; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
182; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
183; CHECK-NEXT:  .LCPI3_1:
184; CHECK-NEXT:    .long 4294967252 @ 0xffffffd4
185; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
186; CHECK-NEXT:    .long 4294967276 @ 0xffffffec
187; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
188; CHECK-NEXT:  .LCPI3_2:
189; CHECK-NEXT:    .long 4294967256 @ 0xffffffd8
190; CHECK-NEXT:    .long 4294967268 @ 0xffffffe4
191; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
192; CHECK-NEXT:    .long 4294967292 @ 0xfffffffc
193entry:
194  %cmp22 = icmp sgt i32 %n, 0 ; skip all work when n <= 0
195  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
196
197vector.ph:                                        ; preds = %middle.block, %entry
198  %n.vec = and i32 %n, -4 ; n rounded down to a multiple of 4
199  br label %vector.body
200
201vector.body:                                      ; preds = %vector.body, %vector.ph
202  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
203  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
204  %0 = mul nuw nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3> ; base element index 3*i
205  %1 = getelementptr inbounds i32, ptr %dst, <4 x i32> %0
206  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data1, <4 x ptr> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) ; %data1 -> dst[3*i]
207  %2 = add nuw nsw <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
208  %3 = getelementptr inbounds i32, ptr %dst, <4 x i32> %2
209  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data2, <4 x ptr> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) ; %data2 -> dst[3*i + 1]
210  %4 = add nuw nsw <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
211  %5 = getelementptr inbounds i32, ptr %dst, <4 x i32> %4
212  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data3, <4 x ptr> %5, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) ; %data3 -> dst[3*i + 2]
213  %index.next = add i32 %index, 4
214  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
215  %6 = icmp eq i32 %index.next, %n.vec ; leave the inner loop once n.vec elements are done
216  br i1 %6, label %middle.block, label %vector.body
217
218middle.block:                                     ; preds = %vector.body
219  %cmp.n = icmp eq i32 %n.vec, %n ; if n is not a multiple of 4, branch back to vector.ph
220  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
221
222for.cond.cleanup:                                 ; preds = %middle.block, %entry
223  ret void
224}
225
; Tail-predicated copy loop: each iteration does a masked load of four i32
; values from consecutive elements of %x and scatters them into %y at element
; indices (%vec.ind << 2). Both the load and the scatter are governed by
; @llvm.get.active.lane.mask(%index, %n).
; The autogenerated assertions below expect a dlstp.32/letp loop whose body
; is a post-incremented vldrw.u32 (#16) followed by one vstrw.32 that uses
; vector-base addressing with a #64 writeback; the initial base vector comes
; from constant pool .LCPI4_0 plus the %y pointer in r1.
226define void @shl(ptr nocapture readonly %x, ptr noalias nocapture %y, i32 %n) {
227; CHECK-LABEL: shl:
228; CHECK:       @ %bb.0: @ %entry
229; CHECK-NEXT:    .save {r7, lr}
230; CHECK-NEXT:    push {r7, lr}
231; CHECK-NEXT:    cmp r2, #1
232; CHECK-NEXT:    it lt
233; CHECK-NEXT:    poplt {r7, pc}
234; CHECK-NEXT:  .LBB4_1: @ %vector.ph
235; CHECK-NEXT:    adr r3, .LCPI4_0
236; CHECK-NEXT:    vldrw.u32 q0, [r3]
237; CHECK-NEXT:    vadd.i32 q0, q0, r1
238; CHECK-NEXT:    dlstp.32 lr, r2
239; CHECK-NEXT:  .LBB4_2: @ %vector.body
240; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
241; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
242; CHECK-NEXT:    vstrw.32 q1, [q0, #64]!
243; CHECK-NEXT:    letp lr, .LBB4_2
244; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
245; CHECK-NEXT:    pop {r7, pc}
246; CHECK-NEXT:    .p2align 4
247; CHECK-NEXT:  @ %bb.4:
248; CHECK-NEXT:  .LCPI4_0:
249; CHECK-NEXT:    .long 4294967232 @ 0xffffffc0
250; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
251; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
252; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
253entry:
254  %cmp6 = icmp sgt i32 %n, 0 ; skip all work when n <= 0
255  br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
256
257vector.ph:                                        ; preds = %entry
258  %n.rnd.up = add i32 %n, 3
259  %n.vec = and i32 %n.rnd.up, -4 ; n rounded up to a multiple of 4
260  br label %vector.body
261
262vector.body:                                      ; preds = %vector.body, %vector.ph
263  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
264  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
265  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) ; predicate shared by the load and the scatter
266  %0 = getelementptr inbounds i32, ptr %x, i32 %index
267  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %0, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
268  %1 = shl nsw <4 x i32> %vec.ind, <i32 2, i32 2, i32 2, i32 2> ; destination element index = i * 4
269  %2 = getelementptr inbounds i32, ptr %y, <4 x i32> %1
270  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %wide.masked.load, <4 x ptr> %2, i32 4, <4 x i1> %active.lane.mask)
271  %index.next = add i32 %index, 4
272  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
273  %3 = icmp eq i32 %index.next, %n.vec
274  br i1 %3, label %for.cond.cleanup, label %vector.body
275
276for.cond.cleanup:                                 ; preds = %vector.body, %entry
277  ret void
278}
279
; Tail-predicated loop with four scatters per iteration. Four i32 values are
; loaded from consecutive elements of %x under an active-lane mask; the values
; plus 1, 2, 3 and 4 are then scattered into %y at element indices
; (%vec.ind << 3), that index OR 2, OR 4 and OR 6 respectively.
; The autogenerated assertions below expect a dlstp.32/letp loop in which
; each scatter becomes a vstrw.32 that uses vector-base addressing with a
; #128 writeback, using four separate base vectors initialised from constant
; pools .LCPI5_0 to .LCPI5_3 plus the %y pointer in r1.
280define void @shlor(ptr nocapture readonly %x, ptr noalias nocapture %y, i32 %n) {
281; CHECK-LABEL: shlor:
282; CHECK:       @ %bb.0: @ %entry
283; CHECK-NEXT:    .save {r4, r5, r6, lr}
284; CHECK-NEXT:    push {r4, r5, r6, lr}
285; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
286; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
287; CHECK-NEXT:    cmp r2, #1
288; CHECK-NEXT:    blt .LBB5_3
289; CHECK-NEXT:  @ %bb.1: @ %vector.ph
290; CHECK-NEXT:    adr.w lr, .LCPI5_0
291; CHECK-NEXT:    adr r4, .LCPI5_1
292; CHECK-NEXT:    adr r5, .LCPI5_2
293; CHECK-NEXT:    adr r6, .LCPI5_3
294; CHECK-NEXT:    vldrw.u32 q2, [r4]
295; CHECK-NEXT:    vldrw.u32 q0, [r6]
296; CHECK-NEXT:    vldrw.u32 q1, [r5]
297; CHECK-NEXT:    vldrw.u32 q3, [lr]
298; CHECK-NEXT:    vadd.i32 q0, q0, r1
299; CHECK-NEXT:    vadd.i32 q1, q1, r1
300; CHECK-NEXT:    vadd.i32 q2, q2, r1
301; CHECK-NEXT:    vadd.i32 q3, q3, r1
302; CHECK-NEXT:    mov.w r12, #1
303; CHECK-NEXT:    movs r4, #3
304; CHECK-NEXT:    movs r3, #2
305; CHECK-NEXT:    movs r1, #4
306; CHECK-NEXT:    dlstp.32 lr, r2
307; CHECK-NEXT:  .LBB5_2: @ %vector.body
308; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
309; CHECK-NEXT:    vldrw.u32 q4, [r0], #16
310; CHECK-NEXT:    vadd.i32 q6, q4, r12
311; CHECK-NEXT:    vadd.i32 q5, q4, r1
312; CHECK-NEXT:    vstrw.32 q6, [q3, #128]!
313; CHECK-NEXT:    vadd.i32 q6, q4, r3
314; CHECK-NEXT:    vadd.i32 q4, q4, r4
315; CHECK-NEXT:    vstrw.32 q6, [q2, #128]!
316; CHECK-NEXT:    vstrw.32 q4, [q1, #128]!
317; CHECK-NEXT:    vstrw.32 q5, [q0, #128]!
318; CHECK-NEXT:    letp lr, .LBB5_2
319; CHECK-NEXT:  .LBB5_3: @ %for.cond.cleanup
320; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
321; CHECK-NEXT:    pop {r4, r5, r6, pc}
322; CHECK-NEXT:    .p2align 4
323; CHECK-NEXT:  @ %bb.4:
324; CHECK-NEXT:  .LCPI5_0:
325; CHECK-NEXT:    .long 4294967168 @ 0xffffff80
326; CHECK-NEXT:    .long 4294967200 @ 0xffffffa0
327; CHECK-NEXT:    .long 4294967232 @ 0xffffffc0
328; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
329; CHECK-NEXT:  .LCPI5_1:
330; CHECK-NEXT:    .long 4294967176 @ 0xffffff88
331; CHECK-NEXT:    .long 4294967208 @ 0xffffffa8
332; CHECK-NEXT:    .long 4294967240 @ 0xffffffc8
333; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
334; CHECK-NEXT:  .LCPI5_2:
335; CHECK-NEXT:    .long 4294967184 @ 0xffffff90
336; CHECK-NEXT:    .long 4294967216 @ 0xffffffb0
337; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
338; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
339; CHECK-NEXT:  .LCPI5_3:
340; CHECK-NEXT:    .long 4294967192 @ 0xffffff98
341; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
342; CHECK-NEXT:    .long 4294967256 @ 0xffffffd8
343; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
344entry:
345  %cmp33 = icmp sgt i32 %n, 0 ; skip all work when n <= 0
346  br i1 %cmp33, label %vector.ph, label %for.cond.cleanup
347
348vector.ph:                                        ; preds = %entry
349  %n.rnd.up = add i32 %n, 3
350  %n.vec = and i32 %n.rnd.up, -4 ; n rounded up to a multiple of 4
351  br label %vector.body
352
353vector.body:                                      ; preds = %vector.body, %vector.ph
354  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
355  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
356  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) ; predicate shared by the load and all four scatters
357  %0 = getelementptr inbounds i32, ptr %x, i32 %index
358  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %0, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
359  %1 = add nsw <4 x i32> %wide.masked.load, <i32 1, i32 1, i32 1, i32 1>
360  %2 = shl nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3> ; base element index = i * 8
361  %3 = getelementptr inbounds i32, ptr %y, <4 x i32> %2
362  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %1, <4 x ptr> %3, i32 4, <4 x i1> %active.lane.mask) ; (load + 1) -> y[i*8]
363  %4 = add nsw <4 x i32> %wide.masked.load, <i32 2, i32 2, i32 2, i32 2>
364  %5 = or <4 x i32> %2, <i32 2, i32 2, i32 2, i32 2>
365  %6 = getelementptr inbounds i32, ptr %y, <4 x i32> %5
366  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %4, <4 x ptr> %6, i32 4, <4 x i1> %active.lane.mask) ; (load + 2) -> y[i*8 | 2]
367  %7 = add nsw <4 x i32> %wide.masked.load, <i32 3, i32 3, i32 3, i32 3>
368  %8 = or <4 x i32> %2, <i32 4, i32 4, i32 4, i32 4>
369  %9 = getelementptr inbounds i32, ptr %y, <4 x i32> %8
370  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %7, <4 x ptr> %9, i32 4, <4 x i1> %active.lane.mask) ; (load + 3) -> y[i*8 | 4]
371  %10 = add nsw <4 x i32> %wide.masked.load, <i32 4, i32 4, i32 4, i32 4>
372  %11 = or <4 x i32> %2, <i32 6, i32 6, i32 6, i32 6>
373  %12 = getelementptr inbounds i32, ptr %y, <4 x i32> %11
374  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %10, <4 x ptr> %12, i32 4, <4 x i1> %active.lane.mask) ; (load + 4) -> y[i*8 | 6]
375  %index.next = add i32 %index, 4
376  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
377  %13 = icmp eq i32 %index.next, %n.vec
378  br i1 %13, label %for.cond.cleanup, label %vector.body
379
380for.cond.cleanup:                                 ; preds = %vector.body, %entry
381  ret void
382}
383
; Intrinsic declarations. Each masked.scatter takes (data vector, vector of
; destination pointers, i32 alignment, <N x i1> mask).
; NOTE(review): among the functions visible in this file only the v4i32,
; v8i16 and v16i8 scatter variants, get.active.lane.mask and masked.load are
; called; the remaining declarations appear unused here - confirm before
; removing any of them.
384declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
385declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
386declare void @llvm.masked.scatter.v8f16.v8p0(<8 x half>, <8 x ptr>, i32, <8 x i1>)
387declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
388declare void @llvm.masked.scatter.v4i8.v4p0(<4 x i8>, <4 x ptr>, i32, <4 x i1>)
389declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
390declare void @llvm.masked.scatter.v4f16.v4p0(<4 x half>, <4 x ptr>, i32, <4 x i1>)
391declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
392declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>)
393declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
394declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
395