; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
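; These tests exercise masked scatters of 16-bit elements (or wider values
; truncated to 8/16 bits) through pointers formed as base + unscaled byte
; offsets. When the offsets are zero-extended 8- or 16-bit values, the scatter
; is expected to select a single VSTRB.16/VSTRH.16 [base, offsets] store; the
; other cases below are expanded to scalar stores.
;
; To refresh the CHECK lines after a codegen change, re-run the update script
; against this file (a minimal sketch; exact paths depend on your checkout and
; build directory):
;   llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
;     llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll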

; VSTRB.16 Qd, [base, offs]
define arm_aapcs_vfpcc void @ext_unscaled_i8_i16(ptr %base, ptr %offptr, <8 x i16> %input) {
; CHECK-LABEL: ext_unscaled_i8_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrb.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, ptr %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %t = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %t, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRB.16 Qd, [base, zext(offs)]
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(ptr %base, ptr %offptr, <8 x i16> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrb.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %input.trunc = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %input.trunc, <8 x ptr> %byte_ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.16 Qd, [base, offs]
define arm_aapcs_vfpcc void @unscaled_i16_i16(ptr %base, ptr %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, ptr %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.16 Qd, [base, offs]
define arm_aapcs_vfpcc void @unscaled_v8f16_i16(ptr %base, ptr %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, ptr %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
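
; In the tests below, the offsets cannot be encoded in the [base, offsets] form
; of the MVE scatter stores: VSTRB.16/VSTRH.16 zero-extend their 16-bit offset
; elements before adding them to the base, so sign-extended offsets do not fit,
; and full i32 offsets do not fit in the 16-bit offset lanes at all. These
; scatters are therefore expanded into per-lane scalar stores.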

; Expand - sext offsets
define arm_aapcs_vfpcc void @unscaled_v8i16_sext(ptr %base, ptr %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vmov.u16 r6, q0[0]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmov r4, r5, d3
; CHECK-NEXT:    strh r6, [r2]
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    strh r2, [r3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    strh.w r2, [r12]
; CHECK-NEXT:    vmov.u16 r2, q0[3]
; CHECK-NEXT:    strh.w r2, [lr]
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    strh r2, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    strh r0, [r1]
; CHECK-NEXT:    vmov.u16 r0, q0[6]
; CHECK-NEXT:    strh r0, [r4]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    strh r0, [r5]
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x i16>, ptr %offptr, align 2
  %offs.sext = sext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - sext offsets
define arm_aapcs_vfpcc void @unscaled_v8f16_sext(ptr %base, ptr %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q2, [r1]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r1, r2, d4
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmovx.f16 s0, s0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    vmov r1, r2, d5
; CHECK-NEXT:    vmovx.f16 s0, s1
; CHECK-NEXT:    vstr.16 s1, [r1]
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmovx.f16 s0, s2
; CHECK-NEXT:    vstr.16 s2, [r0]
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmov r0, r1, d3
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vstr.16 s3, [r0]
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, ptr %offptr, align 2
  %offs.sext = sext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - i32 offsets
define arm_aapcs_vfpcc void @unscaled_v8i16_noext(ptr %base, ptr %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_v8i16_noext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmov.u16 r6, q0[0]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmov r4, r5, d3
; CHECK-NEXT:    strh r6, [r2]
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    strh r2, [r3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    strh.w r2, [r12]
; CHECK-NEXT:    vmov.u16 r2, q0[3]
; CHECK-NEXT:    strh.w r2, [lr]
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    strh r2, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    strh r0, [r1]
; CHECK-NEXT:    vmov.u16 r0, q0[6]
; CHECK-NEXT:    strh r0, [r4]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    strh r0, [r5]
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x i32>, ptr %offptr, align 4
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - i32 offsets
define arm_aapcs_vfpcc void @unscaled_v8f16_noext(ptr %base, ptr %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_noext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r1, r2, d4
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmovx.f16 s0, s0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    vmov r1, r2, d5
; CHECK-NEXT:    vmovx.f16 s0, s1
; CHECK-NEXT:    vstr.16 s1, [r1]
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmovx.f16 s0, s2
; CHECK-NEXT:    vstr.16 s2, [r0]
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmov r0, r1, d3
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vstr.16 s3, [r0]
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i32>, ptr %offptr, align 4
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.16 Qd, [base, zext(offs)]
define arm_aapcs_vfpcc void @unsigned_unscaled_i16_i8(ptr %base, ptr %offptr, <8 x i16> %input) {
; CHECK-LABEL: unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; VSTRH.16 Qd, [base, zext(offs)]
define arm_aapcs_vfpcc void @unsigned_unscaled_f16_i8(ptr %base, ptr %offptr, <8 x half> %input) {
; CHECK-LABEL: unsigned_unscaled_f16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %input, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
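
; The remaining tests scatter values that are truncated before the store (i64
; or i32 inputs truncated to i16, and i16 truncated to i8). As above, only the
; zero-extended narrow offsets are expected to stay as a single VSTRB/VSTRH
; scatter; the sign-extended variants are expanded to scalar stores.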

; Expand - sext offsets
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(ptr %base, ptr %offptr, <8 x i64> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrb.s32 q4, [r1]
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vmov r2, r3, d8
; CHECK-NEXT:    vmov r12, lr, d9
; CHECK-NEXT:    vldrb.s32 q4, [r1, #4]
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vmov r0, r1, d8
; CHECK-NEXT:    strh r4, [r2]
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r4, r5, d9
; CHECK-NEXT:    strh r2, [r3]
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    strh.w r2, [r12]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    strh.w r2, [lr]
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    strh r2, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    strh r0, [r1]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    strh r0, [r4]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    strh r0, [r5]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  %input.trunc = trunc <8 x i64> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %input.trunc, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(ptr %base, ptr %offptr, <8 x i64> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.16 q4[0], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.16 q4[1], r3
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov.16 q4[2], r3
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    vmov.16 q4[3], r3
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    vmov.16 q4[4], r3
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    vmov.16 q4[5], r3
; CHECK-NEXT:    vmov r3, s12
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    vmov.16 q4[6], r3
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vmov.16 q4[7], r2
; CHECK-NEXT:    vstrh.16 q4, [r0, q0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  %input.trunc = trunc <8 x i64> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %input.trunc, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - sext offsets
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(ptr %base, ptr %offptr, <8 x i32> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrb.s32 q2, [r1]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    vmov r12, lr, d5
; CHECK-NEXT:    vldrb.s32 q2, [r1, #4]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov r0, r6, d1
; CHECK-NEXT:    strh r4, [r2]
; CHECK-NEXT:    vmov r2, r7, d4
; CHECK-NEXT:    strh r5, [r3]
; CHECK-NEXT:    vmov r3, r5, d5
; CHECK-NEXT:    strh.w r0, [r12]
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    strh.w r6, [lr]
; CHECK-NEXT:    vmov r6, r4, d3
; CHECK-NEXT:    strh r0, [r2]
; CHECK-NEXT:    strh r1, [r7]
; CHECK-NEXT:    strh r6, [r3]
; CHECK-NEXT:    strh r4, [r5]
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  %input.trunc = trunc <8 x i32> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %input.trunc, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(ptr %base, ptr %offptr, <8 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    mov r2, sp
; CHECK-NEXT:    vstrh.32 q1, [r2, #8]
; CHECK-NEXT:    vstrh.32 q0, [r2]
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vstrh.16 q1, [r0, q0]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  %input.trunc = trunc <8 x i32> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %input.trunc, <8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - sext offsets
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(ptr %base, ptr %offptr, <8 x i16> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vmov.u16 r6, q0[0]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmov r4, r5, d3
; CHECK-NEXT:    strb r6, [r2]
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    strb r2, [r3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    strb.w r2, [r12]
; CHECK-NEXT:    vmov.u16 r2, q0[3]
; CHECK-NEXT:    strb.w r2, [lr]
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    strb r2, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    strb r0, [r1]
; CHECK-NEXT:    vmov.u16 r0, q0[6]
; CHECK-NEXT:    strb r0, [r4]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    strb r0, [r5]
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.sext
  %input.trunc = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %input.trunc, <8 x ptr> %byte_ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8f16.v8p0(<8 x half>, <8 x ptr>, i32, <8 x i1>)