; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s

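; NEON has no native compress instruction, so the generic lowering goes through the stack: the vector is
; spilled to a slot, each selected element is stored at an offset given by the running sum of the preceding
; mask bits, and the compacted result is reloaded.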
define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) {
; CHECK-LABEL: test_compress_v4i32:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    sub sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ushll.4s v1, v1, #0
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    str s0, [sp]
; CHECK-NEXT:    shl.4s v1, v1, #31
; CHECK-NEXT:    cmlt.4s v1, v1, #0
; CHECK-NEXT:    mov.s w9, v1[1]
; CHECK-NEXT:    mov.s w10, v1[2]
; CHECK-NEXT:    fmov w11, s1
; CHECK-NEXT:    bfi x8, x11, #2, #1
; CHECK-NEXT:    and x11, x11, #0x1
; CHECK-NEXT:    and x9, x9, #0x1
; CHECK-NEXT:    and w10, w10, #0x1
; CHECK-NEXT:    add x9, x11, x9
; CHECK-NEXT:    mov x11, sp
; CHECK-NEXT:    st1.s { v0 }[1], [x8]
; CHECK-NEXT:    add w10, w9, w10
; CHECK-NEXT:    orr x9, x11, x9, lsl #2
; CHECK-NEXT:    bfi x11, x10, #2, #2
; CHECK-NEXT:    st1.s { v0 }[2], [x9]
; CHECK-NEXT:    st1.s { v0 }[3], [x11]
; CHECK-NEXT:    ldr q0, [sp], #16
; CHECK-NEXT:    ret
    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> undef)
    ret <4 x i32> %out
}


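; With a passthru vector, the passthru is stored to the stack slot first and the compressed elements are
; written over it. The mask's popcount (movi/and/addv) is used to clamp the final store index and to csel
; between the last source element and the passthru value, so lanes past the number of selected elements
; keep their passthru contents.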
define <4 x i32> @test_compress_v4i32_with_passthru(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru) {
; CHECK-LABEL: test_compress_v4i32_with_passthru:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    str q2, [sp, #-16]!
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ushll.4s v1, v1, #0
; CHECK-NEXT:    movi.4s v3, #1
; CHECK-NEXT:    mov x12, sp
; CHECK-NEXT:    mov x10, sp
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    mov x14, sp
; CHECK-NEXT:    mov w15, #3 ; =0x3
; CHECK-NEXT:    shl.4s v1, v1, #31
; CHECK-NEXT:    cmlt.4s v1, v1, #0
; CHECK-NEXT:    and.16b v3, v1, v3
; CHECK-NEXT:    mov.s w8, v1[1]
; CHECK-NEXT:    fmov w16, s1
; CHECK-NEXT:    mov.s w11, v1[2]
; CHECK-NEXT:    mov.s w13, v1[3]
; CHECK-NEXT:    addv.4s s2, v3
; CHECK-NEXT:    bfi x12, x16, #2, #1
; CHECK-NEXT:    and x16, x16, #0x1
; CHECK-NEXT:    and x8, x8, #0x1
; CHECK-NEXT:    add x8, x16, x8
; CHECK-NEXT:    and x11, x11, #0x1
; CHECK-NEXT:    and x13, x13, #0x1
; CHECK-NEXT:    fmov w16, s2
; CHECK-NEXT:    add x11, x8, x11
; CHECK-NEXT:    orr x8, x9, x8, lsl #2
; CHECK-NEXT:    add x13, x11, x13
; CHECK-NEXT:    bfi x14, x11, #2, #2
; CHECK-NEXT:    cmp x13, #3
; CHECK-NEXT:    bfi x10, x16, #2, #2
; CHECK-NEXT:    mov.s w16, v0[3]
; CHECK-NEXT:    csel x11, x13, x15, lo
; CHECK-NEXT:    ldr w10, [x10]
; CHECK-NEXT:    str s0, [sp]
; CHECK-NEXT:    st1.s { v0 }[1], [x12]
; CHECK-NEXT:    st1.s { v0 }[2], [x8]
; CHECK-NEXT:    orr x8, x9, x11, lsl #2
; CHECK-NEXT:    csel w9, w16, w10, hi
; CHECK-NEXT:    st1.s { v0 }[3], [x14]
; CHECK-NEXT:    str w9, [x8]
; CHECK-NEXT:    ldr q0, [sp], #16
; CHECK-NEXT:    ret
    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru)
    ret <4 x i32> %out
}

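; The same stack-slot lowering applies to floating-point elements. With two 8-byte lanes, a single mask bit
; inserted at bit 3 of the base address (bfi) decides where the second element lands.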
define <2 x double> @test_compress_v2f64(<2 x double> %vec, <2 x i1> %mask) {
; CHECK-LABEL: test_compress_v2f64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    sub sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ushll.2d v1, v1, #0
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    str d0, [sp]
; CHECK-NEXT:    shl.2d v1, v1, #63
; CHECK-NEXT:    cmlt.2d v1, v1, #0
; CHECK-NEXT:    fmov x9, d1
; CHECK-NEXT:    bfi x8, x9, #3, #1
; CHECK-NEXT:    st1.d { v0 }[1], [x8]
; CHECK-NEXT:    ldr q0, [sp], #16
; CHECK-NEXT:    ret
    %out = call <2 x double> @llvm.experimental.vector.compress.v2f64(<2 x double> %vec, <2 x i1> %mask, <2 x double> undef)
    ret <2 x double> %out
}

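; With 16 lanes the running offsets are inserted into the low 4 bits of the slot address (bfxil), which also
; keeps every store inside the 16-byte stack slot.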
define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) {
; CHECK-LABEL: test_compress_v16i8:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    sub sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    shl.16b v1, v1, #7
; CHECK-NEXT:    mov x12, sp
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    st1.b { v0 }[0], [x8]
; CHECK-NEXT:    mov x13, sp
; CHECK-NEXT:    cmlt.16b v1, v1, #0
; CHECK-NEXT:    umov.b w9, v1[0]
; CHECK-NEXT:    umov.b w10, v1[1]
; CHECK-NEXT:    umov.b w11, v1[2]
; CHECK-NEXT:    umov.b w14, v1[3]
; CHECK-NEXT:    bfxil x12, x9, #0, #1
; CHECK-NEXT:    and x10, x10, #0x1
; CHECK-NEXT:    and x9, x9, #0x1
; CHECK-NEXT:    add x9, x9, x10
; CHECK-NEXT:    umov.b w10, v1[4]
; CHECK-NEXT:    and x11, x11, #0x1
; CHECK-NEXT:    st1.b { v0 }[1], [x12]
; CHECK-NEXT:    orr x12, x8, x9
; CHECK-NEXT:    add x9, x9, x11
; CHECK-NEXT:    umov.b w11, v1[5]
; CHECK-NEXT:    and x14, x14, #0x1
; CHECK-NEXT:    st1.b { v0 }[2], [x12]
; CHECK-NEXT:    add x14, x9, x14
; CHECK-NEXT:    umov.b w12, v1[6]
; CHECK-NEXT:    orr x9, x8, x9
; CHECK-NEXT:    and x10, x10, #0x1
; CHECK-NEXT:    st1.b { v0 }[3], [x9]
; CHECK-NEXT:    orr x9, x8, x14
; CHECK-NEXT:    add x10, x14, x10
; CHECK-NEXT:    umov.b w14, v1[7]
; CHECK-NEXT:    st1.b { v0 }[4], [x9]
; CHECK-NEXT:    and x11, x11, #0x1
; CHECK-NEXT:    bfxil x13, x10, #0, #4
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    add x10, x10, x11
; CHECK-NEXT:    umov.b w11, v1[8]
; CHECK-NEXT:    and x12, x12, #0x1
; CHECK-NEXT:    bfxil x9, x10, #0, #4
; CHECK-NEXT:    st1.b { v0 }[5], [x13]
; CHECK-NEXT:    umov.b w13, v1[9]
; CHECK-NEXT:    add x10, x10, x12
; CHECK-NEXT:    mov x12, sp
; CHECK-NEXT:    and x14, x14, #0x1
; CHECK-NEXT:    st1.b { v0 }[6], [x9]
; CHECK-NEXT:    umov.b w9, v1[10]
; CHECK-NEXT:    bfxil x12, x10, #0, #4
; CHECK-NEXT:    add x10, x10, x14
; CHECK-NEXT:    mov x14, sp
; CHECK-NEXT:    and x11, x11, #0x1
; CHECK-NEXT:    bfxil x14, x10, #0, #4
; CHECK-NEXT:    add x10, x10, x11
; CHECK-NEXT:    mov x11, sp
; CHECK-NEXT:    and x13, x13, #0x1
; CHECK-NEXT:    st1.b { v0 }[7], [x12]
; CHECK-NEXT:    mov x12, sp
; CHECK-NEXT:    bfxil x11, x10, #0, #4
; CHECK-NEXT:    add x10, x10, x13
; CHECK-NEXT:    umov.b w13, v1[11]
; CHECK-NEXT:    st1.b { v0 }[8], [x14]
; CHECK-NEXT:    umov.b w14, v1[12]
; CHECK-NEXT:    and x9, x9, #0x1
; CHECK-NEXT:    bfxil x12, x10, #0, #4
; CHECK-NEXT:    add x9, x10, x9
; CHECK-NEXT:    mov x10, sp
; CHECK-NEXT:    st1.b { v0 }[9], [x11]
; CHECK-NEXT:    umov.b w11, v1[13]
; CHECK-NEXT:    bfxil x10, x9, #0, #4
; CHECK-NEXT:    st1.b { v0 }[10], [x12]
; CHECK-NEXT:    umov.b w12, v1[14]
; CHECK-NEXT:    and x13, x13, #0x1
; CHECK-NEXT:    and x14, x14, #0x1
; CHECK-NEXT:    add x9, x9, x13
; CHECK-NEXT:    st1.b { v0 }[11], [x10]
; CHECK-NEXT:    mov x10, sp
; CHECK-NEXT:    add x13, x9, x14
; CHECK-NEXT:    mov x14, sp
; CHECK-NEXT:    bfxil x10, x9, #0, #4
; CHECK-NEXT:    and x9, x11, #0x1
; CHECK-NEXT:    mov x11, sp
; CHECK-NEXT:    add x9, x13, x9
; CHECK-NEXT:    and w12, w12, #0x1
; CHECK-NEXT:    bfxil x14, x13, #0, #4
; CHECK-NEXT:    bfxil x11, x9, #0, #4
; CHECK-NEXT:    add w9, w9, w12
; CHECK-NEXT:    st1.b { v0 }[12], [x10]
; CHECK-NEXT:    bfxil x8, x9, #0, #4
; CHECK-NEXT:    st1.b { v0 }[13], [x14]
; CHECK-NEXT:    st1.b { v0 }[14], [x11]
; CHECK-NEXT:    st1.b { v0 }[15], [x8]
; CHECK-NEXT:    ldr q0, [sp], #16
; CHECK-NEXT:    ret
    %out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> undef)
    ret <16 x i8> %out
}

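; <8 x i32> does not fit in a single NEON register, so the source is split across two q registers that are
; compressed into one shared 32-byte stack slot. The offsets for the second half are masked to 3 bits
; (and ..., #0x7) so the stores cannot escape the slot, and both halves are reloaded with a single ldp.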
define <8 x i32> @test_compress_large(<8 x i32> %vec, <8 x i1> %mask) {
; CHECK-LABEL: test_compress_large:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    sub sp, sp, #32
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    ; kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    umov.b w9, v2[0]
; CHECK-NEXT:    umov.b w10, v2[1]
; CHECK-NEXT:    mov x12, sp
; CHECK-NEXT:    umov.b w11, v2[2]
; CHECK-NEXT:    umov.b w13, v2[3]
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    umov.b w14, v2[4]
; CHECK-NEXT:    str s0, [sp]
; CHECK-NEXT:    and x10, x10, #0x1
; CHECK-NEXT:    and x15, x9, #0x1
; CHECK-NEXT:    bfi x12, x9, #2, #1
; CHECK-NEXT:    and x9, x11, #0x1
; CHECK-NEXT:    add x10, x15, x10
; CHECK-NEXT:    umov.b w11, v2[5]
; CHECK-NEXT:    add x9, x10, x9
; CHECK-NEXT:    orr x15, x8, x10, lsl #2
; CHECK-NEXT:    umov.b w10, v2[6]
; CHECK-NEXT:    st1.s { v0 }[1], [x12]
; CHECK-NEXT:    add x12, x8, x9, lsl #2
; CHECK-NEXT:    and x13, x13, #0x1
; CHECK-NEXT:    st1.s { v0 }[2], [x15]
; CHECK-NEXT:    add x9, x9, x13
; CHECK-NEXT:    st1.s { v0 }[3], [x12]
; CHECK-NEXT:    and x12, x14, #0x1
; CHECK-NEXT:    and x11, x11, #0x1
; CHECK-NEXT:    add x12, x9, x12
; CHECK-NEXT:    and w10, w10, #0x1
; CHECK-NEXT:    and x9, x9, #0x7
; CHECK-NEXT:    add x11, x12, x11
; CHECK-NEXT:    and x12, x12, #0x7
; CHECK-NEXT:    str s1, [x8, x9, lsl #2]
; CHECK-NEXT:    add w10, w11, w10
; CHECK-NEXT:    and x11, x11, #0x7
; CHECK-NEXT:    add x12, x8, x12, lsl #2
; CHECK-NEXT:    and x10, x10, #0x7
; CHECK-NEXT:    add x9, x8, x11, lsl #2
; CHECK-NEXT:    add x8, x8, x10, lsl #2
; CHECK-NEXT:    st1.s { v1 }[1], [x12]
; CHECK-NEXT:    st1.s { v1 }[2], [x9]
; CHECK-NEXT:    st1.s { v1 }[3], [x8]
; CHECK-NEXT:    ldp q0, q1, [sp], #32
; CHECK-NEXT:    ret
    %out = call <8 x i32> @llvm.experimental.vector.compress(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> undef)
    ret <8 x i32> %out
}

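; A compress of all-constant operands is folded to a load of the precomputed result from the constant pool.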
define <4 x i32> @test_compress_all_const() {
; CHECK-LABEL: test_compress_all_const:
; CHECK:       ; %bb.0:
; CHECK-NEXT:  Lloh0:
; CHECK-NEXT:    adrp x8, lCPI5_0@PAGE
; CHECK-NEXT:  Lloh1:
; CHECK-NEXT:    ldr q0, [x8, lCPI5_0@PAGEOFF]
; CHECK-NEXT:    ret
; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> <i32 3, i32 5, i32 7, i32 9>,
                                                <4 x i1>   <i1 0,  i1 1,  i1 0,  i1 1>,
                                                <4 x i32> undef)
    ret <4 x i32> %out
}

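; With a constant (but not all-zero/all-one) mask the result is built with lane moves instead of going
; through the stack.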
define <4 x i32> @test_compress_const_mask(<4 x i32> %vec) {
; CHECK-LABEL: test_compress_const_mask:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    mov.s v0[1], v0[3]
; CHECK-NEXT:    ret
    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 undef, i1 0, i1 1>, <4 x i32> undef)
    ret <4 x i32> %out
}

define <4 x i32> @test_compress_const_mask_passthrough(<4 x i32> %vec, <4 x i32> %passthru) {
; CHECK-LABEL: test_compress_const_mask_passthrough:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    mov.d v1[0], v0[1]
; CHECK-NEXT:    mov.s v1[0], v0[0]
; CHECK-NEXT:    mov.16b v0, v1
; CHECK-NEXT:    ret
    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 undef, i1 0, i1 1>, <4 x i32> %passthru)
    ret <4 x i32> %out
}

define <4 x i32> @test_compress_const_mask_const_passthrough(<4 x i32> %vec) {
; CHECK-LABEL: test_compress_const_mask_const_passthrough:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    mov.s v0[1], v0[3]
; CHECK-NEXT:    mov w8, #7 ; =0x7
; CHECK-NEXT:    mov.s v0[2], w8
; CHECK-NEXT:    mov w8, #8 ; =0x8
; CHECK-NEXT:    mov.s v0[3], w8
; CHECK-NEXT:    ret
    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i32> <i32 5, i32 6, i32 7, i32 8>)
    ret <4 x i32> %out
}

; The following tests pass a placeholder value in the first argument to check that the compress is converted
; to a no-op: either the second vector input register is simply copied to the return register, or nothing is
; emitted at all.
define <4 x i32> @test_compress_const_splat1_mask(<4 x i32> %ignore, <4 x i32> %vec) {
; CHECK-LABEL: test_compress_const_splat1_mask:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    mov.16b v0, v1
; CHECK-NEXT:    ret
    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 -1), <4 x i32> undef)
    ret <4 x i32> %out
}
define <4 x i32> @test_compress_const_splat0_mask(<4 x i32> %ignore, <4 x i32> %vec) {
; CHECK-LABEL: test_compress_const_splat0_mask:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    ret
    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef)
    ret <4 x i32> %out
}
define <4 x i32> @test_compress_undef_mask(<4 x i32> %ignore, <4 x i32> %vec) {
; CHECK-LABEL: test_compress_undef_mask:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    ret
    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> undef, <4 x i32> undef)
    ret <4 x i32> %out
}
define <4 x i32> @test_compress_const_splat0_mask_with_passthru(<4 x i32> %ignore, <4 x i32> %vec, <4 x i32> %passthru) {
; CHECK-LABEL: test_compress_const_splat0_mask_with_passthru:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    mov.16b v0, v2
; CHECK-NEXT:    ret
    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> %passthru)
    ret <4 x i32> %out
}
define <4 x i32> @test_compress_const_splat0_mask_without_passthru(<4 x i32> %ignore, <4 x i32> %vec) {
; CHECK-LABEL: test_compress_const_splat0_mask_without_passthru:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    ret
    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef)
    ret <4 x i32> %out
}

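; <4 x i8> is promoted to <4 x i16>, so the compression runs on h lanes within an 8-byte stack slot.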
define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) {
; CHECK-LABEL: test_compress_small:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    sub sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    shl.4h v1, v1, #15
; CHECK-NEXT:    add x8, sp, #8
; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    str h0, [sp, #8]
; CHECK-NEXT:    cmlt.4h v1, v1, #0
; CHECK-NEXT:    umov.h w9, v1[0]
; CHECK-NEXT:    umov.h w10, v1[1]
; CHECK-NEXT:    umov.h w11, v1[2]
; CHECK-NEXT:    bfi x8, x9, #1, #1
; CHECK-NEXT:    and x10, x10, #0x1
; CHECK-NEXT:    and x9, x9, #0x1
; CHECK-NEXT:    add x9, x9, x10
; CHECK-NEXT:    and w11, w11, #0x1
; CHECK-NEXT:    add x10, sp, #8
; CHECK-NEXT:    add w11, w9, w11
; CHECK-NEXT:    orr x9, x10, x9, lsl #1
; CHECK-NEXT:    st1.h { v0 }[1], [x8]
; CHECK-NEXT:    bfi x10, x11, #1, #2
; CHECK-NEXT:    st1.h { v0 }[2], [x9]
; CHECK-NEXT:    st1.h { v0 }[3], [x10]
; CHECK-NEXT:    ldr d0, [sp, #8]
; CHECK-NEXT:    add sp, sp, #16
; CHECK-NEXT:    ret
    %out = call <4 x i8> @llvm.experimental.vector.compress(<4 x i8> %vec, <4 x i1> %mask, <4 x i8> undef)
    ret <4 x i8> %out
}

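; The illegal element type <4 x i4> is promoted to <4 x i16> as well and produces the same code as the
; <4 x i8> case above.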
define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mask) {
; CHECK-LABEL: test_compress_illegal_element_type:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    sub sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    shl.4h v1, v1, #15
; CHECK-NEXT:    add x8, sp, #8
; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    str h0, [sp, #8]
; CHECK-NEXT:    cmlt.4h v1, v1, #0
; CHECK-NEXT:    umov.h w9, v1[0]
; CHECK-NEXT:    umov.h w10, v1[1]
; CHECK-NEXT:    umov.h w11, v1[2]
; CHECK-NEXT:    bfi x8, x9, #1, #1
; CHECK-NEXT:    and x10, x10, #0x1
; CHECK-NEXT:    and x9, x9, #0x1
; CHECK-NEXT:    add x9, x9, x10
; CHECK-NEXT:    and w11, w11, #0x1
; CHECK-NEXT:    add x10, sp, #8
; CHECK-NEXT:    add w11, w9, w11
; CHECK-NEXT:    orr x9, x10, x9, lsl #1
; CHECK-NEXT:    st1.h { v0 }[1], [x8]
; CHECK-NEXT:    bfi x10, x11, #1, #2
; CHECK-NEXT:    st1.h { v0 }[2], [x9]
; CHECK-NEXT:    st1.h { v0 }[3], [x10]
; CHECK-NEXT:    ldr d0, [sp, #8]
; CHECK-NEXT:    add sp, sp, #16
; CHECK-NEXT:    ret
    %out = call <4 x i4> @llvm.experimental.vector.compress(<4 x i4> %vec, <4 x i1> %mask, <4 x i4> undef)
    ret <4 x i4> %out
}

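; A <3 x i1> mask is passed in w0-w2; it is widened to four lanes (with the extra lane zeroed) so the
; narrow vector can reuse the <4 x i32> lowering.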
define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) {
; CHECK-LABEL: test_compress_narrow:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    sub sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    movi.2d v1, #0000000000000000
; CHECK-NEXT:    mov x11, sp
; CHECK-NEXT:    str s0, [sp]
; CHECK-NEXT:    mov.h v1[0], w0
; CHECK-NEXT:    mov.h v1[1], w1
; CHECK-NEXT:    mov.h v1[2], w2
; CHECK-NEXT:    ushll.4s v1, v1, #0
; CHECK-NEXT:    shl.4s v1, v1, #31
; CHECK-NEXT:    cmlt.4s v1, v1, #0
; CHECK-NEXT:    mov.s w8, v1[1]
; CHECK-NEXT:    mov.s w9, v1[2]
; CHECK-NEXT:    fmov w10, s1
; CHECK-NEXT:    bfi x11, x10, #2, #1
; CHECK-NEXT:    and x10, x10, #0x1
; CHECK-NEXT:    and x8, x8, #0x1
; CHECK-NEXT:    and w9, w9, #0x1
; CHECK-NEXT:    add x8, x10, x8
; CHECK-NEXT:    mov x10, sp
; CHECK-NEXT:    st1.s { v0 }[1], [x11]
; CHECK-NEXT:    add w9, w8, w9
; CHECK-NEXT:    orr x8, x10, x8, lsl #2
; CHECK-NEXT:    bfi x10, x9, #2, #2
; CHECK-NEXT:    st1.s { v0 }[2], [x8]
; CHECK-NEXT:    st1.s { v0 }[3], [x10]
; CHECK-NEXT:    ldr q0, [sp], #16
; CHECK-NEXT:    ret
    %out = call <3 x i32> @llvm.experimental.vector.compress(<3 x i32> %vec, <3 x i1> %mask, <3 x i32> undef)
    ret <3 x i32> %out
}

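; <3 x i3> combines both edge cases: the elements (w0-w2) and the mask (w3-w5) arrive in GPRs, the element
; type is promoted to i16, and the vector is widened to four lanes before the stack-slot lowering runs.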
define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i1> %mask) {
; CHECK-LABEL: test_compress_narrow_illegal_element_type:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    sub sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    add x10, sp, #8
; CHECK-NEXT:    strh w0, [sp, #8]
; CHECK-NEXT:    mov.h v0[0], w3
; CHECK-NEXT:    mov.h v0[1], w4
; CHECK-NEXT:    mov.h v0[2], w5
; CHECK-NEXT:    shl.4h v0, v0, #15
; CHECK-NEXT:    cmlt.4h v0, v0, #0
; CHECK-NEXT:    umov.h w8, v0[0]
; CHECK-NEXT:    umov.h w9, v0[1]
; CHECK-NEXT:    and x9, x9, #0x1
; CHECK-NEXT:    and x11, x8, #0x1
; CHECK-NEXT:    bfi x10, x8, #1, #1
; CHECK-NEXT:    add x8, x11, x9
; CHECK-NEXT:    add x9, sp, #8
; CHECK-NEXT:    orr x8, x9, x8, lsl #1
; CHECK-NEXT:    strh w1, [x10]
; CHECK-NEXT:    strh w2, [x8]
; CHECK-NEXT:    ldr d0, [sp, #8]
; CHECK-NEXT:    umov.h w0, v0[0]
; CHECK-NEXT:    umov.h w1, v0[1]
; CHECK-NEXT:    umov.h w2, v0[2]
; CHECK-NEXT:    add sp, sp, #16
; CHECK-NEXT:    ret
    %out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef)
    ret <3 x i3> %out
}