; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
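; The 2048-bit run reuses the VBITS_GE_512 prefixes: the checks below rely
; only on vl-constrained predicates, so the expected code is unchanged for
; any vector length of 512 bits or more.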

target triple = "aarch64-unknown-linux-gnu"

;
; Masked Stores
;

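; The <2 x half> mask is computed with a 4-lane NEON fcmeq; the two live
; lanes are widened and repacked into a zeroed vector so that cmpne sees
; false in the inactive lanes of the predicate.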
define void @masked_store_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_store_v2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s1, [x0]
; CHECK-NEXT:    ldr s2, [x1]
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    fcmeq v2.4h, v1.4h, v2.4h
; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
; CHECK-NEXT:    mov v0.h[0], v2.h[0]
; CHECK-NEXT:    mov w8, v2.s[1]
; CHECK-NEXT:    mov v0.h[1], w8
; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    st1h { z1.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <2 x half>, ptr %ap
  %b = load <2 x half>, ptr %bp
  %mask = fcmp oeq <2 x half> %a, %b
  call void @llvm.masked.store.v2f16(<2 x half> %a, ptr %bp, i32 8, <2 x i1> %mask)
  ret void
}

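; For NEON-sized vectors (here and in the v4f32 test below) the data and the
; fcmeq mask stay in NEON registers; a single cmpne turns the mask into a
; predicate for the SVE store.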
define void @masked_store_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_store_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    fcmeq v1.2s, v0.2s, v1.2s
; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %ap
  %b = load <2 x float>, ptr %bp
  %mask = fcmp oeq <2 x float> %a, %b
  call void @llvm.masked.store.v2f32(<2 x float> %a, ptr %bp, i32 8, <2 x i1> %mask)
  ret void
}

define void @masked_store_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 {
; CHECK-LABEL: masked_store_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    fcmeq v1.4s, v0.4s, v1.4s
; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <4 x float>, ptr %ap
  %b = load <4 x float>, ptr %bp
  %mask = fcmp oeq <4 x float> %a, %b
  call void @llvm.masked.store.v4f32(<4 x float> %a, ptr %bp, i32 8, <4 x i1> %mask)
  ret void
}

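; Once the vector exceeds 128 bits everything stays in SVE: predicated
; loads, an fcmeq that yields the predicate directly, and a predicated store.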
define void @masked_store_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_store_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <8 x float>, ptr %ap
  %b = load <8 x float>, ptr %bp
  %mask = fcmp oeq <8 x float> %a, %b
  call void @llvm.masked.store.v8f32(<8 x float> %a, ptr %bp, i32 8, <8 x i1> %mask)
  ret void
}

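; A 512-bit operation on a 256-bit machine is split in two, with the high
; half addressed via an lsl-scaled index; with 512-bit vectors a single vl16
; predicate covers the whole store.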
define void @masked_store_v16f32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_store_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT:    fcmeq p0.s, p0/z, z2.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p1, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_store_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x float>, ptr %ap
  %b = load <16 x float>, ptr %bp
  %mask = fcmp oeq <16 x float> %a, %b
  call void @llvm.masked.store.v16f32(<16 x float> %a, ptr %ap, i32 8, <16 x i1> %mask)
  ret void
}

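; The remaining widths follow the v8f32 pattern with a larger vl.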
define void @masked_store_v32f32(ptr %ap, ptr %bp) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_store_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <32 x float>, ptr %ap
  %b = load <32 x float>, ptr %bp
  %mask = fcmp oeq <32 x float> %a, %b
  call void @llvm.masked.store.v32f32(<32 x float> %a, ptr %ap, i32 8, <32 x i1> %mask)
  ret void
}

define void @masked_store_v64f32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_store_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <64 x float>, ptr %ap
  %b = load <64 x float>, ptr %bp
  %mask = fcmp oeq <64 x float> %a, %b
  call void @llvm.masked.store.v64f32(<64 x float> %a, ptr %ap, i32 8, <64 x i1> %mask)
  ret void
}

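; For truncating stores the predicate from the wide compare must be narrowed
; along with the data: at VBITS_GE_256 each half-predicate is materialized as
; a vector of all-ones, narrowed with uzp1, spliced with its partner, and
; converted back to a predicate with cmpne. At VBITS_GE_512 the truncation
; folds into the narrowing .d store forms (st1b/st1h/st1w).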
define void @masked_store_trunc_v8i64i8(ptr %ap, ptr %bp, ptr %dest) #0 {
; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z2.d
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    cmpeq p0.d, p0/z, z1.d, z3.d
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    mov z3.d, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
; VBITS_GE_256-NEXT:    splice z3.s, p0, z3.s, z2.s
; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z3.s, #0
; VBITS_GE_256-NEXT:    st1b { z1.s }, p1, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1b { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %val = trunc <8 x i64> %a to <8 x i8>
  call void @llvm.masked.store.v8i8(<8 x i8> %val, ptr %dest, i32 8, <8 x i1> %mask)
  ret void
}

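; Narrowing i64 to i16 shrinks each 256-bit half to 64 bits, so the data
; halves are joined with a NEON lane move rather than an SVE splice.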
define void @masked_store_trunc_v8i64i16(ptr %ap, ptr %bp, ptr %dest) #0 {
; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z2.d
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    cmpeq p0.d, p0/z, z1.d, z3.d
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    mov z3.d, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
; VBITS_GE_256-NEXT:    splice z3.s, p0, z3.s, z2.s
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z2.h, z3.h, z3.h
; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z2.h, #0
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1h { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %val = trunc <8 x i64> %a to <8 x i16>
  call void @llvm.masked.store.v8i16(<8 x i16> %val, ptr %dest, i32 8, <8 x i1> %mask)
  ret void
}

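; Identical mask handling to the i64->i8 case above; only the width of the
; final truncating store differs.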
define void @masked_store_trunc_v8i64i32(ptr %ap, ptr %bp, ptr %dest) #0 {
; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z2.d
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    cmpeq p0.d, p0/z, z1.d, z3.d
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    mov z3.d, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
; VBITS_GE_256-NEXT:    splice z3.s, p0, z3.s, z2.s
; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z3.s, #0
; VBITS_GE_256-NEXT:    st1w { z1.s }, p1, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1w { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %val = trunc <8 x i64> %a to <8 x i32>
  call void @llvm.masked.store.v8i32(<8 x i32> %val, ptr %dest, i32 8, <8 x i1> %mask)
  ret void
}

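; Both the data and the mask are narrowed to bytes, leaving 64-bit halves
; that are each combined with a lane move before the vl16 cmpne.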
define void @masked_store_trunc_v16i32i8(ptr %ap, ptr %bp, ptr %dest) #0 {
; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, z2.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z1.s, z3.s
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov z3.s, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
; VBITS_GE_256-NEXT:    uzp1 z3.b, z3.b, z3.b
; VBITS_GE_256-NEXT:    mov v3.d[1], v2.d[0]
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z3.b, #0
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_store_trunc_v16i32i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1b { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %ap
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %a, %b
  %val = trunc <16 x i32> %a to <16 x i8>
  call void @llvm.masked.store.v16i8(<16 x i8> %val, ptr %dest, i32 8, <16 x i1> %mask)
  ret void
}

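; The data halves are spliced at .h, but the mask halves are narrowed to
; bytes so they can be joined with a lane move, then sign-unpacked back to
; .h for the compare.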
define void @masked_store_trunc_v16i32i16(ptr %ap, ptr %bp, ptr %dest) #0 {
; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, z2.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z1.s, z3.s
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    mov z3.s, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
; VBITS_GE_256-NEXT:    uzp1 z3.b, z3.b, z3.b
; VBITS_GE_256-NEXT:    mov v3.d[1], v2.d[0]
; VBITS_GE_256-NEXT:    sunpklo z2.h, z3.b
; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z2.h, #0
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_store_trunc_v16i32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %ap
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %a, %b
  %val = trunc <16 x i32> %a to <16 x i16>
  call void @llvm.masked.store.v16i16(<16 x i16> %val, ptr %dest, i32 8, <16 x i1> %mask)
  ret void
}

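; i16->i8 mirrors the non-truncating split cases: both the data and the mask
; halves are concatenated with byte-granularity splices (ptrue vl16).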
define void @masked_store_trunc_v32i16i8(ptr %ap, ptr %bp, ptr %dest) #0 {
; VBITS_GE_256-LABEL: masked_store_trunc_v32i16i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z1.h, z3.h
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov z2.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
; VBITS_GE_256-NEXT:    mov z3.h, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
; VBITS_GE_256-NEXT:    splice z1.b, p0, z1.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z3.b, z3.b, z3.b
; VBITS_GE_256-NEXT:    splice z3.b, p0, z3.b, z2.b
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z3.b, #0
; VBITS_GE_256-NEXT:    st1b { z1.b }, p1, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_store_trunc_v32i16i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p0.h, p0/z, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1b { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i16>, ptr %ap
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %a, %b
  %val = trunc <32 x i16> %a to <32 x i8>
  call void @llvm.masked.store.v32i8(<32 x i8> %val, ptr %dest, i32 8, <32 x i1> %mask)
  ret void
}

declare void @llvm.masked.store.v2f16(<2 x half>, ptr, i32, <2 x i1>)
declare void @llvm.masked.store.v2f32(<2 x float>, ptr, i32, <2 x i1>)
declare void @llvm.masked.store.v4f32(<4 x float>, ptr, i32, <4 x i1>)
declare void @llvm.masked.store.v8f32(<8 x float>, ptr, i32, <8 x i1>)
declare void @llvm.masked.store.v16f32(<16 x float>, ptr, i32, <16 x i1>)
declare void @llvm.masked.store.v32f32(<32 x float>, ptr, i32, <32 x i1>)
declare void @llvm.masked.store.v64f32(<64 x float>, ptr, i32, <64 x i1>)

declare void @llvm.masked.store.v8i8(<8 x i8>, ptr, i32, <8 x i1>)
declare void @llvm.masked.store.v8i16(<8 x i16>, ptr, i32, <8 x i1>)
declare void @llvm.masked.store.v8i32(<8 x i32>, ptr, i32, <8 x i1>)
declare void @llvm.masked.store.v16i8(<16 x i8>, ptr, i32, <16 x i1>)
declare void @llvm.masked.store.v16i16(<16 x i16>, ptr, i32, <16 x i1>)
declare void @llvm.masked.store.v32i8(<32 x i8>, ptr, i32, <32 x i1>)

attributes #0 = { "target-features"="+sve" }