xref: /llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll (revision cc82f1290a1e2157a6c0530d78d8cc84d2b8553d)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

; All tests below target fixed-length vectors lowered onto SVE.
target triple = "aarch64-unknown-linux-gnu"

;
; ST1B
;

; Masked scatter of <2 x i8>: data/mask widened to 64-bit containers, stored via st1b.
define void @masked_scatter_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrb w8, [x0]
; CHECK-NEXT:    ldrb w9, [x0, #1]
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    fmov s0, w8
; CHECK-NEXT:    mov v0.s[1], w9
; CHECK-NEXT:    cmeq v1.2s, v0.2s, #0
; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-NEXT:    sshll v1.2d, v1.2s, #0
; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    st1b { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <2 x i8>, ptr %a
  %ptrs = load <2 x ptr>, ptr %b
  %mask = icmp eq <2 x i8> %vals, zeroinitializer
  call void @llvm.masked.scatter.v2i8(<2 x i8> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask)
  ret void
}

; Masked scatter of <4 x i8>: unpacked to 64-bit elements, stored via st1b.
define void @masked_scatter_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEXT:    cmeq v1.4h, v0.4h, #0
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    sunpklo z1.d, z1.s
; CHECK-NEXT:    cmpne p1.d, p0/z, z1.d, #0
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    st1b { z0.d }, p1, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <4 x i8>, ptr %a
  %ptrs = load <4 x ptr>, ptr %b
  %mask = icmp eq <4 x i8> %vals, zeroinitializer
  call void @llvm.masked.scatter.v4i8(<4 x i8> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
  ret void
}

; Masked scatter of <8 x i8>: split into two st1b ops at 256-bit VL, one at >=512-bit VL.
define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: masked_scatter_v8i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr d0, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v1.8b, v0.8b, #0
; VBITS_GE_256-NEXT:    zip1 v3.8b, v0.8b, v0.8b
; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    zip1 v2.8b, v1.8b, v0.8b
; VBITS_GE_256-NEXT:    zip2 v1.8b, v1.8b, v0.8b
; VBITS_GE_256-NEXT:    zip2 v0.8b, v0.8b, v0.8b
; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
; VBITS_GE_256-NEXT:    shl v2.4h, v2.4h, #8
; VBITS_GE_256-NEXT:    shl v1.4h, v1.4h, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z3.d, z3.s
; VBITS_GE_256-NEXT:    sshr v2.4h, v2.4h, #8
; VBITS_GE_256-NEXT:    sshr v1.4h, v1.4h, #8
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z2.d, #0
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    st1b { z3.d }, p1, [z2.d]
; VBITS_GE_256-NEXT:    st1b { z0.d }, p0, [z4.d]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_scatter_v8i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ldr d0, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    cmeq v1.8b, v0.8b, #0
; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z1.d, #0
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    st1b { z0.d }, p1, [z1.d]
; VBITS_GE_512-NEXT:    ret
  %vals = load <8 x i8>, ptr %a
  %ptrs = load <8 x ptr>, ptr %b
  %mask = icmp eq <8 x i8> %vals, zeroinitializer
  call void @llvm.masked.scatter.v8i8(<8 x i8> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
  ret void
}

; Masked scatter of <16 x i8>: single st1b when VL >= 1024 bits (vscale_range(8,0)).
define void @masked_scatter_v16i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_scatter_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    cmeq v1.16b, v0.16b, #0
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z1.h, z1.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    sunpklo z1.d, z1.s
; CHECK-NEXT:    cmpne p1.d, p0/z, z1.d, #0
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    st1b { z0.d }, p1, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <16 x i8>, ptr %a
  %ptrs = load <16 x ptr>, ptr %b
  %mask = icmp eq <16 x i8> %vals, zeroinitializer
  call void @llvm.masked.scatter.v16i8(<16 x i8> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask)
  ret void
}

; Masked scatter of <32 x i8>: fully SVE path, mask unpacked via punpklo chain.
define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    uunpklo z1.h, z0.b
; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
; CHECK-NEXT:    uunpklo z0.s, z1.h
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1b { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <32 x i8>, ptr %a
  %ptrs = load <32 x ptr>, ptr %b
  %mask = icmp eq <32 x i8> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32i8(<32 x i8> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

;
; ST1H
;

; Masked scatter of <2 x i16>: widened to 64-bit containers, stored via st1h.
define void @masked_scatter_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrh w8, [x0]
; CHECK-NEXT:    ldrh w9, [x0, #2]
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    fmov s0, w8
; CHECK-NEXT:    mov v0.s[1], w9
; CHECK-NEXT:    cmeq v1.2s, v0.2s, #0
; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-NEXT:    sshll v1.2d, v1.2s, #0
; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <2 x i16>, ptr %a
  %ptrs = load <2 x ptr>, ptr %b
  %mask = icmp eq <2 x i16> %vals, zeroinitializer
  call void @llvm.masked.scatter.v2i16(<2 x i16> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask)
  ret void
}

; Masked scatter of <4 x i16>: unpacked to 64-bit elements, stored via st1h.
define void @masked_scatter_v4i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    cmeq v1.4h, v0.4h, #0
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    sunpklo z1.d, z1.s
; CHECK-NEXT:    cmpne p1.d, p0/z, z1.d, #0
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    st1h { z0.d }, p1, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <4 x i16>, ptr %a
  %ptrs = load <4 x ptr>, ptr %b
  %mask = icmp eq <4 x i16> %vals, zeroinitializer
  call void @llvm.masked.scatter.v4i16(<4 x i16> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
  ret void
}

; Masked scatter of <8 x i16>: split into two st1h ops at 256-bit VL, one at >=512-bit VL.
define void @masked_scatter_v8i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: masked_scatter_v8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v1.8h, v0.8h, #0
; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z3.d, z3.s
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z2.d, #0
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    st1h { z3.d }, p1, [z2.d]
; VBITS_GE_256-NEXT:    st1h { z0.d }, p0, [z4.d]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_scatter_v8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ldr q0, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    cmeq v1.8h, v0.8h, #0
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z1.d, #0
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    st1h { z0.d }, p1, [z1.d]
; VBITS_GE_512-NEXT:    ret
  %vals = load <8 x i16>, ptr %a
  %ptrs = load <8 x ptr>, ptr %b
  %mask = icmp eq <8 x i16> %vals, zeroinitializer
  call void @llvm.masked.scatter.v8i16(<8 x i16> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
  ret void
}

; Masked scatter of <16 x i16>: fully SVE path with predicate unpacking.
define void @masked_scatter_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_scatter_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ptrue p1.d, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <16 x i16>, ptr %a
  %ptrs = load <16 x ptr>, ptr %b
  %mask = icmp eq <16 x i16> %vals, zeroinitializer
  call void @llvm.masked.scatter.v16i16(<16 x i16> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask)
  ret void
}

; Masked scatter of <32 x i16>: same SVE lowering shape as v16i16, wider VL.
define void @masked_scatter_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <32 x i16>, ptr %a
  %ptrs = load <32 x ptr>, ptr %b
  %mask = icmp eq <32 x i16> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32i16(<32 x i16> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

;
; ST1W
;

; Masked scatter of <2 x i32>: widened to 64-bit containers, stored via st1w.
define void @masked_scatter_v2i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    cmeq v1.2s, v0.2s, #0
; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-NEXT:    sshll v1.2d, v1.2s, #0
; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <2 x i32>, ptr %a
  %ptrs = load <2 x ptr>, ptr %b
  %mask = icmp eq <2 x i32> %vals, zeroinitializer
  call void @llvm.masked.scatter.v2i32(<2 x i32> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask)
  ret void
}

; Masked scatter of <4 x i32>: unpacked to 64-bit elements, stored via st1w.
define void @masked_scatter_v4i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    cmeq v1.4s, v0.4s, #0
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    sunpklo z1.d, z1.s
; CHECK-NEXT:    cmpne p1.d, p0/z, z1.d, #0
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    st1w { z0.d }, p1, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <4 x i32>, ptr %a
  %ptrs = load <4 x ptr>, ptr %b
  %mask = icmp eq <4 x i32> %vals, zeroinitializer
  call void @llvm.masked.scatter.v4i32(<4 x i32> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
  ret void
}

; Masked scatter of <8 x i32>: split into two st1w ops at 256-bit VL, one at >=512-bit VL.
define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: masked_scatter_v8i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1]
; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    uunpklo z2.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    punpklo p0.h, p0.b
; VBITS_GE_256-NEXT:    and p0.b, p0/z, p0.b, p1.b
; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT:    st1w { z2.d }, p0, [z3.d]
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    cmpne p0.d, p1/z, z1.d, #0
; VBITS_GE_256-NEXT:    st1w { z0.d }, p0, [z4.d]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_scatter_v8i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    ptrue p1.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p1/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
; VBITS_GE_512-NEXT:    st1w { z0.d }, p0, [z1.d]
; VBITS_GE_512-NEXT:    ret
  %vals = load <8 x i32>, ptr %a
  %ptrs = load <8 x ptr>, ptr %b
  %mask = icmp eq <8 x i32> %vals, zeroinitializer
  call void @llvm.masked.scatter.v8i32(<8 x i32> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
  ret void
}

; Masked scatter of <16 x i32>: single st1w, mask via cmpeq + punpklo.
define void @masked_scatter_v16i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_scatter_v16i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    ptrue p1.d, vl16
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <16 x i32>, ptr %a
  %ptrs = load <16 x ptr>, ptr %b
  %mask = icmp eq <16 x i32> %vals, zeroinitializer
  call void @llvm.masked.scatter.v16i32(<16 x i32> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask)
  ret void
}

; Masked scatter of <32 x i32>: same lowering shape as v16i32, wider VL.
define void @masked_scatter_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <32 x i32>, ptr %a
  %ptrs = load <32 x ptr>, ptr %b
  %mask = icmp eq <32 x i32> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32i32(<32 x i32> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

;
; ST1D
;

; Scalarize 1 x i64 scatters
; <1 x i64> scatter is scalarized into a conditional store (cbnz + str).
define void @masked_scatter_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    cbnz x8, .LBB15_2
; CHECK-NEXT:  // %bb.1: // %cond.store
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    fmov x8, d1
; CHECK-NEXT:    str d0, [x8]
; CHECK-NEXT:  .LBB15_2: // %else
; CHECK-NEXT:    ret
  %vals = load <1 x i64>, ptr %a
  %ptrs = load <1 x ptr>, ptr %b
  %mask = icmp eq <1 x i64> %vals, zeroinitializer
  call void @llvm.masked.scatter.v1i64(<1 x i64> %vals, <1 x ptr> %ptrs, i32 8, <1 x i1> %mask)
  ret void
}

; Masked scatter of <2 x i64>: element type already matches .d, stored via st1d.
define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    cmeq v1.2d, v0.2d, #0
; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <2 x i64>, ptr %a
  %ptrs = load <2 x ptr>, ptr %b
  %mask = icmp eq <2 x i64> %vals, zeroinitializer
  call void @llvm.masked.scatter.v2i64(<2 x i64> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask)
  ret void
}

; Masked scatter of <4 x i64>: compare and scatter entirely in SVE registers.
define void @masked_scatter_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    st1d { z0.d }, p1, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <4 x i64>, ptr %a
  %ptrs = load <4 x ptr>, ptr %b
  %mask = icmp eq <4 x i64> %vals, zeroinitializer
  call void @llvm.masked.scatter.v4i64(<4 x i64> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
  ret void
}

; Masked scatter of <8 x i64>: two st1d ops at 256-bit VL, one at >=512-bit VL.
define void @masked_scatter_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: masked_scatter_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p0.d, p0/z, z2.d, #0
; VBITS_GE_256-NEXT:    st1d { z0.d }, p1, [z1.d]
; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [z3.d]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_scatter_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    st1d { z0.d }, p1, [z1.d]
; VBITS_GE_512-NEXT:    ret
  %vals = load <8 x i64>, ptr %a
  %ptrs = load <8 x ptr>, ptr %b
  %mask = icmp eq <8 x i64> %vals, zeroinitializer
  call void @llvm.masked.scatter.v8i64(<8 x i64> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
  ret void
}

; Masked scatter of <16 x i64>: single st1d at >=1024-bit VL.
define void @masked_scatter_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_scatter_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    st1d { z0.d }, p1, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <16 x i64>, ptr %a
  %ptrs = load <16 x ptr>, ptr %b
  %mask = icmp eq <16 x i64> %vals, zeroinitializer
  call void @llvm.masked.scatter.v16i64(<16 x i64> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask)
  ret void
}

; Masked scatter of <32 x i64>: single st1d at >=2048-bit VL.
define void @masked_scatter_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    st1d { z0.d }, p1, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <32 x i64>, ptr %a
  %ptrs = load <32 x ptr>, ptr %b
  %mask = icmp eq <32 x i64> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32i64(<32 x i64> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

;
; ST1H (float)
;

; Masked scatter of <2 x half>: fp compare mask rebuilt element-wise, stored via st1h.
define void @masked_scatter_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s1, [x0]
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    fcmeq v2.4h, v1.4h, #0.0
; CHECK-NEXT:    uunpklo z1.s, z1.h
; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
; CHECK-NEXT:    mov v0.h[0], v2.h[0]
; CHECK-NEXT:    mov w8, v2.s[1]
; CHECK-NEXT:    mov v0.h[1], w8
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT:    uunpklo z0.d, z1.s
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <2 x half>, ptr %a
  %ptrs = load <2 x ptr>, ptr %b
  %mask = fcmp oeq <2 x half> %vals, zeroinitializer
  call void @llvm.masked.scatter.v2f16(<2 x half> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask)
  ret void
}

; Masked scatter of <4 x half>: unpacked to 64-bit elements, stored via st1h.
define void @masked_scatter_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    fcmeq v1.4h, v0.4h, #0.0
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    sunpklo z1.d, z1.s
; CHECK-NEXT:    cmpne p1.d, p0/z, z1.d, #0
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    st1h { z0.d }, p1, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <4 x half>, ptr %a
  %ptrs = load <4 x ptr>, ptr %b
  %mask = fcmp oeq <4 x half> %vals, zeroinitializer
  call void @llvm.masked.scatter.v4f16(<4 x half> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
  ret void
}

; Masked scatter of <8 x half>: two st1h ops at 256-bit VL, one at >=512-bit VL.
define void @masked_scatter_v8f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: masked_scatter_v8f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    fcmeq v1.8h, v0.8h, #0.0
; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z3.d, z3.s
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z2.d, #0
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    st1h { z3.d }, p1, [z2.d]
; VBITS_GE_256-NEXT:    st1h { z0.d }, p0, [z4.d]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_scatter_v8f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ldr q0, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    fcmeq v1.8h, v0.8h, #0.0
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z1.d, #0
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    st1h { z0.d }, p1, [z1.d]
; VBITS_GE_512-NEXT:    ret
  %vals = load <8 x half>, ptr %a
  %ptrs = load <8 x ptr>, ptr %b
  %mask = fcmp oeq <8 x half> %vals, zeroinitializer
  call void @llvm.masked.scatter.v8f16(<8 x half> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
  ret void
}

; Masked scatter of <16 x half>: fully SVE path with fcmeq predicate.
define void @masked_scatter_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_scatter_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ptrue p1.d, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <16 x half>, ptr %a
  %ptrs = load <16 x ptr>, ptr %b
  %mask = fcmp oeq <16 x half> %vals, zeroinitializer
  call void @llvm.masked.scatter.v16f16(<16 x half> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask)
  ret void
}

; Masked scatter of <32 x half>: same lowering shape as v16f16, wider VL.
define void @masked_scatter_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_v32f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <32 x half>, ptr %a
  %ptrs = load <32 x ptr>, ptr %b
  %mask = fcmp oeq <32 x half> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

;
; ST1W (float)
;

; Masked scatter of <2 x float>: widened to 64-bit containers, stored via st1w.
define void @masked_scatter_v2f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    fcmeq v1.2s, v0.2s, #0.0
; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-NEXT:    sshll v1.2d, v1.2s, #0
; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <2 x float>, ptr %a
  %ptrs = load <2 x ptr>, ptr %b
  %mask = fcmp oeq <2 x float> %vals, zeroinitializer
  call void @llvm.masked.scatter.v2f32(<2 x float> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask)
  ret void
}

; Masked scatter of <4 x float>: unpacked to 64-bit elements, stored via st1w.
define void @masked_scatter_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    fcmeq v1.4s, v0.4s, #0.0
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    sunpklo z1.d, z1.s
; CHECK-NEXT:    cmpne p1.d, p0/z, z1.d, #0
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    st1w { z0.d }, p1, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <4 x float>, ptr %a
  %ptrs = load <4 x ptr>, ptr %b
  %mask = fcmp oeq <4 x float> %vals, zeroinitializer
  call void @llvm.masked.scatter.v4f32(<4 x float> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
  ret void
}

; Masked scatter of <8 x float>: two st1w ops at 256-bit VL, one at >=512-bit VL.
define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: masked_scatter_v8f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1]
; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
; VBITS_GE_256-NEXT:    uunpklo z2.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    punpklo p0.h, p0.b
; VBITS_GE_256-NEXT:    and p0.b, p0/z, p0.b, p1.b
; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT:    st1w { z2.d }, p0, [z3.d]
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    cmpne p0.d, p1/z, z1.d, #0
; VBITS_GE_256-NEXT:    st1w { z0.d }, p0, [z4.d]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_scatter_v8f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    ptrue p1.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p1/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
; VBITS_GE_512-NEXT:    st1w { z0.d }, p0, [z1.d]
; VBITS_GE_512-NEXT:    ret
  %vals = load <8 x float>, ptr %a
  %ptrs = load <8 x ptr>, ptr %b
  %mask = fcmp oeq <8 x float> %vals, zeroinitializer
  call void @llvm.masked.scatter.v8f32(<8 x float> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
  ret void
}

; Masked scatter of <16 x float>: single st1w, mask via fcmeq + punpklo.
define void @masked_scatter_v16f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_scatter_v16f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    ptrue p1.d, vl16
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <16 x float>, ptr %a
  %ptrs = load <16 x ptr>, ptr %b
  %mask = fcmp oeq <16 x float> %vals, zeroinitializer
  call void @llvm.masked.scatter.v16f32(<16 x float> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask)
  ret void
}

; Masked scatter of <32 x float>: same lowering shape as v16f32, wider VL.
define void @masked_scatter_v32f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <32 x float>, ptr %a
  %ptrs = load <32 x ptr>, ptr %b
  %mask = fcmp oeq <32 x float> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

;
; ST1D (float)
;

; Scalarize 1 x double scatters
; <1 x double> scatter is scalarized into a conditional store (fcmp + b.ne + str).
define void @masked_scatter_v1f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_scatter_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    fcmp d0, #0.0
; CHECK-NEXT:    b.ne .LBB31_2
; CHECK-NEXT:  // %bb.1: // %cond.store
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    fmov x8, d1
; CHECK-NEXT:    str d0, [x8]
; CHECK-NEXT:  .LBB31_2: // %else
; CHECK-NEXT:    ret
  %vals = load <1 x double>, ptr %a
  %ptrs = load <1 x ptr>, ptr %b
  %mask = fcmp oeq <1 x double> %vals, zeroinitializer
  call void @llvm.masked.scatter.v1f64(<1 x double> %vals, <1 x ptr> %ptrs, i32 8, <1 x i1> %mask)
  ret void
}

812define void @masked_scatter_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
813; CHECK-LABEL: masked_scatter_v2f64:
814; CHECK:       // %bb.0:
815; CHECK-NEXT:    ldr q0, [x0]
816; CHECK-NEXT:    ptrue p0.d, vl2
817; CHECK-NEXT:    fcmeq v1.2d, v0.2d, #0.0
818; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
819; CHECK-NEXT:    ldr q1, [x1]
820; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
821; CHECK-NEXT:    ret
822  %vals = load <2 x double>, ptr %a
823  %ptrs = load <2 x ptr>, ptr %b
824  %mask = fcmp oeq <2 x double> %vals, zeroinitializer
825  call void @llvm.masked.scatter.v2f64(<2 x double> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask)
826  ret void
827}
828
829define void @masked_scatter_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
830; CHECK-LABEL: masked_scatter_v4f64:
831; CHECK:       // %bb.0:
832; CHECK-NEXT:    ptrue p0.d, vl4
833; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
834; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
835; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
836; CHECK-NEXT:    st1d { z0.d }, p1, [z1.d]
837; CHECK-NEXT:    ret
838  %vals = load <4 x double>, ptr %a
839  %ptrs = load <4 x ptr>, ptr %b
840  %mask = fcmp oeq <4 x double> %vals, zeroinitializer
841  call void @llvm.masked.scatter.v4f64(<4 x double> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
842  ret void
843}
844
845define void @masked_scatter_v8f64(ptr %a, ptr %b) #0 {
846; VBITS_GE_256-LABEL: masked_scatter_v8f64:
847; VBITS_GE_256:       // %bb.0:
848; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
849; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
850; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
851; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0, x8, lsl #3]
852; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
853; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
854; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
855; VBITS_GE_256-NEXT:    fcmeq p0.d, p0/z, z2.d, #0.0
856; VBITS_GE_256-NEXT:    st1d { z0.d }, p1, [z1.d]
857; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [z3.d]
858; VBITS_GE_256-NEXT:    ret
859;
860; VBITS_GE_512-LABEL: masked_scatter_v8f64:
861; VBITS_GE_512:       // %bb.0:
862; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
863; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
864; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
865; VBITS_GE_512-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
866; VBITS_GE_512-NEXT:    st1d { z0.d }, p1, [z1.d]
867; VBITS_GE_512-NEXT:    ret
868  %vals = load <8 x double>, ptr %a
869  %ptrs = load <8 x ptr>, ptr %b
870  %mask = fcmp oeq <8 x double> %vals, zeroinitializer
871  call void @llvm.masked.scatter.v8f64(<8 x double> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
872  ret void
873}
874
875define void @masked_scatter_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
876; CHECK-LABEL: masked_scatter_v16f64:
877; CHECK:       // %bb.0:
878; CHECK-NEXT:    ptrue p0.d, vl16
879; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
880; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
881; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
882; CHECK-NEXT:    st1d { z0.d }, p1, [z1.d]
883; CHECK-NEXT:    ret
884  %vals = load <16 x double>, ptr %a
885  %ptrs = load <16 x ptr>, ptr %b
886  %mask = fcmp oeq <16 x double> %vals, zeroinitializer
887  call void @llvm.masked.scatter.v16f64(<16 x double> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask)
888  ret void
889}
890
891define void @masked_scatter_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
892; CHECK-LABEL: masked_scatter_v32f64:
893; CHECK:       // %bb.0:
894; CHECK-NEXT:    ptrue p0.d, vl32
895; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
896; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
897; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
898; CHECK-NEXT:    st1d { z0.d }, p1, [z1.d]
899; CHECK-NEXT:    ret
900  %vals = load <32 x double>, ptr %a
901  %ptrs = load <32 x ptr>, ptr %b
902  %mask = fcmp oeq <32 x double> %vals, zeroinitializer
903  call void @llvm.masked.scatter.v32f64(<32 x double> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
904  ret void
905}
906
907; The above tests test the types, the below tests check that the addressing
908; modes still function
909
910define void @masked_scatter_32b_scaled_sext_f16(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
911; CHECK-LABEL: masked_scatter_32b_scaled_sext_f16:
912; CHECK:       // %bb.0:
913; CHECK-NEXT:    ptrue p0.h, vl32
914; CHECK-NEXT:    ptrue p1.s, vl32
915; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
916; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
917; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
918; CHECK-NEXT:    uunpklo z0.s, z0.h
919; CHECK-NEXT:    punpklo p0.h, p0.b
920; CHECK-NEXT:    st1h { z0.s }, p0, [x2, z1.s, sxtw #1]
921; CHECK-NEXT:    ret
922  %vals = load <32 x half>, ptr %a
923  %idxs = load <32 x i32>, ptr %b
924  %ext = sext <32 x i32> %idxs to <32 x i64>
925  %ptrs = getelementptr half, ptr %base, <32 x i64> %ext
926  %mask = fcmp oeq <32 x half> %vals, zeroinitializer
927  call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
928  ret void
929}
930
931define void @masked_scatter_32b_scaled_sext_f32(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
932; CHECK-LABEL: masked_scatter_32b_scaled_sext_f32:
933; CHECK:       // %bb.0:
934; CHECK-NEXT:    ptrue p0.s, vl32
935; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
936; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
937; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
938; CHECK-NEXT:    st1w { z0.s }, p1, [x2, z1.s, sxtw #2]
939; CHECK-NEXT:    ret
940  %vals = load <32 x float>, ptr %a
941  %idxs = load <32 x i32>, ptr %b
942  %ext = sext <32 x i32> %idxs to <32 x i64>
943  %ptrs = getelementptr float, ptr %base, <32 x i64> %ext
944  %mask = fcmp oeq <32 x float> %vals, zeroinitializer
945  call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
946  ret void
947}
948
949define void @masked_scatter_32b_scaled_sext_f64(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
950; CHECK-LABEL: masked_scatter_32b_scaled_sext_f64:
951; CHECK:       // %bb.0:
952; CHECK-NEXT:    ptrue p0.d, vl32
953; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
954; CHECK-NEXT:    ld1sw { z1.d }, p0/z, [x1]
955; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
956; CHECK-NEXT:    st1d { z0.d }, p1, [x2, z1.d, lsl #3]
957; CHECK-NEXT:    ret
958  %vals = load <32 x double>, ptr %a
959  %idxs = load <32 x i32>, ptr %b
960  %ext = sext <32 x i32> %idxs to <32 x i64>
961  %ptrs = getelementptr double, ptr %base, <32 x i64> %ext
962  %mask = fcmp oeq <32 x double> %vals, zeroinitializer
963  call void @llvm.masked.scatter.v32f64(<32 x double> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
964  ret void
965}
966
967define void @masked_scatter_32b_scaled_zext(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
968; CHECK-LABEL: masked_scatter_32b_scaled_zext:
969; CHECK:       // %bb.0:
970; CHECK-NEXT:    ptrue p0.h, vl32
971; CHECK-NEXT:    ptrue p1.s, vl32
972; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
973; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
974; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
975; CHECK-NEXT:    uunpklo z0.s, z0.h
976; CHECK-NEXT:    punpklo p0.h, p0.b
977; CHECK-NEXT:    st1h { z0.s }, p0, [x2, z1.s, uxtw #1]
978; CHECK-NEXT:    ret
979  %vals = load <32 x half>, ptr %a
980  %idxs = load <32 x i32>, ptr %b
981  %ext = zext <32 x i32> %idxs to <32 x i64>
982  %ptrs = getelementptr half, ptr %base, <32 x i64> %ext
983  %mask = fcmp oeq <32 x half> %vals, zeroinitializer
984  call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
985  ret void
986}
987
988define void @masked_scatter_32b_unscaled_sext(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
989; CHECK-LABEL: masked_scatter_32b_unscaled_sext:
990; CHECK:       // %bb.0:
991; CHECK-NEXT:    ptrue p0.h, vl32
992; CHECK-NEXT:    ptrue p1.s, vl32
993; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
994; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
995; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
996; CHECK-NEXT:    uunpklo z0.s, z0.h
997; CHECK-NEXT:    punpklo p0.h, p0.b
998; CHECK-NEXT:    st1h { z0.s }, p0, [x2, z1.s, sxtw]
999; CHECK-NEXT:    ret
1000  %vals = load <32 x half>, ptr %a
1001  %idxs = load <32 x i32>, ptr %b
1002  %ext = sext <32 x i32> %idxs to <32 x i64>
1003  %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %ext
1004  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
1005  %mask = fcmp oeq <32 x half> %vals, zeroinitializer
1006  call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
1007  ret void
1008}
1009
1010define void @masked_scatter_32b_unscaled_zext(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
1011; CHECK-LABEL: masked_scatter_32b_unscaled_zext:
1012; CHECK:       // %bb.0:
1013; CHECK-NEXT:    ptrue p0.h, vl32
1014; CHECK-NEXT:    ptrue p1.s, vl32
1015; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1016; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
1017; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
1018; CHECK-NEXT:    uunpklo z0.s, z0.h
1019; CHECK-NEXT:    punpklo p0.h, p0.b
1020; CHECK-NEXT:    st1h { z0.s }, p0, [x2, z1.s, uxtw]
1021; CHECK-NEXT:    ret
1022  %vals = load <32 x half>, ptr %a
1023  %idxs = load <32 x i32>, ptr %b
1024  %ext = zext <32 x i32> %idxs to <32 x i64>
1025  %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %ext
1026  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
1027  %mask = fcmp oeq <32 x half> %vals, zeroinitializer
1028  call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
1029  ret void
1030}
1031
1032define void @masked_scatter_64b_scaled(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
1033; CHECK-LABEL: masked_scatter_64b_scaled:
1034; CHECK:       // %bb.0:
1035; CHECK-NEXT:    ptrue p0.s, vl32
1036; CHECK-NEXT:    ptrue p1.d, vl32
1037; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1038; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
1039; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
1040; CHECK-NEXT:    uunpklo z0.d, z0.s
1041; CHECK-NEXT:    punpklo p0.h, p0.b
1042; CHECK-NEXT:    st1w { z0.d }, p0, [x2, z1.d, lsl #2]
1043; CHECK-NEXT:    ret
1044  %vals = load <32 x float>, ptr %a
1045  %idxs = load <32 x i64>, ptr %b
1046  %ptrs = getelementptr float, ptr %base, <32 x i64> %idxs
1047  %mask = fcmp oeq <32 x float> %vals, zeroinitializer
1048  call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
1049  ret void
1050}
1051
1052define void @masked_scatter_64b_unscaled(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
1053; CHECK-LABEL: masked_scatter_64b_unscaled:
1054; CHECK:       // %bb.0:
1055; CHECK-NEXT:    ptrue p0.s, vl32
1056; CHECK-NEXT:    ptrue p1.d, vl32
1057; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1058; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
1059; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
1060; CHECK-NEXT:    uunpklo z0.d, z0.s
1061; CHECK-NEXT:    punpklo p0.h, p0.b
1062; CHECK-NEXT:    st1w { z0.d }, p0, [x2, z1.d]
1063; CHECK-NEXT:    ret
1064  %vals = load <32 x float>, ptr %a
1065  %idxs = load <32 x i64>, ptr %b
1066  %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %idxs
1067  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
1068  %mask = fcmp oeq <32 x float> %vals, zeroinitializer
1069  call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
1070  ret void
1071}
1072
1073define void @masked_scatter_vec_plus_reg(ptr %a, ptr %b, i64 %off) vscale_range(16,0) #0 {
1074; CHECK-LABEL: masked_scatter_vec_plus_reg:
1075; CHECK:       // %bb.0:
1076; CHECK-NEXT:    ptrue p0.s, vl32
1077; CHECK-NEXT:    ptrue p1.d, vl32
1078; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1079; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
1080; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
1081; CHECK-NEXT:    uunpklo z0.d, z0.s
1082; CHECK-NEXT:    punpklo p0.h, p0.b
1083; CHECK-NEXT:    st1w { z0.d }, p0, [x2, z1.d]
1084; CHECK-NEXT:    ret
1085  %vals = load <32 x float>, ptr %a
1086  %bases = load <32 x ptr>, ptr %b
1087  %byte_ptrs = getelementptr i8, <32 x ptr> %bases, i64 %off
1088  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
1089  %mask = fcmp oeq <32 x float> %vals, zeroinitializer
1090  call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
1091  ret void
1092}
1093
1094define void @masked_scatter_vec_plus_imm(ptr %a, ptr %b) vscale_range(16,0) #0 {
1095; CHECK-LABEL: masked_scatter_vec_plus_imm:
1096; CHECK:       // %bb.0:
1097; CHECK-NEXT:    ptrue p0.s, vl32
1098; CHECK-NEXT:    ptrue p1.d, vl32
1099; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1100; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
1101; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
1102; CHECK-NEXT:    uunpklo z0.d, z0.s
1103; CHECK-NEXT:    punpklo p0.h, p0.b
1104; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d, #4]
1105; CHECK-NEXT:    ret
1106  %vals = load <32 x float>, ptr %a
1107  %bases = load <32 x ptr>, ptr %b
1108  %byte_ptrs = getelementptr i8, <32 x ptr> %bases, i64 4
1109  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
1110  %mask = fcmp oeq <32 x float> %vals, zeroinitializer
1111  call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
1112  ret void
1113}
1114
1115; extract_subvec(...(insert_subvec(a,b,c))) -> extract_subvec(bitcast(b),d) like
1116; combines can effectively unlegalise bitcast operations. This test ensures such
1117; combines do not happen after operation legalisation. When not prevented the
1118; test triggers infinite combine->legalise->combine->...
1119;
1120; NOTE: For this test to function correctly it's critical for %vals to be in a
1121; different block to the scatter store.  If not, the problematic bitcast will be
1122; removed before operation legalisation and thus not exercise the combine.
1123define void @masked_scatter_bitcast_infinite_loop(ptr %a, ptr %b, i1 %cond) vscale_range(4,0) #0 {
1124; CHECK-LABEL: masked_scatter_bitcast_infinite_loop:
1125; CHECK:       // %bb.0:
1126; CHECK-NEXT:    ptrue p0.d, vl8
1127; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1128; CHECK-NEXT:    tbz w2, #0, .LBB47_2
1129; CHECK-NEXT:  // %bb.1: // %bb.1
1130; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
1131; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
1132; CHECK-NEXT:    st1d { z0.d }, p1, [z1.d]
1133; CHECK-NEXT:  .LBB47_2: // %bb.2
1134; CHECK-NEXT:    ret
1135  %vals = load volatile <8 x double>, ptr %a
1136  br i1 %cond, label %bb.1, label %bb.2
1137
1138bb.1:
1139  %ptrs = load <8 x ptr>, ptr %b
1140  %mask = fcmp oeq <8 x double> %vals, zeroinitializer
1141  call void @llvm.masked.scatter.v8f64(<8 x double> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
1142  br label %bb.2
1143
1144bb.2:
1145  ret void
1146}
1147
1148declare void @llvm.masked.scatter.v2i8(<2 x i8>, <2 x ptr>, i32, <2 x i1>)
1149declare void @llvm.masked.scatter.v4i8(<4 x i8>, <4 x ptr>, i32, <4 x i1>)
1150declare void @llvm.masked.scatter.v8i8(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
1151declare void @llvm.masked.scatter.v16i8(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
1152declare void @llvm.masked.scatter.v32i8(<32 x i8>, <32 x ptr>, i32, <32 x i1>)
1153
1154declare void @llvm.masked.scatter.v2i16(<2 x i16>, <2 x ptr>, i32, <2 x i1>)
1155declare void @llvm.masked.scatter.v4i16(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
1156declare void @llvm.masked.scatter.v8i16(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
1157declare void @llvm.masked.scatter.v16i16(<16 x i16>, <16 x ptr>, i32, <16 x i1>)
1158declare void @llvm.masked.scatter.v32i16(<32 x i16>, <32 x ptr>, i32, <32 x i1>)
1159
1160declare void @llvm.masked.scatter.v2i32(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
1161declare void @llvm.masked.scatter.v4i32(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
1162declare void @llvm.masked.scatter.v8i32(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
1163declare void @llvm.masked.scatter.v16i32(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
1164declare void @llvm.masked.scatter.v32i32(<32 x i32>, <32 x ptr>, i32, <32 x i1>)
1165
1166declare void @llvm.masked.scatter.v1i64(<1 x i64>, <1 x ptr>, i32, <1 x i1>)
1167declare void @llvm.masked.scatter.v2i64(<2 x i64>, <2 x ptr>, i32, <2 x i1>)
1168declare void @llvm.masked.scatter.v4i64(<4 x i64>, <4 x ptr>, i32, <4 x i1>)
1169declare void @llvm.masked.scatter.v8i64(<8 x i64>, <8 x ptr>, i32, <8 x i1>)
1170declare void @llvm.masked.scatter.v16i64(<16 x i64>, <16 x ptr>, i32, <16 x i1>)
1171declare void @llvm.masked.scatter.v32i64(<32 x i64>, <32 x ptr>, i32, <32 x i1>)
1172
1173declare void @llvm.masked.scatter.v2f16(<2 x half>, <2 x ptr>, i32, <2 x i1>)
1174declare void @llvm.masked.scatter.v4f16(<4 x half>, <4 x ptr>, i32, <4 x i1>)
1175declare void @llvm.masked.scatter.v8f16(<8 x half>, <8 x ptr>, i32, <8 x i1>)
1176declare void @llvm.masked.scatter.v16f16(<16 x half>, <16 x ptr>, i32, <16 x i1>)
1177declare void @llvm.masked.scatter.v32f16(<32 x half>, <32 x ptr>, i32, <32 x i1>)
1178
1179declare void @llvm.masked.scatter.v2f32(<2 x float>, <2 x ptr>, i32, <2 x i1>)
1180declare void @llvm.masked.scatter.v4f32(<4 x float>, <4 x ptr>, i32, <4 x i1>)
1181declare void @llvm.masked.scatter.v8f32(<8 x float>, <8 x ptr>, i32, <8 x i1>)
1182declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x ptr>, i32, <16 x i1>)
1183declare void @llvm.masked.scatter.v32f32(<32 x float>, <32 x ptr>, i32, <32 x i1>)
1184
1185declare void @llvm.masked.scatter.v1f64(<1 x double>, <1 x ptr>, i32, <1 x i1>)
1186declare void @llvm.masked.scatter.v2f64(<2 x double>, <2 x ptr>, i32, <2 x i1>)
1187declare void @llvm.masked.scatter.v4f64(<4 x double>, <4 x ptr>, i32, <4 x i1>)
1188declare void @llvm.masked.scatter.v8f64(<8 x double>, <8 x ptr>, i32, <8 x i1>)
1189declare void @llvm.masked.scatter.v16f64(<16 x double>, <16 x ptr>, i32, <16 x i1>)
1190declare void @llvm.masked.scatter.v32f64(<32 x double>, <32 x ptr>, i32, <32 x i1>)
1191
1192attributes #0 = { "target-features"="+sve" }
1193