xref: /llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll (revision cc82f1290a1e2157a6c0530d78d8cc84d2b8553d)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
5
6target triple = "aarch64-unknown-linux-gnu"
7
8;
9; LD1B
10;
11
; Masked gather of <2 x i8>: the mask comes from comparing the <2 x i8> loaded
; from %a against zero, the addresses are the <2 x ptr> loaded from %b, and the
; gathered vector is stored back to %a.
; vscale_range(2,0) sets a minimum vscale of 2 with no upper bound; the output
; is checked under the common CHECK prefix used by every RUN line.
12define void @masked_gather_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
13; CHECK-LABEL: masked_gather_v2i8:
14; CHECK:       // %bb.0:
15; CHECK-NEXT:    ldrb w8, [x0]
16; CHECK-NEXT:    ldrb w9, [x0, #1]
17; CHECK-NEXT:    ptrue p0.d, vl2
18; CHECK-NEXT:    fmov s0, w8
19; CHECK-NEXT:    mov v0.s[1], w9
20; CHECK-NEXT:    cmeq v0.2s, v0.2s, #0
21; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
22; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
23; CHECK-NEXT:    ldr q0, [x1]
24; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
25; CHECK-NEXT:    ptrue p0.s, vl2
26; CHECK-NEXT:    xtn v0.2s, v0.2d
27; CHECK-NEXT:    st1b { z0.s }, p0, [x0]
28; CHECK-NEXT:    ret
29  %cval = load <2 x i8>, ptr %a
30  %ptrs = load <2 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
31  %mask = icmp eq <2 x i8> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
32  %vals = call <2 x i8> @llvm.masked.gather.v2i8(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x i8> undef)
33  store <2 x i8> %vals, ptr %a
34  ret void
35}
36
; Masked gather of <4 x i8>: mask = (<4 x i8> loaded from %a) == 0, addresses =
; <4 x ptr> loaded from %b, result stored back to %a.
; vscale_range(2,0) sets a minimum vscale of 2 with no upper bound; the
; expected code is a single ld1b gather under "ptrue p0.d, vl4".
37define void @masked_gather_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
38; CHECK-LABEL: masked_gather_v4i8:
39; CHECK:       // %bb.0:
40; CHECK-NEXT:    ldr s0, [x0]
41; CHECK-NEXT:    ptrue p0.d, vl4
42; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
43; CHECK-NEXT:    cmeq v0.4h, v0.4h, #0
44; CHECK-NEXT:    sunpklo z0.s, z0.h
45; CHECK-NEXT:    sunpklo z0.d, z0.s
46; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
47; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
48; CHECK-NEXT:    ld1b { z0.d }, p1/z, [z0.d]
49; CHECK-NEXT:    st1b { z0.d }, p0, [x0]
50; CHECK-NEXT:    ret
51  %cval = load <4 x i8>, ptr %a
52  %ptrs = load <4 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
53  %mask = icmp eq <4 x i8> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
54  %vals = call <4 x i8> @llvm.masked.gather.v4i8(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x i8> undef)
55  store <4 x i8> %vals, ptr %a
56  ret void
57}
58
; Masked gather of <8 x i8>: mask = (<8 x i8> loaded from %a) == 0, addresses =
; <8 x ptr> loaded from %b, result stored back to %a.
; No vscale_range attribute is given, and the expected code differs per check
; prefix: VBITS_GE_256 issues two ld1b gathers under "ptrue p0.d, vl4" (the
; second pointer half is loaded from [x1, x8, lsl #3] with x8 = 4), while
; VBITS_GE_512 issues a single ld1b gather under "ptrue p0.d, vl8".
59define void @masked_gather_v8i8(ptr %a, ptr %b) #0 {
60; VBITS_GE_256-LABEL: masked_gather_v8i8:
61; VBITS_GE_256:       // %bb.0:
62; VBITS_GE_256-NEXT:    ldr d0, [x0]
63; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
64; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
65; VBITS_GE_256-NEXT:    cmeq v0.8b, v0.8b, #0
66; VBITS_GE_256-NEXT:    zip2 v1.8b, v0.8b, v0.8b
67; VBITS_GE_256-NEXT:    zip1 v0.8b, v0.8b, v0.8b
68; VBITS_GE_256-NEXT:    shl v1.4h, v1.4h, #8
69; VBITS_GE_256-NEXT:    shl v0.4h, v0.4h, #8
70; VBITS_GE_256-NEXT:    sshr v1.4h, v1.4h, #8
71; VBITS_GE_256-NEXT:    sshr v0.4h, v0.4h, #8
72; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
73; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
74; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
75; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
76; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z1.d, #0
77; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
78; VBITS_GE_256-NEXT:    ld1b { z1.d }, p1/z, [z1.d]
79; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z0.d, #0
80; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1]
81; VBITS_GE_256-NEXT:    ld1b { z0.d }, p1/z, [z0.d]
82; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
83; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
84; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
85; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
86; VBITS_GE_256-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
87; VBITS_GE_256-NEXT:    str d0, [x0]
88; VBITS_GE_256-NEXT:    ret
89;
90; VBITS_GE_512-LABEL: masked_gather_v8i8:
91; VBITS_GE_512:       // %bb.0:
92; VBITS_GE_512-NEXT:    ldr d0, [x0]
93; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
94; VBITS_GE_512-NEXT:    cmeq v0.8b, v0.8b, #0
95; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
96; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
97; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
98; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z0.d, #0
99; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
100; VBITS_GE_512-NEXT:    ld1b { z0.d }, p1/z, [z0.d]
101; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
102; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
103; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
104; VBITS_GE_512-NEXT:    str d0, [x0]
105; VBITS_GE_512-NEXT:    ret
106  %cval = load <8 x i8>, ptr %a
107  %ptrs = load <8 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
108  %mask = icmp eq <8 x i8> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
109  %vals = call <8 x i8> @llvm.masked.gather.v8i8(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x i8> undef)
110  store <8 x i8> %vals, ptr %a
111  ret void
112}
113
; Masked gather of <16 x i8>: mask = (<16 x i8> loaded from %a) == 0, addresses
; = <16 x ptr> loaded from %b, result stored back to %a.
; vscale_range(8,0) sets a minimum vscale of 8 with no upper bound; the
; expected code is a single ld1b gather under "ptrue p0.d, vl16", narrowed
; back to bytes by three uzp1 steps.
114define void @masked_gather_v16i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
115; CHECK-LABEL: masked_gather_v16i8:
116; CHECK:       // %bb.0:
117; CHECK-NEXT:    ldr q0, [x0]
118; CHECK-NEXT:    ptrue p0.d, vl16
119; CHECK-NEXT:    cmeq v0.16b, v0.16b, #0
120; CHECK-NEXT:    sunpklo z0.h, z0.b
121; CHECK-NEXT:    sunpklo z0.s, z0.h
122; CHECK-NEXT:    sunpklo z0.d, z0.s
123; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
124; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
125; CHECK-NEXT:    ld1b { z0.d }, p1/z, [z0.d]
126; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
127; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
128; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
129; CHECK-NEXT:    str q0, [x0]
130; CHECK-NEXT:    ret
131  %cval = load <16 x i8>, ptr %a
132  %ptrs = load <16 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
133  %mask = icmp eq <16 x i8> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
134  %vals = call <16 x i8> @llvm.masked.gather.v16i8(<16 x ptr> %ptrs, i32 8, <16 x i1> %mask, <16 x i8> undef)
135  store <16 x i8> %vals, ptr %a
136  ret void
137}
138
; Masked gather of <32 x i8>: mask = (<32 x i8> loaded from %a) == 0, addresses
; = <32 x ptr> loaded from %b, result stored back to %a.
; vscale_range(16,0) sets a minimum vscale of 16 with no upper bound. In the
; expected code the compare is done directly on predicates (cmpeq p0.b), the
; predicate is widened by three punpklo steps, and one ld1b gather is issued.
139define void @masked_gather_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
140; CHECK-LABEL: masked_gather_v32i8:
141; CHECK:       // %bb.0:
142; CHECK-NEXT:    ptrue p0.b, vl32
143; CHECK-NEXT:    ptrue p1.d, vl32
144; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
145; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
146; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
147; CHECK-NEXT:    punpklo p0.h, p0.b
148; CHECK-NEXT:    punpklo p0.h, p0.b
149; CHECK-NEXT:    punpklo p0.h, p0.b
150; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
151; CHECK-NEXT:    st1b { z0.d }, p1, [x0]
152; CHECK-NEXT:    ret
153  %cval = load <32 x i8>, ptr %a
154  %ptrs = load <32 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
155  %mask = icmp eq <32 x i8> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
156  %vals = call <32 x i8> @llvm.masked.gather.v32i8(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x i8> undef)
157  store <32 x i8> %vals, ptr %a
158  ret void
159}
160
161;
162; LD1H
163;
164
; Masked gather of <2 x i16>: mask = (<2 x i16> loaded from %a) == 0, addresses
; = <2 x ptr> loaded from %b, result stored back to %a.
; vscale_range(2,0) sets a minimum vscale of 2 with no upper bound; the
; expected code is a single ld1h gather under "ptrue p0.d, vl2".
165define void @masked_gather_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
166; CHECK-LABEL: masked_gather_v2i16:
167; CHECK:       // %bb.0:
168; CHECK-NEXT:    ldrh w8, [x0]
169; CHECK-NEXT:    ldrh w9, [x0, #2]
170; CHECK-NEXT:    ptrue p0.d, vl2
171; CHECK-NEXT:    fmov s0, w8
172; CHECK-NEXT:    mov v0.s[1], w9
173; CHECK-NEXT:    cmeq v0.2s, v0.2s, #0
174; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
175; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
176; CHECK-NEXT:    ldr q0, [x1]
177; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
178; CHECK-NEXT:    ptrue p0.s, vl2
179; CHECK-NEXT:    xtn v0.2s, v0.2d
180; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
181; CHECK-NEXT:    ret
182  %cval = load <2 x i16>, ptr %a
183  %ptrs = load <2 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
184  %mask = icmp eq <2 x i16> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
185  %vals = call <2 x i16> @llvm.masked.gather.v2i16(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x i16> undef)
186  store <2 x i16> %vals, ptr %a
187  ret void
188}
189
; Masked gather of <4 x i16>: mask = (<4 x i16> loaded from %a) == 0, addresses
; = <4 x ptr> loaded from %b, result stored back to %a.
; vscale_range(2,0) sets a minimum vscale of 2 with no upper bound; the
; expected code is a single ld1h gather under "ptrue p0.d, vl4", narrowed by
; two uzp1 steps before the store.
190define void @masked_gather_v4i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
191; CHECK-LABEL: masked_gather_v4i16:
192; CHECK:       // %bb.0:
193; CHECK-NEXT:    ldr d0, [x0]
194; CHECK-NEXT:    ptrue p0.d, vl4
195; CHECK-NEXT:    cmeq v0.4h, v0.4h, #0
196; CHECK-NEXT:    sunpklo z0.s, z0.h
197; CHECK-NEXT:    sunpklo z0.d, z0.s
198; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
199; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
200; CHECK-NEXT:    ld1h { z0.d }, p1/z, [z0.d]
201; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
202; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
203; CHECK-NEXT:    str d0, [x0]
204; CHECK-NEXT:    ret
205  %cval = load <4 x i16>, ptr %a
206  %ptrs = load <4 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
207  %mask = icmp eq <4 x i16> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
208  %vals = call <4 x i16> @llvm.masked.gather.v4i16(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x i16> undef)
209  store <4 x i16> %vals, ptr %a
210  ret void
211}
212
; Masked gather of <8 x i16>: mask = (<8 x i16> loaded from %a) == 0, addresses
; = <8 x ptr> loaded from %b, result stored back to %a.
; No vscale_range attribute is given, and the expected code differs per check
; prefix: VBITS_GE_256 issues two ld1h gathers under "ptrue p0.d, vl4" and
; joins the halves with "mov v1.d[1], v0.d[0]", while VBITS_GE_512 issues a
; single ld1h gather under "ptrue p0.d, vl8".
213define void @masked_gather_v8i16(ptr %a, ptr %b) #0 {
214; VBITS_GE_256-LABEL: masked_gather_v8i16:
215; VBITS_GE_256:       // %bb.0:
216; VBITS_GE_256-NEXT:    ldr q0, [x0]
217; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
218; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
219; VBITS_GE_256-NEXT:    cmeq v0.8h, v0.8h, #0
220; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
221; VBITS_GE_256-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
222; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
223; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
224; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z1.d, #0
225; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
226; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
227; VBITS_GE_256-NEXT:    ld1h { z1.d }, p1/z, [z1.d]
228; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z0.d, #0
229; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
230; VBITS_GE_256-NEXT:    ld1h { z0.d }, p1/z, [z0.d]
231; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
232; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
233; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
234; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
235; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
236; VBITS_GE_256-NEXT:    str q1, [x0]
237; VBITS_GE_256-NEXT:    ret
238;
239; VBITS_GE_512-LABEL: masked_gather_v8i16:
240; VBITS_GE_512:       // %bb.0:
241; VBITS_GE_512-NEXT:    ldr q0, [x0]
242; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
243; VBITS_GE_512-NEXT:    cmeq v0.8h, v0.8h, #0
244; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
245; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
246; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z0.d, #0
247; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
248; VBITS_GE_512-NEXT:    ld1h { z0.d }, p1/z, [z0.d]
249; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
250; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
251; VBITS_GE_512-NEXT:    str q0, [x0]
252; VBITS_GE_512-NEXT:    ret
253  %cval = load <8 x i16>, ptr %a
254  %ptrs = load <8 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
255  %mask = icmp eq <8 x i16> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
256  %vals = call <8 x i16> @llvm.masked.gather.v8i16(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x i16> undef)
257  store <8 x i16> %vals, ptr %a
258  ret void
259}
260
; Masked gather of <16 x i16>: mask = (<16 x i16> loaded from %a) == 0,
; addresses = <16 x ptr> loaded from %b, result stored back to %a.
; vscale_range(8,0) sets a minimum vscale of 8 with no upper bound. In the
; expected code the compare result stays in a predicate (cmpeq p0.h), is
; widened by two punpklo steps, and governs one ld1h gather.
261define void @masked_gather_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
262; CHECK-LABEL: masked_gather_v16i16:
263; CHECK:       // %bb.0:
264; CHECK-NEXT:    ptrue p0.h, vl16
265; CHECK-NEXT:    ptrue p1.d, vl16
266; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
267; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
268; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
269; CHECK-NEXT:    punpklo p0.h, p0.b
270; CHECK-NEXT:    punpklo p0.h, p0.b
271; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
272; CHECK-NEXT:    st1h { z0.d }, p1, [x0]
273; CHECK-NEXT:    ret
274  %cval = load <16 x i16>, ptr %a
275  %ptrs = load <16 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
276  %mask = icmp eq <16 x i16> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
277  %vals = call <16 x i16> @llvm.masked.gather.v16i16(<16 x ptr> %ptrs, i32 8, <16 x i1> %mask, <16 x i16> undef)
278  store <16 x i16> %vals, ptr %a
279  ret void
280}
281
; Masked gather of <32 x i16>: mask = (<32 x i16> loaded from %a) == 0,
; addresses = <32 x ptr> loaded from %b, result stored back to %a.
; vscale_range(16,0) sets a minimum vscale of 16 with no upper bound. In the
; expected code the compare result stays in a predicate (cmpeq p0.h), is
; widened by two punpklo steps, and governs one ld1h gather.
282define void @masked_gather_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
283; CHECK-LABEL: masked_gather_v32i16:
284; CHECK:       // %bb.0:
285; CHECK-NEXT:    ptrue p0.h, vl32
286; CHECK-NEXT:    ptrue p1.d, vl32
287; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
288; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
289; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
290; CHECK-NEXT:    punpklo p0.h, p0.b
291; CHECK-NEXT:    punpklo p0.h, p0.b
292; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
293; CHECK-NEXT:    st1h { z0.d }, p1, [x0]
294; CHECK-NEXT:    ret
295  %cval = load <32 x i16>, ptr %a
296  %ptrs = load <32 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
297  %mask = icmp eq <32 x i16> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
298  %vals = call <32 x i16> @llvm.masked.gather.v32i16(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x i16> undef)
299  store <32 x i16> %vals, ptr %a
300  ret void
301}
302
303;
304; LD1W
305;
306
; Masked gather of <2 x i32>: mask = (<2 x i32> loaded from %a) == 0, addresses
; = <2 x ptr> loaded from %b, result stored back to %a.
; vscale_range(2,0) sets a minimum vscale of 2 with no upper bound; the
; expected code is a single ld1w gather under "ptrue p0.d, vl2", narrowed
; with xtn before the store.
307define void @masked_gather_v2i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
308; CHECK-LABEL: masked_gather_v2i32:
309; CHECK:       // %bb.0:
310; CHECK-NEXT:    ldr d0, [x0]
311; CHECK-NEXT:    ptrue p0.d, vl2
312; CHECK-NEXT:    cmeq v0.2s, v0.2s, #0
313; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
314; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
315; CHECK-NEXT:    ldr q0, [x1]
316; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
317; CHECK-NEXT:    xtn v0.2s, v0.2d
318; CHECK-NEXT:    str d0, [x0]
319; CHECK-NEXT:    ret
320  %cval = load <2 x i32>, ptr %a
321  %ptrs = load <2 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
322  %mask = icmp eq <2 x i32> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
323  %vals = call <2 x i32> @llvm.masked.gather.v2i32(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x i32> undef)
324  store <2 x i32> %vals, ptr %a
325  ret void
326}
327
; Masked gather of <4 x i32>: mask = (<4 x i32> loaded from %a) == 0, addresses
; = <4 x ptr> loaded from %b, result stored back to %a.
; vscale_range(2,0) sets a minimum vscale of 2 with no upper bound; the
; expected code is a single ld1w gather under "ptrue p0.d, vl4", narrowed by
; one uzp1 step before the store.
328define void @masked_gather_v4i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
329; CHECK-LABEL: masked_gather_v4i32:
330; CHECK:       // %bb.0:
331; CHECK-NEXT:    ldr q0, [x0]
332; CHECK-NEXT:    ptrue p0.d, vl4
333; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
334; CHECK-NEXT:    sunpklo z0.d, z0.s
335; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
336; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
337; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z0.d]
338; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
339; CHECK-NEXT:    str q0, [x0]
340; CHECK-NEXT:    ret
341  %cval = load <4 x i32>, ptr %a
342  %ptrs = load <4 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
343  %mask = icmp eq <4 x i32> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
344  %vals = call <4 x i32> @llvm.masked.gather.v4i32(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x i32> undef)
345  store <4 x i32> %vals, ptr %a
346  ret void
347}
348
; Masked gather of <8 x i32>: mask = (<8 x i32> loaded from %a) == 0, addresses
; = <8 x ptr> loaded from %b, result stored back to %a.
; No vscale_range attribute is given, and the expected code differs per check
; prefix: VBITS_GE_256 issues two ld1w gathers (64-bit lanes governed via
; "ptrue p2.d, vl4") and joins the halves with splice, while VBITS_GE_512
; issues a single ld1w gather with "ptrue p1.d, vl8".
349define void @masked_gather_v8i32(ptr %a, ptr %b) #0 {
350; VBITS_GE_256-LABEL: masked_gather_v8i32:
351; VBITS_GE_256:       // %bb.0:
352; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
353; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
354; VBITS_GE_256-NEXT:    ptrue p2.d, vl4
355; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
356; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [x1]
357; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
358; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
359; VBITS_GE_256-NEXT:    punpklo p1.h, p1.b
360; VBITS_GE_256-NEXT:    and p1.b, p1/z, p1.b, p2.b
361; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
362; VBITS_GE_256-NEXT:    ld1w { z1.d }, p1/z, [z1.d]
363; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
364; VBITS_GE_256-NEXT:    cmpne p1.d, p2/z, z0.d, #0
365; VBITS_GE_256-NEXT:    ld1d { z0.d }, p2/z, [x1, x8, lsl #3]
366; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
367; VBITS_GE_256-NEXT:    ld1w { z0.d }, p1/z, [z0.d]
368; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
369; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
370; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
371; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
372; VBITS_GE_256-NEXT:    ret
373;
374; VBITS_GE_512-LABEL: masked_gather_v8i32:
375; VBITS_GE_512:       // %bb.0:
376; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
377; VBITS_GE_512-NEXT:    ptrue p1.d, vl8
378; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
379; VBITS_GE_512-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
380; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x1]
381; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
382; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
383; VBITS_GE_512-NEXT:    st1w { z0.d }, p1, [x0]
384; VBITS_GE_512-NEXT:    ret
385  %cval = load <8 x i32>, ptr %a
386  %ptrs = load <8 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
387  %mask = icmp eq <8 x i32> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
388  %vals = call <8 x i32> @llvm.masked.gather.v8i32(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x i32> undef)
389  store <8 x i32> %vals, ptr %a
390  ret void
391}
392
; Masked gather of <16 x i32>: mask = (<16 x i32> loaded from %a) == 0,
; addresses = <16 x ptr> loaded from %b, result stored back to %a.
; vscale_range(8,0) sets a minimum vscale of 8 with no upper bound. In the
; expected code the compare result stays in a predicate (cmpeq p0.s), is
; widened by one punpklo step, and governs one ld1w gather.
393define void @masked_gather_v16i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
394; CHECK-LABEL: masked_gather_v16i32:
395; CHECK:       // %bb.0:
396; CHECK-NEXT:    ptrue p0.s, vl16
397; CHECK-NEXT:    ptrue p1.d, vl16
398; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
399; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
400; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
401; CHECK-NEXT:    punpklo p0.h, p0.b
402; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
403; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
404; CHECK-NEXT:    ret
405  %cval = load <16 x i32>, ptr %a
406  %ptrs = load <16 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
407  %mask = icmp eq <16 x i32> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
408  %vals = call <16 x i32> @llvm.masked.gather.v16i32(<16 x ptr> %ptrs, i32 8, <16 x i1> %mask, <16 x i32> undef)
409  store <16 x i32> %vals, ptr %a
410  ret void
411}
412
; Masked gather of <32 x i32>: mask = (<32 x i32> loaded from %a) == 0,
; addresses = <32 x ptr> loaded from %b, result stored back to %a.
; vscale_range(16,0) sets a minimum vscale of 16 with no upper bound. In the
; expected code the compare result stays in a predicate (cmpeq p0.s), is
; widened by one punpklo step, and governs one ld1w gather.
413define void @masked_gather_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
414; CHECK-LABEL: masked_gather_v32i32:
415; CHECK:       // %bb.0:
416; CHECK-NEXT:    ptrue p0.s, vl32
417; CHECK-NEXT:    ptrue p1.d, vl32
418; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
419; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
420; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
421; CHECK-NEXT:    punpklo p0.h, p0.b
422; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
423; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
424; CHECK-NEXT:    ret
425  %cval = load <32 x i32>, ptr %a
426  %ptrs = load <32 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
427  %mask = icmp eq <32 x i32> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
428  %vals = call <32 x i32> @llvm.masked.gather.v32i32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x i32> undef)
429  store <32 x i32> %vals, ptr %a
430  ret void
431}
432
433;
434; LD1D
435;
436
437; Scalarize 1 x i64 gathers
; Masked gather of <1 x i64>: mask = (<1 x i64> loaded from %a) == 0, address =
; the single pointer loaded from %b, result stored back to %a.
; The expected code contains no SVE gather instruction: cbnz skips the
; %cond.load block when the loaded value is non-zero, otherwise the pointer is
; read from [x1] and dereferenced with a plain ldr.
438define void @masked_gather_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
439; CHECK-LABEL: masked_gather_v1i64:
440; CHECK:       // %bb.0:
441; CHECK-NEXT:    ldr d0, [x0]
442; CHECK-NEXT:    fmov x8, d0
443; CHECK-NEXT:    // implicit-def: $d0
444; CHECK-NEXT:    cbnz x8, .LBB15_2
445; CHECK-NEXT:  // %bb.1: // %cond.load
446; CHECK-NEXT:    ldr d0, [x1]
447; CHECK-NEXT:    fmov x8, d0
448; CHECK-NEXT:    ldr d0, [x8]
449; CHECK-NEXT:  .LBB15_2: // %else
450; CHECK-NEXT:    str d0, [x0]
451; CHECK-NEXT:    ret
452  %cval = load <1 x i64>, ptr %a
453  %ptrs = load <1 x ptr>, ptr %b
  ; The single lane is active when the element of %cval equals zero.
454  %mask = icmp eq <1 x i64> %cval, zeroinitializer
  ; Passthru is undef, so an inactive lane of %vals carries no defined value.
455  %vals = call <1 x i64> @llvm.masked.gather.v1i64(<1 x ptr> %ptrs, i32 8, <1 x i1> %mask, <1 x i64> undef)
456  store <1 x i64> %vals, ptr %a
457  ret void
458}
459
; Masked gather of <2 x i64>: mask = (<2 x i64> loaded from %a) == 0, addresses
; = <2 x ptr> loaded from %b, result stored back to %a.
; vscale_range(2,0) sets a minimum vscale of 2 with no upper bound; the
; expected code is a single ld1d gather under "ptrue p0.d, vl2" with no
; widening or narrowing of the data.
460define void @masked_gather_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
461; CHECK-LABEL: masked_gather_v2i64:
462; CHECK:       // %bb.0:
463; CHECK-NEXT:    ldr q0, [x0]
464; CHECK-NEXT:    ptrue p0.d, vl2
465; CHECK-NEXT:    cmeq v0.2d, v0.2d, #0
466; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
467; CHECK-NEXT:    ldr q0, [x1]
468; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
469; CHECK-NEXT:    str q0, [x0]
470; CHECK-NEXT:    ret
471  %cval = load <2 x i64>, ptr %a
472  %ptrs = load <2 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
473  %mask = icmp eq <2 x i64> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
474  %vals = call <2 x i64> @llvm.masked.gather.v2i64(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x i64> undef)
475  store <2 x i64> %vals, ptr %a
476  ret void
477}
478
; Masked gather of <4 x i64>: mask = (<4 x i64> loaded from %a) == 0, addresses
; = <4 x ptr> loaded from %b, result stored back to %a.
; vscale_range(2,0) sets a minimum vscale of 2 with no upper bound; the
; expected code compares directly into a predicate (cmpeq p1.d) and issues a
; single ld1d gather under "ptrue p0.d, vl4".
479define void @masked_gather_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
480; CHECK-LABEL: masked_gather_v4i64:
481; CHECK:       // %bb.0:
482; CHECK-NEXT:    ptrue p0.d, vl4
483; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
484; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
485; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
486; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
487; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
488; CHECK-NEXT:    ret
489  %cval = load <4 x i64>, ptr %a
490  %ptrs = load <4 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
491  %mask = icmp eq <4 x i64> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
492  %vals = call <4 x i64> @llvm.masked.gather.v4i64(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x i64> undef)
493  store <4 x i64> %vals, ptr %a
494  ret void
495}
496
; Masked gather of <8 x i64>: mask = (<8 x i64> loaded from %a) == 0, addresses
; = <8 x ptr> loaded from %b, result stored back to %a.
; No vscale_range attribute is given, and the expected code differs per check
; prefix: VBITS_GE_256 issues two ld1d gathers under "ptrue p0.d, vl4" (the
; upper half is addressed with [.., x8, lsl #3], x8 = 4) followed by two st1d
; stores, while VBITS_GE_512 issues a single ld1d gather under
; "ptrue p0.d, vl8".
497define void @masked_gather_v8i64(ptr %a, ptr %b) #0 {
498; VBITS_GE_256-LABEL: masked_gather_v8i64:
499; VBITS_GE_256:       // %bb.0:
500; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
501; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
502; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
503; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
504; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
505; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
506; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
507; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
508; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
509; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [z1.d]
510; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
511; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
512; VBITS_GE_256-NEXT:    ret
513;
514; VBITS_GE_512-LABEL: masked_gather_v8i64:
515; VBITS_GE_512:       // %bb.0:
516; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
517; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
518; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
519; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
520; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
521; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
522; VBITS_GE_512-NEXT:    ret
523  %cval = load <8 x i64>, ptr %a
524  %ptrs = load <8 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
525  %mask = icmp eq <8 x i64> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
526  %vals = call <8 x i64> @llvm.masked.gather.v8i64(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x i64> undef)
527  store <8 x i64> %vals, ptr %a
528  ret void
529}
530
; Masked gather of <16 x i64>: mask = (<16 x i64> loaded from %a) == 0,
; addresses = <16 x ptr> loaded from %b, result stored back to %a.
; vscale_range(8,0) sets a minimum vscale of 8 with no upper bound; the
; expected code compares directly into a predicate (cmpeq p1.d) and issues a
; single ld1d gather under "ptrue p0.d, vl16".
531define void @masked_gather_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
532; CHECK-LABEL: masked_gather_v16i64:
533; CHECK:       // %bb.0:
534; CHECK-NEXT:    ptrue p0.d, vl16
535; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
536; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
537; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
538; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
539; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
540; CHECK-NEXT:    ret
541  %cval = load <16 x i64>, ptr %a
542  %ptrs = load <16 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
543  %mask = icmp eq <16 x i64> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
544  %vals = call <16 x i64> @llvm.masked.gather.v16i64(<16 x ptr> %ptrs, i32 8, <16 x i1> %mask, <16 x i64> undef)
545  store <16 x i64> %vals, ptr %a
546  ret void
547}
548
; Masked gather of <32 x i64>: mask = (<32 x i64> loaded from %a) == 0,
; addresses = <32 x ptr> loaded from %b, result stored back to %a.
; vscale_range(16,0) sets a minimum vscale of 16 with no upper bound; the
; expected code compares directly into a predicate (cmpeq p1.d) and issues a
; single ld1d gather under "ptrue p0.d, vl32".
549define void @masked_gather_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
550; CHECK-LABEL: masked_gather_v32i64:
551; CHECK:       // %bb.0:
552; CHECK-NEXT:    ptrue p0.d, vl32
553; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
554; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
555; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
556; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
557; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
558; CHECK-NEXT:    ret
559  %cval = load <32 x i64>, ptr %a
560  %ptrs = load <32 x ptr>, ptr %b
  ; A lane is active when its element of %cval equals zero.
561  %mask = icmp eq <32 x i64> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
562  %vals = call <32 x i64> @llvm.masked.gather.v32i64(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x i64> undef)
563  store <32 x i64> %vals, ptr %a
564  ret void
565}
566
567;
568; LD1H (float)
569;
570
; Masked gather of <2 x half>: the mask comes from an ordered-equal compare of
; the <2 x half> loaded from %a against zero, the addresses are the <2 x ptr>
; loaded from %b, and the gathered vector is stored back to %a.
; vscale_range(2,0) sets a minimum vscale of 2 with no upper bound. The
; expected code does the compare on a 4-lane vector (fcmeq v1.4h), copies two
; mask lanes into a zeroed register, and issues one ld1h gather.
571define void @masked_gather_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
572; CHECK-LABEL: masked_gather_v2f16:
573; CHECK:       // %bb.0:
574; CHECK-NEXT:    ldr s1, [x0]
575; CHECK-NEXT:    movi v0.2d, #0000000000000000
576; CHECK-NEXT:    ptrue p0.d, vl4
577; CHECK-NEXT:    fcmeq v1.4h, v1.4h, #0.0
578; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
579; CHECK-NEXT:    mov v0.h[0], v1.h[0]
580; CHECK-NEXT:    mov w8, v1.s[1]
581; CHECK-NEXT:    mov v0.h[1], w8
582; CHECK-NEXT:    sunpklo z0.s, z0.h
583; CHECK-NEXT:    sunpklo z0.d, z0.s
584; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
585; CHECK-NEXT:    ldr q0, [x1]
586; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
587; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
588; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
589; CHECK-NEXT:    str s0, [x0]
590; CHECK-NEXT:    ret
591  %cval = load <2 x half>, ptr %a
592  %ptrs = load <2 x ptr>, ptr %b
  ; A lane is active when its element of %cval compares ordered-equal to zero.
593  %mask = fcmp oeq <2 x half> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
594  %vals = call <2 x half> @llvm.masked.gather.v2f16(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x half> undef)
595  store <2 x half> %vals, ptr %a
596  ret void
597}
598
; Masked gather of <4 x half>: mask = ordered-equal compare of the <4 x half>
; loaded from %a against zero, addresses = <4 x ptr> loaded from %b, result
; stored back to %a.
; vscale_range(2,0) sets a minimum vscale of 2 with no upper bound; the
; expected code is a single ld1h gather under "ptrue p0.d, vl4", narrowed by
; two uzp1 steps before the store.
599define void @masked_gather_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
600; CHECK-LABEL: masked_gather_v4f16:
601; CHECK:       // %bb.0:
602; CHECK-NEXT:    ldr d0, [x0]
603; CHECK-NEXT:    ptrue p0.d, vl4
604; CHECK-NEXT:    fcmeq v0.4h, v0.4h, #0.0
605; CHECK-NEXT:    sunpklo z0.s, z0.h
606; CHECK-NEXT:    sunpklo z0.d, z0.s
607; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
608; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
609; CHECK-NEXT:    ld1h { z0.d }, p1/z, [z0.d]
610; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
611; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
612; CHECK-NEXT:    str d0, [x0]
613; CHECK-NEXT:    ret
614  %cval = load <4 x half>, ptr %a
615  %ptrs = load <4 x ptr>, ptr %b
  ; A lane is active when its element of %cval compares ordered-equal to zero.
616  %mask = fcmp oeq <4 x half> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
617  %vals = call <4 x half> @llvm.masked.gather.v4f16(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x half> undef)
618  store <4 x half> %vals, ptr %a
619  ret void
620}
621
; Masked gather of <8 x half>: mask = ordered-equal compare of the <8 x half>
; loaded from %a against zero, addresses = <8 x ptr> loaded from %b, result
; stored back to %a.
; No vscale_range attribute is given, and the expected code differs per check
; prefix: VBITS_GE_256 issues two ld1h gathers under "ptrue p0.d, vl4" and
; joins the halves with "mov v1.d[1], v0.d[0]", while VBITS_GE_512 issues a
; single ld1h gather under "ptrue p0.d, vl8".
622define void @masked_gather_v8f16(ptr %a, ptr %b) #0 {
623; VBITS_GE_256-LABEL: masked_gather_v8f16:
624; VBITS_GE_256:       // %bb.0:
625; VBITS_GE_256-NEXT:    ldr q0, [x0]
626; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
627; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
628; VBITS_GE_256-NEXT:    fcmeq v0.8h, v0.8h, #0.0
629; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
630; VBITS_GE_256-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
631; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
632; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
633; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z1.d, #0
634; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
635; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
636; VBITS_GE_256-NEXT:    ld1h { z1.d }, p1/z, [z1.d]
637; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z0.d, #0
638; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
639; VBITS_GE_256-NEXT:    ld1h { z0.d }, p1/z, [z0.d]
640; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
641; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
642; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
643; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
644; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
645; VBITS_GE_256-NEXT:    str q1, [x0]
646; VBITS_GE_256-NEXT:    ret
647;
648; VBITS_GE_512-LABEL: masked_gather_v8f16:
649; VBITS_GE_512:       // %bb.0:
650; VBITS_GE_512-NEXT:    ldr q0, [x0]
651; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
652; VBITS_GE_512-NEXT:    fcmeq v0.8h, v0.8h, #0.0
653; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
654; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
655; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z0.d, #0
656; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
657; VBITS_GE_512-NEXT:    ld1h { z0.d }, p1/z, [z0.d]
658; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
659; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
660; VBITS_GE_512-NEXT:    str q0, [x0]
661; VBITS_GE_512-NEXT:    ret
662  %cval = load <8 x half>, ptr %a
663  %ptrs = load <8 x ptr>, ptr %b
  ; A lane is active when its element of %cval compares ordered-equal to zero.
664  %mask = fcmp oeq <8 x half> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
665  %vals = call <8 x half> @llvm.masked.gather.v8f16(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x half> undef)
666  store <8 x half> %vals, ptr %a
667  ret void
668}
669
; Masked gather of <16 x half>: mask = ordered-equal compare of the
; <16 x half> loaded from %a against zero, addresses = <16 x ptr> loaded from
; %b, result stored back to %a.
; vscale_range(8,0) sets a minimum vscale of 8 with no upper bound. In the
; expected code the compare result stays in a predicate (fcmeq p0.h), is
; widened by two punpklo steps, and governs one ld1h gather.
670define void @masked_gather_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
671; CHECK-LABEL: masked_gather_v16f16:
672; CHECK:       // %bb.0:
673; CHECK-NEXT:    ptrue p0.h, vl16
674; CHECK-NEXT:    ptrue p1.d, vl16
675; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
676; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
677; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
678; CHECK-NEXT:    punpklo p0.h, p0.b
679; CHECK-NEXT:    punpklo p0.h, p0.b
680; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
681; CHECK-NEXT:    st1h { z0.d }, p1, [x0]
682; CHECK-NEXT:    ret
683  %cval = load <16 x half>, ptr %a
684  %ptrs = load <16 x ptr>, ptr %b
  ; A lane is active when its element of %cval compares ordered-equal to zero.
685  %mask = fcmp oeq <16 x half> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
686  %vals = call <16 x half> @llvm.masked.gather.v16f16(<16 x ptr> %ptrs, i32 8, <16 x i1> %mask, <16 x half> undef)
687  store <16 x half> %vals, ptr %a
688  ret void
689}
690
; Masked gather of <32 x half>: mask = ordered-equal compare of the
; <32 x half> loaded from %a against zero, addresses = <32 x ptr> loaded from
; %b, result stored back to %a.
; vscale_range(16,0) sets a minimum vscale of 16 with no upper bound. In the
; expected code the compare result stays in a predicate (fcmeq p0.h), is
; widened by two punpklo steps, and governs one ld1h gather.
691define void @masked_gather_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
692; CHECK-LABEL: masked_gather_v32f16:
693; CHECK:       // %bb.0:
694; CHECK-NEXT:    ptrue p0.h, vl32
695; CHECK-NEXT:    ptrue p1.d, vl32
696; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
697; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
698; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
699; CHECK-NEXT:    punpklo p0.h, p0.b
700; CHECK-NEXT:    punpklo p0.h, p0.b
701; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
702; CHECK-NEXT:    st1h { z0.d }, p1, [x0]
703; CHECK-NEXT:    ret
704  %cval = load <32 x half>, ptr %a
705  %ptrs = load <32 x ptr>, ptr %b
  ; A lane is active when its element of %cval compares ordered-equal to zero.
706  %mask = fcmp oeq <32 x half> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
707  %vals = call <32 x half> @llvm.masked.gather.v32f16(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x half> undef)
708  store <32 x half> %vals, ptr %a
709  ret void
710}
711
712;
713; LD1W (float)
714;
715
; Masked gather of <2 x float>: mask = ordered-equal compare of the
; <2 x float> loaded from %a against zero, addresses = <2 x ptr> loaded from
; %b, result stored back to %a.
; vscale_range(2,0) sets a minimum vscale of 2 with no upper bound; the
; expected code is a single ld1w gather under "ptrue p0.d, vl2", narrowed
; with xtn before the store.
716define void @masked_gather_v2f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
717; CHECK-LABEL: masked_gather_v2f32:
718; CHECK:       // %bb.0:
719; CHECK-NEXT:    ldr d0, [x0]
720; CHECK-NEXT:    ptrue p0.d, vl2
721; CHECK-NEXT:    fcmeq v0.2s, v0.2s, #0.0
722; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
723; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
724; CHECK-NEXT:    ldr q0, [x1]
725; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
726; CHECK-NEXT:    xtn v0.2s, v0.2d
727; CHECK-NEXT:    str d0, [x0]
728; CHECK-NEXT:    ret
729  %cval = load <2 x float>, ptr %a
730  %ptrs = load <2 x ptr>, ptr %b
  ; A lane is active when its element of %cval compares ordered-equal to zero.
731  %mask = fcmp oeq <2 x float> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
732  %vals = call <2 x float> @llvm.masked.gather.v2f32(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x float> undef)
733  store <2 x float> %vals, ptr %a
734  ret void
735}
736
; Masked gather of <4 x float>: mask = ordered-equal compare of the
; <4 x float> loaded from %a against zero, addresses = <4 x ptr> loaded from
; %b, result stored back to %a.
; vscale_range(2,0) sets a minimum vscale of 2 with no upper bound; the
; expected code is a single ld1w gather under "ptrue p0.d, vl4", narrowed by
; one uzp1 step before the store.
737define void @masked_gather_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
738; CHECK-LABEL: masked_gather_v4f32:
739; CHECK:       // %bb.0:
740; CHECK-NEXT:    ldr q0, [x0]
741; CHECK-NEXT:    ptrue p0.d, vl4
742; CHECK-NEXT:    fcmeq v0.4s, v0.4s, #0.0
743; CHECK-NEXT:    sunpklo z0.d, z0.s
744; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
745; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
746; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z0.d]
747; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
748; CHECK-NEXT:    str q0, [x0]
749; CHECK-NEXT:    ret
750  %cval = load <4 x float>, ptr %a
751  %ptrs = load <4 x ptr>, ptr %b
  ; A lane is active when its element of %cval compares ordered-equal to zero.
752  %mask = fcmp oeq <4 x float> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
753  %vals = call <4 x float> @llvm.masked.gather.v4f32(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x float> undef)
754  store <4 x float> %vals, ptr %a
755  ret void
756}
757
; Masked gather of <8 x float>: mask = ordered-equal compare of the
; <8 x float> loaded from %a against zero, addresses = <8 x ptr> loaded from
; %b, result stored back to %a.
; No vscale_range attribute is given, and the expected code differs per check
; prefix: VBITS_GE_256 issues two ld1w gathers (64-bit lanes governed via
; "ptrue p2.d, vl4") and joins the halves with splice, while VBITS_GE_512
; issues a single ld1w gather with "ptrue p1.d, vl8".
758define void @masked_gather_v8f32(ptr %a, ptr %b) #0 {
759; VBITS_GE_256-LABEL: masked_gather_v8f32:
760; VBITS_GE_256:       // %bb.0:
761; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
762; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
763; VBITS_GE_256-NEXT:    ptrue p2.d, vl4
764; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
765; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [x1]
766; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
767; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
768; VBITS_GE_256-NEXT:    punpklo p1.h, p1.b
769; VBITS_GE_256-NEXT:    and p1.b, p1/z, p1.b, p2.b
770; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
771; VBITS_GE_256-NEXT:    ld1w { z1.d }, p1/z, [z1.d]
772; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
773; VBITS_GE_256-NEXT:    cmpne p1.d, p2/z, z0.d, #0
774; VBITS_GE_256-NEXT:    ld1d { z0.d }, p2/z, [x1, x8, lsl #3]
775; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
776; VBITS_GE_256-NEXT:    ld1w { z0.d }, p1/z, [z0.d]
777; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
778; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
779; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
780; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
781; VBITS_GE_256-NEXT:    ret
782;
783; VBITS_GE_512-LABEL: masked_gather_v8f32:
784; VBITS_GE_512:       // %bb.0:
785; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
786; VBITS_GE_512-NEXT:    ptrue p1.d, vl8
787; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
788; VBITS_GE_512-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
789; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x1]
790; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
791; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
792; VBITS_GE_512-NEXT:    st1w { z0.d }, p1, [x0]
793; VBITS_GE_512-NEXT:    ret
794  %cval = load <8 x float>, ptr %a
795  %ptrs = load <8 x ptr>, ptr %b
  ; A lane is active when its element of %cval compares ordered-equal to zero.
796  %mask = fcmp oeq <8 x float> %cval, zeroinitializer
  ; Passthru is undef, so inactive lanes of %vals carry no defined value.
797  %vals = call <8 x float> @llvm.masked.gather.v8f32(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x float> undef)
798  store <8 x float> %vals, ptr %a
799  ret void
800}
801
; Gathers 16 x float through 16 x ptr under vscale_range(8,0). One set of
; expectations is shared by every RUN line: the fcmeq predicate is computed on
; .s lanes, passed through punpklo, and then governs a single ld1w gather on
; .d lanes; the result is stored with st1w straight from the .d lanes.
; Lanes are active where the float loaded from %a compares oeq to zero.
802define void @masked_gather_v16f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
803; CHECK-LABEL: masked_gather_v16f32:
804; CHECK:       // %bb.0:
805; CHECK-NEXT:    ptrue p0.s, vl16
806; CHECK-NEXT:    ptrue p1.d, vl16
807; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
808; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
809; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
810; CHECK-NEXT:    punpklo p0.h, p0.b
811; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
812; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
813; CHECK-NEXT:    ret
814  %cval = load <16 x float>, ptr %a
815  %ptrs = load <16 x ptr>, ptr %b
816  %mask = fcmp oeq <16 x float> %cval, zeroinitializer
817  %vals = call <16 x float> @llvm.masked.gather.v16f32(<16 x ptr> %ptrs, i32 8, <16 x i1> %mask, <16 x float> undef)
818  store <16 x float> %vals, ptr %a
819  ret void
820}
821
; Gathers 32 x float through 32 x ptr under vscale_range(16,0). The expected
; sequence has the same shape as the 16-element float test, with vl32
; predicates: compare on .s lanes, punpklo on the predicate, one ld1w gather
; on .d lanes, and an st1w store from the .d lanes back to %a.
822define void @masked_gather_v32f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
823; CHECK-LABEL: masked_gather_v32f32:
824; CHECK:       // %bb.0:
825; CHECK-NEXT:    ptrue p0.s, vl32
826; CHECK-NEXT:    ptrue p1.d, vl32
827; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
828; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
829; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
830; CHECK-NEXT:    punpklo p0.h, p0.b
831; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
832; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
833; CHECK-NEXT:    ret
834  %cval = load <32 x float>, ptr %a
835  %ptrs = load <32 x ptr>, ptr %b
836  %mask = fcmp oeq <32 x float> %cval, zeroinitializer
837  %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> undef)
838  store <32 x float> %vals, ptr %a
839  ret void
840}
841
842;
843; LD1D (float)
844;
845
846; Scalarize 1 x double gathers
; The expected code contains no SVE gather: a scalar fcmp against zero and a
; conditional branch guard a plain ldr through the single pointer loaded from
; %b. On the path that skips the load the result register is only marked
; implicit-def, which corresponds to the undef passthru in the IR.
847define void @masked_gather_v1f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
848; CHECK-LABEL: masked_gather_v1f64:
849; CHECK:       // %bb.0:
850; CHECK-NEXT:    ldr d0, [x0]
851; CHECK-NEXT:    fcmp d0, #0.0
852; CHECK-NEXT:    // implicit-def: $d0
853; CHECK-NEXT:    b.ne .LBB31_2
854; CHECK-NEXT:  // %bb.1: // %cond.load
855; CHECK-NEXT:    ldr d0, [x1]
856; CHECK-NEXT:    fmov x8, d0
857; CHECK-NEXT:    ldr d0, [x8]
858; CHECK-NEXT:  .LBB31_2: // %else
859; CHECK-NEXT:    str d0, [x0]
860; CHECK-NEXT:    ret
861  %cval = load <1 x double>, ptr %a
862  %ptrs = load <1 x ptr>, ptr %b
863  %mask = fcmp oeq <1 x double> %cval, zeroinitializer
864  %vals = call <1 x double> @llvm.masked.gather.v1f64(<1 x ptr> %ptrs, i32 8, <1 x i1> %mask, <1 x double> undef)
865  store <1 x double> %vals, ptr %a
866  ret void
867}
868
; Gathers 2 x double through 2 x ptr. The expected code loads both the
; condition values and the pointers with 128-bit ldr q0, does the compare with
; a NEON fcmeq, and turns the resulting vector mask into a predicate with
; cmpne before issuing a single ld1d gather. The result is stored with str q0.
869define void @masked_gather_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
870; CHECK-LABEL: masked_gather_v2f64:
871; CHECK:       // %bb.0:
872; CHECK-NEXT:    ldr q0, [x0]
873; CHECK-NEXT:    ptrue p0.d, vl2
874; CHECK-NEXT:    fcmeq v0.2d, v0.2d, #0.0
875; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
876; CHECK-NEXT:    ldr q0, [x1]
877; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
878; CHECK-NEXT:    str q0, [x0]
879; CHECK-NEXT:    ret
880  %cval = load <2 x double>, ptr %a
881  %ptrs = load <2 x ptr>, ptr %b
882  %mask = fcmp oeq <2 x double> %cval, zeroinitializer
883  %vals = call <2 x double> @llvm.masked.gather.v2f64(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x double> undef)
884  store <2 x double> %vals, ptr %a
885  ret void
886}
887
; Gathers 4 x double through 4 x ptr under vscale_range(2,0). Data and pointers
; are both handled on .d lanes, so the expected code uses the fcmeq predicate
; directly for the ld1d gather; no predicate unpack appears in the checks.
; The result is stored back over %a with st1d.
888define void @masked_gather_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
889; CHECK-LABEL: masked_gather_v4f64:
890; CHECK:       // %bb.0:
891; CHECK-NEXT:    ptrue p0.d, vl4
892; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
893; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
894; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
895; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
896; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
897; CHECK-NEXT:    ret
898  %cval = load <4 x double>, ptr %a
899  %ptrs = load <4 x ptr>, ptr %b
900  %mask = fcmp oeq <4 x double> %cval, zeroinitializer
901  %vals = call <4 x double> @llvm.masked.gather.v4f64(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x double> undef)
902  store <4 x double> %vals, ptr %a
903  ret void
904}
905
; Gathers 8 x double through 8 x ptr. The function carries no vscale_range, so
; the expected code differs per RUN configuration:
;  - 256-bit minimum: the work is done as two independent vl4 halves (offset 0
;    and offset x8 = 4 elements), each with its own fcmeq, pointer load, ld1d
;    gather and st1d store; no splice is needed.
;  - 512-bit minimum and above: a single vl8 compare, gather and store.
; Lanes are active where the double loaded from %a compares oeq to zero.
906define void @masked_gather_v8f64(ptr %a, ptr %b) #0 {
907; VBITS_GE_256-LABEL: masked_gather_v8f64:
908; VBITS_GE_256:       // %bb.0:
909; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
910; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
911; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
912; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
913; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
914; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
915; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
916; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z1.d, #0.0
917; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
918; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [z1.d]
919; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
920; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
921; VBITS_GE_256-NEXT:    ret
922;
923; VBITS_GE_512-LABEL: masked_gather_v8f64:
924; VBITS_GE_512:       // %bb.0:
925; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
926; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
927; VBITS_GE_512-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
928; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
929; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
930; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
931; VBITS_GE_512-NEXT:    ret
932  %cval = load <8 x double>, ptr %a
933  %ptrs = load <8 x ptr>, ptr %b
934  %mask = fcmp oeq <8 x double> %cval, zeroinitializer
935  %vals = call <8 x double> @llvm.masked.gather.v8f64(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x double> undef)
936  store <8 x double> %vals, ptr %a
937  ret void
938}
939
; Gathers 16 x double through 16 x ptr under vscale_range(8,0). The expected
; code is one vl16 sequence on .d lanes: load, fcmeq against zero, pointer
; load, ld1d gather governed by the compare predicate, and st1d back to %a.
940define void @masked_gather_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
941; CHECK-LABEL: masked_gather_v16f64:
942; CHECK:       // %bb.0:
943; CHECK-NEXT:    ptrue p0.d, vl16
944; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
945; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
946; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
947; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
948; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
949; CHECK-NEXT:    ret
950  %cval = load <16 x double>, ptr %a
951  %ptrs = load <16 x ptr>, ptr %b
952  %mask = fcmp oeq <16 x double> %cval, zeroinitializer
953  %vals = call <16 x double> @llvm.masked.gather.v16f64(<16 x ptr> %ptrs, i32 8, <16 x i1> %mask, <16 x double> undef)
954  store <16 x double> %vals, ptr %a
955  ret void
956}
957
; Gathers 32 x double through 32 x ptr under vscale_range(16,0). The expected
; code mirrors the 16-element double test with vl32: everything stays on .d
; lanes and the fcmeq predicate governs the ld1d gather directly.
958define void @masked_gather_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
959; CHECK-LABEL: masked_gather_v32f64:
960; CHECK:       // %bb.0:
961; CHECK-NEXT:    ptrue p0.d, vl32
962; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
963; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
964; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
965; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
966; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
967; CHECK-NEXT:    ret
968  %cval = load <32 x double>, ptr %a
969  %ptrs = load <32 x ptr>, ptr %b
970  %mask = fcmp oeq <32 x double> %cval, zeroinitializer
971  %vals = call <32 x double> @llvm.masked.gather.v32f64(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x double> undef)
972  store <32 x double> %vals, ptr %a
973  ret void
974}
975
976; The tests above cover the element types; the tests below check that the
977; addressing modes still function.
978
; Addressing mode: scalar base + sign-extended 32-bit indices, scaled by the
; element size. The IR sign-extends <32 x i32> indices from %b and uses them in
; a getelementptr half off %base; the expected gather folds all of that into
; the operand [x2, z0.s, sxtw #1]. Data is half, so the .h compare predicate
; goes through punpklo before governing the .s-lane gather.
979define void @masked_gather_32b_scaled_sext_f16(ptr %a, ptr %b, ptr %base) vscale_range(8,0) #0 {
980; CHECK-LABEL: masked_gather_32b_scaled_sext_f16:
981; CHECK:       // %bb.0:
982; CHECK-NEXT:    ptrue p0.h, vl32
983; CHECK-NEXT:    ptrue p1.s, vl32
984; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
985; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
986; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x1]
987; CHECK-NEXT:    punpklo p0.h, p0.b
988; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z0.s, sxtw #1]
989; CHECK-NEXT:    st1h { z0.s }, p1, [x0]
990; CHECK-NEXT:    ret
991  %cvals = load <32 x half>, ptr %a
992  %idxs = load <32 x i32>, ptr %b
993  %ext = sext <32 x i32> %idxs to <32 x i64>
994  %ptrs = getelementptr half, ptr %base, <32 x i64> %ext
995  %mask = fcmp oeq <32 x half> %cvals, zeroinitializer
996  %vals = call <32 x half> @llvm.masked.gather.v32f16(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x half> undef)
997  store <32 x half> %vals, ptr %a
998  ret void
999}
1000
; Addressing mode: scalar base + sign-extended 32-bit indices scaled for float.
; The sext and getelementptr float in the IR are expected to fold into the
; gather operand [x2, z0.s, sxtw #2]. Data and indices are both 32-bit, so the
; whole sequence stays on .s lanes and the fcmeq predicate is used as-is.
1001define void @masked_gather_32b_scaled_sext_f32(ptr %a, ptr %b, ptr %base) vscale_range(8,0) #0 {
1002; CHECK-LABEL: masked_gather_32b_scaled_sext_f32:
1003; CHECK:       // %bb.0:
1004; CHECK-NEXT:    ptrue p0.s, vl32
1005; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1006; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
1007; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
1008; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x2, z0.s, sxtw #2]
1009; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
1010; CHECK-NEXT:    ret
1011  %cvals = load <32 x float>, ptr %a
1012  %idxs = load <32 x i32>, ptr %b
1013  %ext = sext <32 x i32> %idxs to <32 x i64>
1014  %ptrs = getelementptr float, ptr %base, <32 x i64> %ext
1015  %mask = fcmp oeq <32 x float> %cvals, zeroinitializer
1016  %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> undef)
1017  store <32 x float> %vals, ptr %a
1018  ret void
1019}
1020
; Addressing mode: scalar base + sign-extended 32-bit indices scaled for
; double. Here the expected code performs the sign extension in the index load
; itself (ld1sw into .d lanes) and the gather then uses the 64-bit scaled form
; [x2, z0.d, lsl #3] rather than an sxtw operand.
1021define void @masked_gather_32b_scaled_sext_f64(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
1022; CHECK-LABEL: masked_gather_32b_scaled_sext_f64:
1023; CHECK:       // %bb.0:
1024; CHECK-NEXT:    ptrue p0.d, vl32
1025; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1026; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
1027; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x1]
1028; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x2, z0.d, lsl #3]
1029; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
1030; CHECK-NEXT:    ret
1031  %cvals = load <32 x double>, ptr %a
1032  %idxs = load <32 x i32>, ptr %b
1033  %ext = sext <32 x i32> %idxs to <32 x i64>
1034  %ptrs = getelementptr double, ptr %base, <32 x i64> %ext
1035  %mask = fcmp oeq <32 x double> %cvals, zeroinitializer
1036  %vals = call <32 x double> @llvm.masked.gather.v32f64(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x double> undef)
1037  store <32 x double> %vals, ptr %a
1038  ret void
1039}
1040
; Addressing mode: scalar base + zero-extended 32-bit indices, scaled by the
; element size. Same IR shape as the sign-extended half test except that the
; indices are widened with zext; the expected gather operand is therefore
; [x2, z0.s, uxtw #1].
1041define void @masked_gather_32b_scaled_zext(ptr %a, ptr %b, ptr %base) vscale_range(8,0) #0 {
1042; CHECK-LABEL: masked_gather_32b_scaled_zext:
1043; CHECK:       // %bb.0:
1044; CHECK-NEXT:    ptrue p0.h, vl32
1045; CHECK-NEXT:    ptrue p1.s, vl32
1046; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1047; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
1048; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x1]
1049; CHECK-NEXT:    punpklo p0.h, p0.b
1050; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z0.s, uxtw #1]
1051; CHECK-NEXT:    st1h { z0.s }, p1, [x0]
1052; CHECK-NEXT:    ret
1053  %cvals = load <32 x half>, ptr %a
1054  %idxs = load <32 x i32>, ptr %b
1055  %ext = zext <32 x i32> %idxs to <32 x i64>
1056  %ptrs = getelementptr half, ptr %base, <32 x i64> %ext
1057  %mask = fcmp oeq <32 x half> %cvals, zeroinitializer
1058  %vals = call <32 x half> @llvm.masked.gather.v32f16(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x half> undef)
1059  store <32 x half> %vals, ptr %a
1060  ret void
1061}
1062
; Addressing mode: scalar base + sign-extended 32-bit byte offsets (unscaled).
; The getelementptr is over i8, so the indices are byte offsets and the
; expected gather operand is [x2, z0.s, sxtw] with no shift amount.
; The bitcast is between identical <32 x ptr> types and leaves the value
; unchanged.
1063define void @masked_gather_32b_unscaled_sext(ptr %a, ptr %b, ptr %base) vscale_range(8,0) #0 {
1064; CHECK-LABEL: masked_gather_32b_unscaled_sext:
1065; CHECK:       // %bb.0:
1066; CHECK-NEXT:    ptrue p0.h, vl32
1067; CHECK-NEXT:    ptrue p1.s, vl32
1068; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1069; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
1070; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x1]
1071; CHECK-NEXT:    punpklo p0.h, p0.b
1072; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z0.s, sxtw]
1073; CHECK-NEXT:    st1h { z0.s }, p1, [x0]
1074; CHECK-NEXT:    ret
1075  %cvals = load <32 x half>, ptr %a
1076  %idxs = load <32 x i32>, ptr %b
1077  %ext = sext <32 x i32> %idxs to <32 x i64>
1078  %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %ext
1079  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
1080  %mask = fcmp oeq <32 x half> %cvals, zeroinitializer
1081  %vals = call <32 x half> @llvm.masked.gather.v32f16(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x half> undef)
1082  store <32 x half> %vals, ptr %a
1083  ret void
1084}
1085
; Addressing mode: scalar base + zero-extended 32-bit byte offsets (unscaled).
; Identical to the sign-extended unscaled test except for the zext, so the
; expected gather operand is [x2, z0.s, uxtw] with no shift amount.
; The bitcast is between identical <32 x ptr> types and leaves the value
; unchanged.
1086define void @masked_gather_32b_unscaled_zext(ptr %a, ptr %b, ptr %base) vscale_range(8,0) #0 {
1087; CHECK-LABEL: masked_gather_32b_unscaled_zext:
1088; CHECK:       // %bb.0:
1089; CHECK-NEXT:    ptrue p0.h, vl32
1090; CHECK-NEXT:    ptrue p1.s, vl32
1091; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1092; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
1093; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x1]
1094; CHECK-NEXT:    punpklo p0.h, p0.b
1095; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z0.s, uxtw]
1096; CHECK-NEXT:    st1h { z0.s }, p1, [x0]
1097; CHECK-NEXT:    ret
1098  %cvals = load <32 x half>, ptr %a
1099  %idxs = load <32 x i32>, ptr %b
1100  %ext = zext <32 x i32> %idxs to <32 x i64>
1101  %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %ext
1102  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
1103  %mask = fcmp oeq <32 x half> %cvals, zeroinitializer
1104  %vals = call <32 x half> @llvm.masked.gather.v32f16(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x half> undef)
1105  store <32 x half> %vals, ptr %a
1106  ret void
1107}
1108
; Addressing mode: scalar base + 64-bit indices scaled for float. The indices
; are loaded as <32 x i64> and fed to a getelementptr float, which is expected
; to become the gather operand [x2, z0.d, lsl #2]. The float compare predicate
; (.s lanes) goes through punpklo before governing the .d-lane gather.
1109define void @masked_gather_64b_scaled(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
1110; CHECK-LABEL: masked_gather_64b_scaled:
1111; CHECK:       // %bb.0:
1112; CHECK-NEXT:    ptrue p0.s, vl32
1113; CHECK-NEXT:    ptrue p1.d, vl32
1114; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1115; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
1116; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
1117; CHECK-NEXT:    punpklo p0.h, p0.b
1118; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x2, z0.d, lsl #2]
1119; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
1120; CHECK-NEXT:    ret
1121  %cvals = load <32 x float>, ptr %a
1122  %idxs = load <32 x i64>, ptr %b
1123  %ptrs = getelementptr float, ptr %base, <32 x i64> %idxs
1124  %mask = fcmp oeq <32 x float> %cvals, zeroinitializer
1125  %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> undef)
1126  store <32 x float> %vals, ptr %a
1127  ret void
1128}
1129
; Addressing mode: scalar base + 64-bit byte offsets (unscaled). The
; getelementptr is over i8, so the expected gather operand is [x2, z0.d] with
; no shift. The bitcast is between identical <32 x ptr> types and leaves the
; value unchanged.
1130define void @masked_gather_64b_unscaled(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
1131; CHECK-LABEL: masked_gather_64b_unscaled:
1132; CHECK:       // %bb.0:
1133; CHECK-NEXT:    ptrue p0.s, vl32
1134; CHECK-NEXT:    ptrue p1.d, vl32
1135; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1136; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
1137; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
1138; CHECK-NEXT:    punpklo p0.h, p0.b
1139; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x2, z0.d]
1140; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
1141; CHECK-NEXT:    ret
1142  %cvals = load <32 x float>, ptr %a
1143  %idxs = load <32 x i64>, ptr %b
1144  %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %idxs
1145  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
1146  %mask = fcmp oeq <32 x float> %cvals, zeroinitializer
1147  %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> undef)
1148  store <32 x float> %vals, ptr %a
1149  ret void
1150}
1151
; Addressing mode: vector of base pointers + one scalar byte offset. The IR
; adds the scalar %off to every pointer loaded from %b via getelementptr i8.
; The expected gather operand is [x2, z0.d]: the scalar offset sits in the
; scalar-register slot and the loaded pointer vector in the vector slot.
; The bitcast is between identical <32 x ptr> types and leaves the value
; unchanged.
1152define void @masked_gather_vec_plus_reg(ptr %a, ptr %b, i64 %off) vscale_range(16,0) #0 {
1153; CHECK-LABEL: masked_gather_vec_plus_reg:
1154; CHECK:       // %bb.0:
1155; CHECK-NEXT:    ptrue p0.s, vl32
1156; CHECK-NEXT:    ptrue p1.d, vl32
1157; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1158; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
1159; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
1160; CHECK-NEXT:    punpklo p0.h, p0.b
1161; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x2, z0.d]
1162; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
1163; CHECK-NEXT:    ret
1164  %cvals = load <32 x float>, ptr %a
1165  %bases = load <32 x ptr>, ptr %b
1166  %byte_ptrs = getelementptr i8, <32 x ptr> %bases, i64 %off
1167  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
1168  %mask = fcmp oeq <32 x float> %cvals, zeroinitializer
1169  %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> undef)
1170  store <32 x float> %vals, ptr %a
1171  ret void
1172}
1173
; Addressing mode: vector of base pointers + constant byte offset. The IR adds
; the constant 4 to every pointer loaded from %b via getelementptr i8, and the
; expected gather carries that constant as an immediate: [z0.d, #4].
; The bitcast is between identical <32 x ptr> types and leaves the value
; unchanged.
1174define void @masked_gather_vec_plus_imm(ptr %a, ptr %b) vscale_range(16,0) #0 {
1175; CHECK-LABEL: masked_gather_vec_plus_imm:
1176; CHECK:       // %bb.0:
1177; CHECK-NEXT:    ptrue p0.s, vl32
1178; CHECK-NEXT:    ptrue p1.d, vl32
1179; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1180; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
1181; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
1182; CHECK-NEXT:    punpklo p0.h, p0.b
1183; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d, #4]
1184; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
1185; CHECK-NEXT:    ret
1186  %cvals = load <32 x float>, ptr %a
1187  %bases = load <32 x ptr>, ptr %b
1188  %byte_ptrs = getelementptr i8, <32 x ptr> %bases, i64 4
1189  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
1190  %mask = fcmp oeq <32 x float> %cvals, zeroinitializer
1191  %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> undef)
1192  store <32 x float> %vals, ptr %a
1193  ret void
1194}
1195
; Gather with a real passthru vector, loaded from %c. In the expected code the
; original .s-lane compare predicate (p1) is kept alive next to its punpklo'd
; copy (p2): p2 governs the gather, then the result is narrowed with uzp1 and
; merged with the passthru by sel under p1 before a regular .s-lane st1w.
1196define void @masked_gather_passthru(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
1197; CHECK-LABEL: masked_gather_passthru:
1198; CHECK:       // %bb.0:
1199; CHECK-NEXT:    ptrue p0.s, vl32
1200; CHECK-NEXT:    ptrue p2.d, vl32
1201; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1202; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x2]
1203; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
1204; CHECK-NEXT:    ld1d { z0.d }, p2/z, [x1]
1205; CHECK-NEXT:    punpklo p2.h, p1.b
1206; CHECK-NEXT:    ld1w { z0.d }, p2/z, [z0.d]
1207; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
1208; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
1209; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
1210; CHECK-NEXT:    ret
1211  %cvals = load <32 x float>, ptr %a
1212  %ptrs = load <32 x ptr>, ptr %b
1213  %passthru = load <32 x float>, ptr %c
1214  %mask = fcmp oeq <32 x float> %cvals, zeroinitializer
1215  %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> %passthru)
1216  store <32 x float> %vals, ptr %a
1217  ret void
1218}
1219
; Gather with an all-zero passthru (zeroinitializer). The expected code is the
; same sequence as the undef-passthru float test of this width: the gather
; uses zeroing predication (p0/z) and no sel or other merge instruction is
; expected afterwards.
1220define void @masked_gather_passthru_0(ptr %a, ptr %b) vscale_range(16,0) #0 {
1221; CHECK-LABEL: masked_gather_passthru_0:
1222; CHECK:       // %bb.0:
1223; CHECK-NEXT:    ptrue p0.s, vl32
1224; CHECK-NEXT:    ptrue p1.d, vl32
1225; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1226; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
1227; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
1228; CHECK-NEXT:    punpklo p0.h, p0.b
1229; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
1230; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
1231; CHECK-NEXT:    ret
1232  %cvals = load <32 x float>, ptr %a
1233  %ptrs = load <32 x ptr>, ptr %b
1234  %mask = fcmp oeq <32 x float> %cvals, zeroinitializer
1235  %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> zeroinitializer)
1236  store <32 x float> %vals, ptr %a
1237  ret void
1238}
1239
; Declarations of the llvm.masked.gather overloads, grouped by element type
; (i8, i16, i32, i64, half, float, double). Each takes, in order: the vector of
; pointers, an i32 alignment, the <N x i1> lane mask, and the passthru vector,
; and returns a vector of the element type.
1240declare <2 x i8> @llvm.masked.gather.v2i8(<2 x ptr>, i32, <2 x i1>, <2 x i8>)
1241declare <4 x i8> @llvm.masked.gather.v4i8(<4 x ptr>, i32, <4 x i1>, <4 x i8>)
1242declare <8 x i8> @llvm.masked.gather.v8i8(<8 x ptr>, i32, <8 x i1>, <8 x i8>)
1243declare <16 x i8> @llvm.masked.gather.v16i8(<16 x ptr>, i32, <16 x i1>, <16 x i8>)
1244declare <32 x i8> @llvm.masked.gather.v32i8(<32 x ptr>, i32, <32 x i1>, <32 x i8>)
1245
1246declare <2 x i16> @llvm.masked.gather.v2i16(<2 x ptr>, i32, <2 x i1>, <2 x i16>)
1247declare <4 x i16> @llvm.masked.gather.v4i16(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
1248declare <8 x i16> @llvm.masked.gather.v8i16(<8 x ptr>, i32, <8 x i1>, <8 x i16>)
1249declare <16 x i16> @llvm.masked.gather.v16i16(<16 x ptr>, i32, <16 x i1>, <16 x i16>)
1250declare <32 x i16> @llvm.masked.gather.v32i16(<32 x ptr>, i32, <32 x i1>, <32 x i16>)
1251
1252declare <2 x i32> @llvm.masked.gather.v2i32(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
1253declare <4 x i32> @llvm.masked.gather.v4i32(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
1254declare <8 x i32> @llvm.masked.gather.v8i32(<8 x ptr>, i32, <8 x i1>, <8 x i32>)
1255declare <16 x i32> @llvm.masked.gather.v16i32(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
1256declare <32 x i32> @llvm.masked.gather.v32i32(<32 x ptr>, i32, <32 x i1>, <32 x i32>)
1257
1258declare <1 x i64> @llvm.masked.gather.v1i64(<1 x ptr>, i32, <1 x i1>, <1 x i64>)
1259declare <2 x i64> @llvm.masked.gather.v2i64(<2 x ptr>, i32, <2 x i1>, <2 x i64>)
1260declare <4 x i64> @llvm.masked.gather.v4i64(<4 x ptr>, i32, <4 x i1>, <4 x i64>)
1261declare <8 x i64> @llvm.masked.gather.v8i64(<8 x ptr>, i32, <8 x i1>, <8 x i64>)
1262declare <16 x i64> @llvm.masked.gather.v16i64(<16 x ptr>, i32, <16 x i1>, <16 x i64>)
1263declare <32 x i64> @llvm.masked.gather.v32i64(<32 x ptr>, i32, <32 x i1>, <32 x i64>)
1264
1265declare <2 x half> @llvm.masked.gather.v2f16(<2 x ptr>, i32, <2 x i1>, <2 x half>)
1266declare <4 x half> @llvm.masked.gather.v4f16(<4 x ptr>, i32, <4 x i1>, <4 x half>)
1267declare <8 x half> @llvm.masked.gather.v8f16(<8 x ptr>, i32, <8 x i1>, <8 x half>)
1268declare <16 x half> @llvm.masked.gather.v16f16(<16 x ptr>, i32, <16 x i1>, <16 x half>)
1269declare <32 x half> @llvm.masked.gather.v32f16(<32 x ptr>, i32, <32 x i1>, <32 x half>)
1270
1271declare <2 x float> @llvm.masked.gather.v2f32(<2 x ptr>, i32, <2 x i1>, <2 x float>)
1272declare <4 x float> @llvm.masked.gather.v4f32(<4 x ptr>, i32, <4 x i1>, <4 x float>)
1273declare <8 x float> @llvm.masked.gather.v8f32(<8 x ptr>, i32, <8 x i1>, <8 x float>)
1274declare <16 x float> @llvm.masked.gather.v16f32(<16 x ptr>, i32, <16 x i1>, <16 x float>)
1275declare <32 x float> @llvm.masked.gather.v32f32(<32 x ptr>, i32, <32 x i1>, <32 x float>)
1276
1277declare <1 x double> @llvm.masked.gather.v1f64(<1 x ptr>, i32, <1 x i1>, <1 x double>)
1278declare <2 x double> @llvm.masked.gather.v2f64(<2 x ptr>, i32, <2 x i1>, <2 x double>)
1279declare <4 x double> @llvm.masked.gather.v4f64(<4 x ptr>, i32, <4 x i1>, <4 x double>)
1280declare <8 x double> @llvm.masked.gather.v8f64(<8 x ptr>, i32, <8 x i1>, <8 x double>)
1281declare <16 x double> @llvm.masked.gather.v16f64(<16 x ptr>, i32, <16 x i1>, <16 x double>)
1282declare <32 x double> @llvm.masked.gather.v32f64(<32 x ptr>, i32, <32 x i1>, <32 x double>)
1283
; Attribute group referenced as #0 by the test functions: enables the SVE
; target feature.
1284attributes #0 = { "target-features"="+sve" }
1285