; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; Masked Loads
;

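; For fixed-length vectors that fit in a NEON register the compare is done in
; NEON and its result is moved into an SVE predicate via cmpne before the
; masked load.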
define <2 x half> @masked_load_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s1, [x0]
; CHECK-NEXT:    ldr s2, [x1]
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    fcmeq v1.4h, v1.4h, v2.4h
; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-NEXT:    mov v0.h[0], v1.h[0]
; CHECK-NEXT:    mov w8, v1.s[1]
; CHECK-NEXT:    mov v0.h[1], w8
; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <2 x half>, ptr %ap
  %b = load <2 x half>, ptr %bp
  %mask = fcmp oeq <2 x half> %a, %b
  %load = call <2 x half> @llvm.masked.load.v2f16(ptr %ap, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer)
  ret <2 x half> %load
}

define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    fcmeq v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %ap
  %b = load <2 x float>, ptr %bp
  %mask = fcmp oeq <2 x float> %a, %b
  %load = call <2 x float> @llvm.masked.load.v2f32(ptr %ap, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer)
  ret <2 x float> %load
}

define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 {
; CHECK-LABEL: masked_load_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <4 x float>, ptr %ap
  %b = load <4 x float>, ptr %bp
  %mask = fcmp oeq <4 x float> %a, %b
  %load = call <4 x float> @llvm.masked.load.v4f32(ptr %ap, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
  ret <4 x float> %load
}

define void @masked_load_v8f32(ptr %ap, ptr %bp, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %a = load <8 x float>, ptr %ap
  %b = load <8 x float>, ptr %bp
  %mask = fcmp oeq <8 x float> %a, %b
  %load = call <8 x float> @llvm.masked.load.v8f32(ptr %ap, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer)
  store <8 x float> %load, ptr %c
  ret void
}

define void @masked_load_v16f32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z2.s, z3.s
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x float>, ptr %ap
  %b = load <16 x float>, ptr %bp
  %mask = fcmp oeq <16 x float> %a, %b
  %load = call <16 x float> @llvm.masked.load.v16f32(ptr %ap, i32 8, <16 x i1> %mask, <16 x float> zeroinitializer)
  store <16 x float> %load, ptr %c
  ret void
}

define void @masked_load_v32f32(ptr %ap, ptr %bp, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_load_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %a = load <32 x float>, ptr %ap
  %b = load <32 x float>, ptr %bp
  %mask = fcmp oeq <32 x float> %a, %b
  %load = call <32 x float> @llvm.masked.load.v32f32(ptr %ap, i32 8, <32 x i1> %mask, <32 x float> zeroinitializer)
  store <32 x float> %load, ptr %c
  ret void
}

define void @masked_load_v64f32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %a = load <64 x float>, ptr %ap
  %b = load <64 x float>, ptr %bp
  %mask = fcmp oeq <64 x float> %a, %b
  %load = call <64 x float> @llvm.masked.load.v64f32(ptr %ap, i32 8, <64 x i1> %mask, <64 x float> zeroinitializer)
  store <64 x float> %load, ptr %c
  ret void
}

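; Integer element types. Below the required vector length the operation is
; split across two registers (note the x8 offset in the VBITS_GE_256 code).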
define void @masked_load_v64i8(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
; VBITS_GE_256-NEXT:    cmpeq p2.b, p0/z, z2.b, z3.b
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x2, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <64 x i8>, ptr %ap
  %b = load <64 x i8>, ptr %bp
  %mask = icmp eq <64 x i8> %a, %b
  %load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
  store <64 x i8> %load, ptr %c
  ret void
}

define void @masked_load_v32i16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z2.h, z3.h
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i16>, ptr %ap
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %a, %b
  %load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
  store <32 x i16> %load, ptr %c
  ret void
}

define void @masked_load_v16i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z2.s, z3.s
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %ap
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %a, %b
  %load = call <16 x i32> @llvm.masked.load.v16i32(ptr %ap, i32 8, <16 x i1> %mask, <16 x i32> undef)
  store <16 x i32> %load, ptr %c
  ret void
}

define void @masked_load_v8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z2.d, z3.d
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %load = call <8 x i64> @llvm.masked.load.v8i64(ptr %ap, i32 8, <8 x i1> %mask, <8 x i64> undef)
  store <8 x i64> %load, ptr %c
  ret void
}

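; A non-zero passthru cannot be folded into the load, so the inactive lanes
; are merged with an explicit sel.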
define void @masked_load_passthru_v8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_passthru_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z2.d, z3.d
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p2/z, [x0]
; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_256-NEXT:    sel z1.d, p2, z2.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_passthru_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %load = call <8 x i64> @llvm.masked.load.v8i64(ptr %ap, i32 8, <8 x i1> %mask, <8 x i64> %b)
  store <8 x i64> %load, ptr %c
  ret void
}

define void @masked_load_passthru_v8f64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_passthru_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z2.d, z3.d
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p2/z, [x0]
; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_256-NEXT:    sel z1.d, p2, z2.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_passthru_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x double>, ptr %ap
  %b = load <8 x double>, ptr %bp
  %mask = fcmp oeq <8 x double> %a, %b
  %load = call <8 x double> @llvm.masked.load.v8f64(ptr %ap, i32 8, <8 x i1> %mask, <8 x double> %b)
  store <8 x double> %load, ptr %c
  ret void
}

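; Masked load + sign extend. Once the extended type fits a single register the
; extension folds into the load itself (ld1sb/ld1sh/ld1sw); otherwise the
; loaded halves are widened with sunpklo.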
define void @masked_load_sext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1b { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <32 x i8>, ptr %bp
  %mask = icmp eq <32 x i8> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = sext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    cmeq v0.16b, v0.16b, #0
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1b { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i8>, ptr %bp
  %mask = icmp eq <16 x i8> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = sext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr d0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v0.8b, v0.8b, #0
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1b { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i8>, ptr %bp
  %mask = icmp eq <8 x i8> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = sext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i16>, ptr %bp
  %mask = icmp eq <16 x i16> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = sext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v0.8h, v0.8h, #0
; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sh { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i16>, ptr %bp
  %mask = icmp eq <8 x i16> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = sext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i32>, ptr %bp
  %mask = icmp eq <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = sext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

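; Masked load + zero extend, using the unsigned extending forms
; (ld1b/ld1h/ld1w) and uunpklo where a split is needed.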
define void @masked_load_zext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1b { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT:    ld1b { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <32 x i8>, ptr %bp
  %mask = icmp eq <32 x i8> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = zext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    cmeq v0.16b, v0.16b, #0
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1b { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1b { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i8>, ptr %bp
  %mask = icmp eq <16 x i8> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = zext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr d0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v0.8b, v0.8b, #0
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1b { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1b { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i8>, ptr %bp
  %mask = icmp eq <8 x i8> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = zext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i16>, ptr %bp
  %mask = icmp eq <16 x i16> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = zext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v0.8h, v0.8h, #0
; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i16>, ptr %bp
  %mask = icmp eq <8 x i16> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = zext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i32>, ptr %bp
  %mask = icmp eq <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = zext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

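; As above, but the mask is computed at the extended element width and has to
; be narrowed (uzp1/splice) to the load's element width first.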
define void @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16_m16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z1.h, #0
; VBITS_GE_256-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    splice z1.b, p1, z1.b, z0.b
; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16_m16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = sext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32_m32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32_m32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = sext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.b, vl8
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = sext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32_m32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    sunpklo z0.h, z1.b
; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32_m32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = sext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sh { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = sext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = sext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

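; Zero-extending variants of the wide-mask tests above.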
1032define void @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 {
1033; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16_m16:
1034; VBITS_GE_256:       // %bb.0:
1035; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
1036; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
1037; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
1038; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
1039; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
1040; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z1.h, #0
1041; VBITS_GE_256-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
1042; VBITS_GE_256-NEXT:    mov z1.h, p2/z, #-1 // =0xffffffffffffffff
1043; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
1044; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
1045; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
1046; VBITS_GE_256-NEXT:    splice z1.b, p1, z1.b, z0.b
1047; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
1048; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
1049; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
1050; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
1051; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
1052; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
1053; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
1054; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
1055; VBITS_GE_256-NEXT:    ret
1056;
1057; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16_m16:
1058; VBITS_GE_512:       // %bb.0:
1059; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
1060; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x1]
1061; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
1062; VBITS_GE_512-NEXT:    ld1b { z0.h }, p1/z, [x0]
1063; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
1064; VBITS_GE_512-NEXT:    ret
1065  %b = load <32 x i16>, ptr %bp
1066  %mask = icmp eq <32 x i16> %b, zeroinitializer
1067  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
1068  %ext = zext <32 x i8> %load to <32 x i16>
1069  store <32 x i16> %ext, ptr %c
1070  ret void
1071}
1072
1073define void @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
1074; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32_m32:
1075; VBITS_GE_256:       // %bb.0:
1076; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
1077; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
1078; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
1079; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
1080; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
1081; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
1082; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
1083; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
1084; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
1085; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
1086; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
1087; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
1088; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
1089; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
1090; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
1091; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
1092; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
1093; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
1094; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
1095; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
1096; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
1097; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
1098; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
1099; VBITS_GE_256-NEXT:    ret
1100;
1101; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32_m32:
1102; VBITS_GE_512:       // %bb.0:
1103; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
1104; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x1]
1105; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
1106; VBITS_GE_512-NEXT:    ld1b { z0.s }, p1/z, [x0]
1107; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
1108; VBITS_GE_512-NEXT:    ret
1109  %b = load <16 x i32>, ptr %bp
1110  %mask = icmp eq <16 x i32> %b, zeroinitializer
1111  %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
1112  %ext = zext <16 x i8> %load to <16 x i32>
1113  store <16 x i32> %ext, ptr %c
1114  ret void
1115}
1116
1117define void @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
1118; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64_m64:
1119; VBITS_GE_256:       // %bb.0:
1120; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
1121; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
1122; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
1123; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
1124; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
1125; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
1126; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
1127; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
1128; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
1129; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
1130; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
1131; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
1132; VBITS_GE_256-NEXT:    ptrue p1.b, vl8
1133; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
1134; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
1135; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z0.b, #0
1136; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
1137; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
1138; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
1139; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
1140; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
1141; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
1142; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
1143; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
1144; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
1145; VBITS_GE_256-NEXT:    ret
1146;
1147; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64_m64:
1148; VBITS_GE_512:       // %bb.0:
1149; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
1150; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
1151; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
1152; VBITS_GE_512-NEXT:    ld1b { z0.d }, p1/z, [x0]
1153; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
1154; VBITS_GE_512-NEXT:    ret
1155  %b = load <8 x i64>, ptr %bp
1156  %mask = icmp eq <8 x i64> %b, zeroinitializer
1157  %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
1158  %ext = zext <8 x i8> %load to <8 x i64>
1159  store <8 x i64> %ext, ptr %c
1160  ret void
1161}
1162
1163define void @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
1164; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32_m32:
1165; VBITS_GE_256:       // %bb.0:
1166; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
1167; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
1168; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
1169; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
1170; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
1171; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
1172; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
1173; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
1174; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
1175; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
1176; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
1177; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
1178; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
1179; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
1180; VBITS_GE_256-NEXT:    sunpklo z0.h, z1.b
1181; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
1182; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
1183; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
1184; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
1185; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
1186; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
1187; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
1188; VBITS_GE_256-NEXT:    ret
1189;
1190; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32_m32:
1191; VBITS_GE_512:       // %bb.0:
1192; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
1193; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x1]
1194; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
1195; VBITS_GE_512-NEXT:    ld1h { z0.s }, p1/z, [x0]
1196; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
1197; VBITS_GE_512-NEXT:    ret
1198  %b = load <16 x i32>, ptr %bp
1199  %mask = icmp eq <16 x i32> %b, zeroinitializer
1200  %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
1201  %ext = zext <16 x i16> %load to <16 x i32>
1202  store <16 x i32> %ext, ptr %c
1203  ret void
1204}
1205
define void @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = zext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = zext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

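; The vscale_range(16,0) sext tests below guarantee a minimum SVE register
; length of 2048 bits, so each fixed-length vector fits in a single register
; and the sign extension folds into the masked load (ld1sb/ld1sh/ld1sw).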
define void @masked_load_sext_v128i8i16(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v128i8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1sb { z0.h }, p1/z, [x0]
; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <128 x i8>, ptr %bp
  %mask = icmp eq <128 x i8> %b, zeroinitializer
  %load = call <128 x i8> @llvm.masked.load.v128i8(ptr %ap, i32 8, <128 x i1> %mask, <128 x i8> undef)
  %ext = sext <128 x i8> %load to <128 x i16>
  store <128 x i16> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v64i8i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v64i8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1sb { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <64 x i8>, ptr %bp
  %mask = icmp eq <64 x i8> %b, zeroinitializer
  %load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
  %ext = sext <64 x i8> %load to <64 x i32>
  store <64 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v32i8i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v32i8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1sb { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i8>, ptr %bp
  %mask = icmp eq <32 x i8> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = sext <32 x i8> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v64i16i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v64i16i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1sh { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <64 x i16>, ptr %bp
  %mask = icmp eq <64 x i16> %b, zeroinitializer
  %load = call <64 x i16> @llvm.masked.load.v64i16(ptr %ap, i32 8, <64 x i1> %mask, <64 x i16> undef)
  %ext = sext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v32i16i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v32i16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1sh { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %b, zeroinitializer
  %load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
  %ext = sext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v32i32i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v32i32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1sw { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i32>, ptr %bp
  %mask = icmp eq <32 x i32> %b, zeroinitializer
  %load = call <32 x i32> @llvm.masked.load.v32i32(ptr %ap, i32 8, <32 x i1> %mask, <32 x i32> undef)
  %ext = sext <32 x i32> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}

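; As above, but for zero extension the unsigned extending forms
; (ld1b/ld1h/ld1w) are selected instead of ld1sb/ld1sh/ld1sw.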
define void @masked_load_zext_v128i8i16(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v128i8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1b { z0.h }, p1/z, [x0]
; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <128 x i8>, ptr %bp
  %mask = icmp eq <128 x i8> %b, zeroinitializer
  %load = call <128 x i8> @llvm.masked.load.v128i8(ptr %ap, i32 8, <128 x i1> %mask, <128 x i8> undef)
  %ext = zext <128 x i8> %load to <128 x i16>
  store <128 x i16> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v64i8i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v64i8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1b { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <64 x i8>, ptr %bp
  %mask = icmp eq <64 x i8> %b, zeroinitializer
  %load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
  %ext = zext <64 x i8> %load to <64 x i32>
  store <64 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v32i8i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1b { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i8>, ptr %bp
  %mask = icmp eq <32 x i8> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = zext <32 x i8> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v64i16i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v64i16i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <64 x i16>, ptr %bp
  %mask = icmp eq <64 x i16> %b, zeroinitializer
  %load = call <64 x i16> @llvm.masked.load.v64i16(ptr %ap, i32 8, <64 x i1> %mask, <64 x i16> undef)
  %ext = zext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v32i16i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1h { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %b, zeroinitializer
  %load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
  %ext = zext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v32i32i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1w { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i32>, ptr %bp
  %mask = icmp eq <32 x i32> %b, zeroinitializer
  %load = call <32 x i32> @llvm.masked.load.v32i32(ptr %ap, i32 8, <32 x i1> %mask, <32 x i32> undef)
  %ext = zext <32 x i32> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}

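; The mask need not come from an equality compare: ugt/sgt against zero lower
; to cmpne/cmpgt, and with VBITS_GE_512 the extension still folds into the
; masked load.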
define void @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_ugt_v8i32i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_ugt_v8i32i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i32>, ptr %bp
  %mask = icmp ugt <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = sext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_sgt_v8i32i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpgt p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_sgt_v8i32i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpgt p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i32>, ptr %bp
  %mask = icmp sgt <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = zext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

declare <2 x half> @llvm.masked.load.v2f16(ptr, i32, <2 x i1>, <2 x half>)
declare <2 x float> @llvm.masked.load.v2f32(ptr, i32, <2 x i1>, <2 x float>)
declare <4 x float> @llvm.masked.load.v4f32(ptr, i32, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.load.v8f32(ptr, i32, <8 x i1>, <8 x float>)
declare <16 x float> @llvm.masked.load.v16f32(ptr, i32, <16 x i1>, <16 x float>)
declare <32 x float> @llvm.masked.load.v32f32(ptr, i32, <32 x i1>, <32 x float>)
declare <64 x float> @llvm.masked.load.v64f32(ptr, i32, <64 x i1>, <64 x float>)

declare <128 x i8> @llvm.masked.load.v128i8(ptr, i32, <128 x i1>, <128 x i8>)
declare <64 x i8> @llvm.masked.load.v64i8(ptr, i32, <64 x i1>, <64 x i8>)
declare <32 x i8> @llvm.masked.load.v32i8(ptr, i32, <32 x i1>, <32 x i8>)
declare <16 x i8> @llvm.masked.load.v16i8(ptr, i32, <16 x i1>, <16 x i8>)
declare <16 x i16> @llvm.masked.load.v16i16(ptr, i32, <16 x i1>, <16 x i16>)
declare <8 x i8> @llvm.masked.load.v8i8(ptr, i32, <8 x i1>, <8 x i8>)
declare <8 x i16> @llvm.masked.load.v8i16(ptr, i32, <8 x i1>, <8 x i16>)
declare <8 x i32> @llvm.masked.load.v8i32(ptr, i32, <8 x i1>, <8 x i32>)
declare <32 x i32> @llvm.masked.load.v32i32(ptr, i32, <32 x i1>, <32 x i32>)
declare <32 x i16> @llvm.masked.load.v32i16(ptr, i32, <32 x i1>, <32 x i16>)
declare <64 x i16> @llvm.masked.load.v64i16(ptr, i32, <64 x i1>, <64 x i16>)
declare <16 x i32> @llvm.masked.load.v16i32(ptr, i32, <16 x i1>, <16 x i32>)
declare <8 x i64> @llvm.masked.load.v8i64(ptr, i32, <8 x i1>, <8 x i64>)
declare <8 x double> @llvm.masked.load.v8f64(ptr, i32, <8 x i1>, <8 x double>)

attributes #0 = { "target-features"="+sve" }