; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
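; The 256-bit minimum run uses the VBITS_GE_256 prefixes, where operations
; wider than 256 bits must be split across multiple SVE registers. The 512-
; and 2048-bit minimum runs share the VBITS_GE_512 prefixes because they
; generate identical code for the fixed vector widths tested here.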

target triple = "aarch64-unknown-linux-gnu"

;
; ASHR
;

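; NEON has no right shift by a vector of (variable) shift amounts, so when a
; fixed-width vector fits in a NEON register an ashr is lowered to a negate
; of the shift amounts followed by a signed left shift (neg + sshl). Wider
; vectors use the predicated SVE asr instead.
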
; Don't use SVE for 64-bit vectors.
define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.8b, v1.8b
; CHECK-NEXT:    sshl v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = ashr <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.16b, v1.16b
; CHECK-NEXT:    sshl v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = ashr <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @ashr_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = ashr <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

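; With a 256-bit minimum, the 512-bit operation is split into two 256-bit
; halves, the upper half addressed via the offset in x8. The movprfx is
; needed because the predicated asr destructively updates its first source
; register. With a 512-bit (or larger) minimum a single asr suffices.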
define void @ashr_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: ashr_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    asr z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    asr z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: ashr_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    asr z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = ashr <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @ashr_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: ashr_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = ashr <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @ashr_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: ashr_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = ashr <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.4h, v1.4h
; CHECK-NEXT:    sshl v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = ashr <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.8h, v1.8h
; CHECK-NEXT:    sshl v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = ashr <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @ashr_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = ashr <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @ashr_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: ashr_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    asr z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    asr z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: ashr_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    asr z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = ashr <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @ashr_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: ashr_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = ashr <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @ashr_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: ashr_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = ashr <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.2s, v1.2s
; CHECK-NEXT:    sshl v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = ashr <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.4s, v1.4s
; CHECK-NEXT:    sshl v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = ashr <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @ashr_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = ashr <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @ashr_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: ashr_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    asr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    asr z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: ashr_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    asr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = ashr <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @ashr_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: ashr_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = ashr <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @ashr_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: ashr_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = ashr <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg d1, d1
; CHECK-NEXT:    sshl d0, d0, d1
; CHECK-NEXT:    ret
  %res = ashr <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.2d, v1.2d
; CHECK-NEXT:    sshl v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = ashr <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @ashr_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = ashr <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @ashr_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: ashr_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    asr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    asr z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: ashr_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    asr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = ashr <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @ashr_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: ashr_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = ashr <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @ashr_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: ashr_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = ashr <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; LSHR
;

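; As with ashr, the NEON lowering negates the shift amounts, but pairs the
; negate with the unsigned shift (neg + ushl) to produce a logical right
; shift. Wider vectors use the predicated SVE lsr.
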
; Don't use SVE for 64-bit vectors.
define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.8b, v1.8b
; CHECK-NEXT:    ushl v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = lshr <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.16b, v1.16b
; CHECK-NEXT:    ushl v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = lshr <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @lshr_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = lshr <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @lshr_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: lshr_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsr z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: lshr_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = lshr <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @lshr_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: lshr_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = lshr <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @lshr_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: lshr_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = lshr <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.4h, v1.4h
; CHECK-NEXT:    ushl v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = lshr <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.8h, v1.8h
; CHECK-NEXT:    ushl v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = lshr <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @lshr_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = lshr <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @lshr_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: lshr_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsr z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: lshr_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = lshr <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @lshr_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: lshr_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = lshr <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @lshr_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: lshr_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = lshr <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.2s, v1.2s
; CHECK-NEXT:    ushl v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = lshr <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.4s, v1.4s
; CHECK-NEXT:    ushl v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = lshr <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @lshr_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = lshr <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @lshr_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: lshr_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsr z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: lshr_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = lshr <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @lshr_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: lshr_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = lshr <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @lshr_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: lshr_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = lshr <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg d1, d1
; CHECK-NEXT:    ushl d0, d0, d1
; CHECK-NEXT:    ret
  %res = lshr <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.2d, v1.2d
; CHECK-NEXT:    ushl v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = lshr <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @lshr_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = lshr <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @lshr_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: lshr_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsr z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: lshr_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = lshr <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @lshr_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: lshr_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = lshr <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @lshr_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: lshr_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = lshr <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; SHL
;

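; Left shifts need no negation: shl maps directly to NEON's ushl when the
; vector fits in a NEON register, and to the predicated SVE lsl for wider
; vectors.
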
; Don't use SVE for 64-bit vectors.
define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = shl <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = shl <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @shl_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = shl <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @shl_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: shl_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsl z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: shl_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = shl <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @shl_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: shl_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = shl <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @shl_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: shl_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = shl <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = shl <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = shl <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @shl_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = shl <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @shl_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: shl_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsl z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: shl_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = shl <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @shl_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: shl_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = shl <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @shl_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: shl_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = shl <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = shl <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = shl <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @shl_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = shl <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @shl_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: shl_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsl z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: shl_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = shl <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @shl_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: shl_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = shl <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @shl_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: shl_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = shl <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl d0, d0, d1
; CHECK-NEXT:    ret
  %res = shl <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = shl <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @shl_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = shl <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @shl_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: shl_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsl z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: shl_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = shl <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @shl_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: shl_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = shl <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @shl_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: shl_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = shl <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }