; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; ADD
;
; Don't use SVE for 64-bit vectors.
define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = add <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = add <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

; Vectors wider than 128 bits use a predicated SVE load/add/store.
define void @add_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    add z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = add <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

; Split into two halves when only 256-bit SVE is guaranteed; one op at 512+.
define void @add_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    add z0.b, z0.b, z1.b
; VBITS_GE_256-NEXT:    add z1.b, z2.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: add_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    add z0.b, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = add <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @add_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    add z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = add <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @add_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    add z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = add <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = add <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = add <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @add_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    add z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = add <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

; Split into two halves when only 256-bit SVE is guaranteed; one op at 512+.
define void @add_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    add z0.h, z0.h, z1.h
; VBITS_GE_256-NEXT:    add z1.h, z2.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: add_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    add z0.h, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = add <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @add_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    add z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = add <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @add_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    add z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = add <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = add <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = add <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @add_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    add z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = add <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

; Split into two halves when only 256-bit SVE is guaranteed; one op at 512+.
define void @add_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    add z0.s, z0.s, z1.s
; VBITS_GE_256-NEXT:    add z1.s, z2.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: add_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    add z0.s, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = add <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @add_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    add z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = add <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @add_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    add z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = add <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add d0, d0, d1
; CHECK-NEXT:    ret
  %res = add <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = add <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @add_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    add z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = add <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

; Split into two halves when only 256-bit SVE is guaranteed; one op at 512+.
define void @add_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    add z0.d, z0.d, z1.d
; VBITS_GE_256-NEXT:    add z1.d, z2.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: add_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    add z0.d, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = add <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @add_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    add z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = add <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

; Vector is wider than the guaranteed SVE length, so it is processed in two halves.
define void @add_v32i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    mov x8, #16 // =0x10
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x1]
; CHECK-NEXT:    add z0.d, z0.d, z1.d
; CHECK-NEXT:    add z1.d, z2.d, z3.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = add <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; MUL
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = mul <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = mul <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @mul_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = mul <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

; Split into two halves when only 256-bit SVE is guaranteed; one op at 512+.
define void @mul_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    mul z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: mul_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = mul <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @mul_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = mul <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @mul_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = mul <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = mul <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = mul <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @mul_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = mul <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

; Split into two halves when only 256-bit SVE is guaranteed; one op at 512+.
define void @mul_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    mul z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: mul_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = mul <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @mul_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = mul <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @mul_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = mul <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = mul <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = mul <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @mul_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = mul <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

; Split into two halves when only 256-bit SVE is guaranteed; one op at 512+.
define void @mul_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    mul z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: mul_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = mul <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @mul_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = mul <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @mul_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = mul <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Uses an SVE multiply even for this 64-bit vector (ptrue vl1 + z-register mul).
define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: mul_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = mul <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Uses an SVE multiply even for this 128-bit vector (ptrue vl2 + z-register mul).
define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: mul_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = mul <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @mul_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = mul <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

; Split into two halves when only 256-bit SVE is guaranteed; one op at 512+.
define void @mul_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    mul z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: mul_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = mul <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @mul_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = mul <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @mul_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = mul <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; SUB
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = sub <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = sub <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @sub_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sub z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = sub <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

; Split into two halves when only 256-bit SVE is guaranteed; one op at 512+.
define void @sub_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sub z0.b, z0.b, z1.b
; VBITS_GE_256-NEXT:    sub z1.b, z2.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sub_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sub z0.b, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = sub <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @sub_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sub z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = sub <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @sub_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sub z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = sub <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
; A 64-bit vector fits a NEON D register.
define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = sub <4 x i16> %op1, %op2
  ret <4 x i16> %res
}
926
; Don't use SVE for 128-bit vectors.
; A 128-bit vector fits a NEON Q register.
define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = sub <8 x i16> %op1, %op2
  ret <8 x i16> %res
}
936
; 256-bit vector handled by a single predicated SVE sub.
define void @sub_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sub z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = sub <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}
952
; 512-bit sub: two 256-bit halves at VBITS_GE_256, one predicated op at VBITS_GE_512.
define void @sub_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sub z0.h, z0.h, z1.h
; VBITS_GE_256-NEXT:    sub z1.h, z2.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sub_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sub z0.h, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = sub <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}
982
; vscale_range(8,0) guarantees >=1024-bit vectors, so one predicated sub suffices.
define void @sub_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sub z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = sub <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}
998
; vscale_range(16,0) guarantees >=2048-bit vectors, so one predicated sub suffices.
define void @sub_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sub z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = sub <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}
1014
; Don't use SVE for 64-bit vectors.
; A 64-bit vector fits a NEON D register.
define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = sub <2 x i32> %op1, %op2
  ret <2 x i32> %res
}
1024
; Don't use SVE for 128-bit vectors.
; A 128-bit vector fits a NEON Q register.
define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = sub <4 x i32> %op1, %op2
  ret <4 x i32> %res
}
1034
; 256-bit vector handled by a single predicated SVE sub.
define void @sub_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    sub z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = sub <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}
1050
; 512-bit sub: two 256-bit halves at VBITS_GE_256, one predicated op at VBITS_GE_512.
define void @sub_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sub z0.s, z0.s, z1.s
; VBITS_GE_256-NEXT:    sub z1.s, z2.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sub_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sub z0.s, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = sub <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}
1080
; vscale_range(8,0) guarantees >=1024-bit vectors, so one predicated sub suffices.
define void @sub_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    sub z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = sub <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}
1096
; vscale_range(16,0) guarantees >=2048-bit vectors, so one predicated sub suffices.
define void @sub_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    sub z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = sub <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}
1112
; Don't use SVE for 64-bit vectors.
; A single i64 element is subtracted on the scalar d register.
define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub d0, d0, d1
; CHECK-NEXT:    ret
  %res = sub <1 x i64> %op1, %op2
  ret <1 x i64> %res
}
1122
; Don't use SVE for 128-bit vectors.
; A 128-bit vector fits a NEON Q register.
define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = sub <2 x i64> %op1, %op2
  ret <2 x i64> %res
}
1132
; 256-bit vector handled by a single predicated SVE sub.
define void @sub_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    sub z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = sub <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}
1148
; 512-bit sub: two 256-bit halves at VBITS_GE_256, one predicated op at VBITS_GE_512.
define void @sub_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sub z0.d, z0.d, z1.d
; VBITS_GE_256-NEXT:    sub z1.d, z2.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sub_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sub z0.d, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = sub <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}
1178
; vscale_range(8,0) guarantees >=1024-bit vectors, so one predicated sub suffices.
define void @sub_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    sub z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = sub <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}
1194
; vscale_range(16,0) guarantees >=2048-bit vectors, so one predicated sub suffices.
define void @sub_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    sub z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = sub <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}
1210
1211
1212;
1213; ABS
1214;
1215
; Don't use SVE for 64-bit vectors.
; A 64-bit vector fits a NEON D register.
define <8 x i8> @abs_v8i8(<8 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs v0.8b, v0.8b
; CHECK-NEXT:    ret
  %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false)
  ret <8 x i8> %res
}
1225
; Don't use SVE for 128-bit vectors.
; A 128-bit vector fits a NEON Q register.
define <16 x i8> @abs_v16i8(<16 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs v0.16b, v0.16b
; CHECK-NEXT:    ret
  %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false)
  ret <16 x i8> %res
}
1235
; 256-bit vector handled by a single merging-predicated SVE abs.
define void @abs_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    abs z0.b, p0/m, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false)
  store <32 x i8> %res, ptr %a
  ret void
}
1249
; 512-bit abs: two 256-bit halves at VBITS_GE_256, one predicated op at VBITS_GE_512.
define void @abs_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: abs_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    abs z0.b, p0/m, z0.b
; VBITS_GE_256-NEXT:    abs z1.b, p0/m, z1.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: abs_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    abs z0.b, p0/m, z0.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %res = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %op1, i1 false)
  store <64 x i8> %res, ptr %a
  ret void
}
1275
; vscale_range(8,0) guarantees >=1024-bit vectors, so one predicated abs suffices.
define void @abs_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    abs z0.b, p0/m, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %res = call <128 x i8> @llvm.abs.v128i8(<128 x i8> %op1, i1 false)
  store <128 x i8> %res, ptr %a
  ret void
}
1289
; vscale_range(16,0) guarantees >=2048-bit vectors, so one predicated abs suffices.
define void @abs_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    abs z0.b, p0/m, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %res = call <256 x i8> @llvm.abs.v256i8(<256 x i8> %op1, i1 false)
  store <256 x i8> %res, ptr %a
  ret void
}
1303
; Don't use SVE for 64-bit vectors.
; A 64-bit vector fits a NEON D register.
define <4 x i16> @abs_v4i16(<4 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs v0.4h, v0.4h
; CHECK-NEXT:    ret
  %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false)
  ret <4 x i16> %res
}
1313
; Don't use SVE for 128-bit vectors.
; A 128-bit vector fits a NEON Q register.
define <8 x i16> @abs_v8i16(<8 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs v0.8h, v0.8h
; CHECK-NEXT:    ret
  %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false)
  ret <8 x i16> %res
}
1323
; 256-bit vector handled by a single merging-predicated SVE abs.
define void @abs_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    abs z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false)
  store <16 x i16> %res, ptr %a
  ret void
}
1337
; Only 256-bit SVE guaranteed here, so the vector is processed in two vl16 chunks.
define void @abs_v32i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    mov x8, #16 // =0x10
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
; CHECK-NEXT:    abs z0.h, p0/m, z0.h
; CHECK-NEXT:    abs z1.h, p0/m, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %res = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %op1, i1 false)
  store <32 x i16> %res, ptr %a
  ret void
}
1355
; Only 256-bit SVE guaranteed here, so the vector is processed in four vl16 chunks.
define void @abs_v64i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    mov x8, #32 // =0x20
; CHECK-NEXT:    mov x9, #48 // =0x30
; CHECK-NEXT:    mov x10, #16 // =0x10
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x0]
; CHECK-NEXT:    abs z0.h, p0/m, z0.h
; CHECK-NEXT:    abs z1.h, p0/m, z1.h
; CHECK-NEXT:    abs z2.h, p0/m, z2.h
; CHECK-NEXT:    abs z3.h, p0/m, z3.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; CHECK-NEXT:    st1h { z1.h }, p0, [x0, x9, lsl #1]
; CHECK-NEXT:    st1h { z2.h }, p0, [x0, x10, lsl #1]
; CHECK-NEXT:    st1h { z3.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %res = call <64 x i16> @llvm.abs.v64i16(<64 x i16> %op1, i1 false)
  store <64 x i16> %res, ptr %a
  ret void
}
1381
; Only 256-bit SVE guaranteed here: eight vl16 chunks; register pressure
; introduces movprfx copies in the generated code.
define void @abs_v128i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    mov x8, #96 // =0x60
; CHECK-NEXT:    mov x9, #112 // =0x70
; CHECK-NEXT:    mov x10, #64 // =0x40
; CHECK-NEXT:    mov x11, #80 // =0x50
; CHECK-NEXT:    mov x12, #32 // =0x20
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; CHECK-NEXT:    mov x13, #48 // =0x30
; CHECK-NEXT:    mov x14, #16 // =0x10
; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
; CHECK-NEXT:    ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
; CHECK-NEXT:    ld1h { z6.h }, p0/z, [x0, x14, lsl #1]
; CHECK-NEXT:    abs z0.h, p0/m, z0.h
; CHECK-NEXT:    abs z1.h, p0/m, z1.h
; CHECK-NEXT:    abs z2.h, p0/m, z2.h
; CHECK-NEXT:    abs z3.h, p0/m, z3.h
; CHECK-NEXT:    abs z4.h, p0/m, z4.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    st1h { z1.h }, p0, [x0, x9, lsl #1]
; CHECK-NEXT:    movprfx z1, z5
; CHECK-NEXT:    abs z1.h, p0/m, z5.h
; CHECK-NEXT:    st1h { z2.h }, p0, [x0, x10, lsl #1]
; CHECK-NEXT:    movprfx z2, z6
; CHECK-NEXT:    abs z2.h, p0/m, z6.h
; CHECK-NEXT:    abs z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z3.h }, p0, [x0, x11, lsl #1]
; CHECK-NEXT:    st1h { z4.h }, p0, [x0, x12, lsl #1]
; CHECK-NEXT:    st1h { z1.h }, p0, [x0, x13, lsl #1]
; CHECK-NEXT:    st1h { z2.h }, p0, [x0, x14, lsl #1]
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %res = call <128 x i16> @llvm.abs.v128i16(<128 x i16> %op1, i1 false)
  store <128 x i16> %res, ptr %a
  ret void
}
1425
; Don't use SVE for 64-bit vectors.
; A 64-bit vector fits a NEON D register.
define <2 x i32> @abs_v2i32(<2 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs v0.2s, v0.2s
; CHECK-NEXT:    ret
  %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
  ret <2 x i32> %res
}
1435
; Don't use SVE for 128-bit vectors.
; A 128-bit vector fits a NEON Q register.
define <4 x i32> @abs_v4i32(<4 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs v0.4s, v0.4s
; CHECK-NEXT:    ret
  %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
  ret <4 x i32> %res
}
1445
; 256-bit vector handled by a single merging-predicated SVE abs.
define void @abs_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    abs z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
  store <8 x i32> %res, ptr %a
  ret void
}
1459
; 512-bit abs: two 256-bit halves at VBITS_GE_256, one predicated op at VBITS_GE_512.
define void @abs_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: abs_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    abs z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT:    abs z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: abs_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    abs z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %res = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %op1, i1 false)
  store <16 x i32> %res, ptr %a
  ret void
}
1485
; vscale_range(8,0) guarantees >=1024-bit vectors, so one predicated abs suffices.
define void @abs_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    abs z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %res = call <32 x i32> @llvm.abs.v32i32(<32 x i32> %op1, i1 false)
  store <32 x i32> %res, ptr %a
  ret void
}
1499
; vscale_range(16,0) guarantees >=2048-bit vectors, so one predicated abs suffices.
define void @abs_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    abs z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %res = call <64 x i32> @llvm.abs.v64i32(<64 x i32> %op1, i1 false)
  store <64 x i32> %res, ptr %a
  ret void
}
1513
; Don't use SVE for 64-bit vectors.
; A single i64 element uses the scalar d-register abs.
define <1 x i64> @abs_v1i64(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs d0, d0
; CHECK-NEXT:    ret
  %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false)
  ret <1 x i64> %res
}
1523
; Don't use SVE for 128-bit vectors.
; A 128-bit vector fits a NEON Q register.
define <2 x i64> @abs_v2i64(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs v0.2d, v0.2d
; CHECK-NEXT:    ret
  %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
  ret <2 x i64> %res
}
1533
; 256-bit vector handled by a single merging-predicated SVE abs.
define void @abs_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    abs z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
  store <4 x i64> %res, ptr %a
  ret void
}
1547
; 512-bit abs: two 256-bit halves at VBITS_GE_256, one predicated op at VBITS_GE_512.
define void @abs_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: abs_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    abs z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT:    abs z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: abs_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    abs z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %res = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %op1, i1 false)
  store <8 x i64> %res, ptr %a
  ret void
}
1573
; vscale_range(8,0) guarantees >=1024-bit vectors, so one predicated abs suffices.
define void @abs_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    abs z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %res = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %op1, i1 false)
  store <16 x i64> %res, ptr %a
  ret void
}
1587
; vscale_range(16,0) guarantees >=2048-bit vectors, so one predicated abs suffices.
define void @abs_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    abs z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %res = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %op1, i1 false)
  store <32 x i64> %res, ptr %a
  ret void
}
1601
1602declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1)
1603declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1)
1604declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1)
1605declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1)
1606declare <128 x i8> @llvm.abs.v128i8(<128 x i8>, i1)
1607declare <256 x i8> @llvm.abs.v256i8(<256 x i8>, i1)
1608declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1)
1609declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1)
1610declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
1611declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1)
1612declare <64 x i16> @llvm.abs.v64i16(<64 x i16>, i1)
1613declare <128 x i16> @llvm.abs.v128i16(<128 x i16>, i1)
1614declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1)
1615declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
1616declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1)
1617declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
1618declare <32 x i32> @llvm.abs.v32i32(<32 x i32>, i1)
1619declare <64 x i32> @llvm.abs.v64i32(<64 x i32>, i1)
1620declare <1 x i64> @llvm.abs.v1i64(<1 x i64>, i1)
1621declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1)
1622declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
1623declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
1624declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
1625declare <32 x i64> @llvm.abs.v32i64(<32 x i64>, i1)
1626
1627attributes #0 = { "target-features"="+sve" }
1628