; xref: /llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll (revision b24af43fdfa1b1242b7cb77540462212227c57c4)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
5
6target triple = "aarch64-unknown-linux-gnu"
7
8;
9; FADD
10;
11
12; Don't use SVE for 64-bit vectors.
13define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; 64-bit vector: expect a single NEON fadd on .4h lanes, no SVE predication.
14; CHECK-LABEL: fadd_v4f16:
15; CHECK:       // %bb.0:
16; CHECK-NEXT:    fadd v0.4h, v0.4h, v1.4h
17; CHECK-NEXT:    ret
18  %res = fadd <4 x half> %op1, %op2
19  ret <4 x half> %res
20}
21
22; Don't use SVE for 128-bit vectors.
23define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; 128-bit vector: expect a single NEON fadd on .8h lanes, no SVE predication.
24; CHECK-LABEL: fadd_v8f16:
25; CHECK:       // %bb.0:
26; CHECK-NEXT:    fadd v0.8h, v0.8h, v1.8h
27; CHECK-NEXT:    ret
28  %res = fadd <8 x half> %op1, %op2
29  ret <8 x half> %res
30}
31
32define void @fadd_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; 256-bit vector: one predicated SVE fadd under a vl16 .h predicate.
33; CHECK-LABEL: fadd_v16f16:
34; CHECK:       // %bb.0:
35; CHECK-NEXT:    ptrue p0.h, vl16
36; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
37; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
38; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
39; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
40; CHECK-NEXT:    ret
41  %op1 = load <16 x half>, ptr %a
42  %op2 = load <16 x half>, ptr %b
43  %res = fadd <16 x half> %op1, %op2
44  store <16 x half> %res, ptr %a
45  ret void
46}
47
48define void @fadd_v32f16(ptr %a, ptr %b) #0 {
; No vscale_range: 256-bit SVE splits into two halves (x8 = high-half element offset),
; while >=512-bit SVE covers all 32 half lanes with one vl32 predicate.
49; VBITS_GE_256-LABEL: fadd_v32f16:
50; VBITS_GE_256:       // %bb.0:
51; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
52; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
53; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
54; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
55; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
56; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
57; VBITS_GE_256-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
58; VBITS_GE_256-NEXT:    movprfx z1, z2
59; VBITS_GE_256-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
60; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
61; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
62; VBITS_GE_256-NEXT:    ret
63;
64; VBITS_GE_512-LABEL: fadd_v32f16:
65; VBITS_GE_512:       // %bb.0:
66; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
67; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
68; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
69; VBITS_GE_512-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
70; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
71; VBITS_GE_512-NEXT:    ret
72  %op1 = load <32 x half>, ptr %a
73  %op2 = load <32 x half>, ptr %b
74  %res = fadd <32 x half> %op1, %op2
75  store <32 x half> %res, ptr %a
76  ret void
77}
78
79define void @fadd_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; 1024-bit-capable SVE: all 64 half lanes fit under one vl64 predicate.
80; CHECK-LABEL: fadd_v64f16:
81; CHECK:       // %bb.0:
82; CHECK-NEXT:    ptrue p0.h, vl64
83; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
84; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
85; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
86; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
87; CHECK-NEXT:    ret
88  %op1 = load <64 x half>, ptr %a
89  %op2 = load <64 x half>, ptr %b
90  %res = fadd <64 x half> %op1, %op2
91  store <64 x half> %res, ptr %a
92  ret void
93}
94
95define void @fadd_v128f16(ptr %a, ptr %b)  vscale_range(16,0) #0 {
; 2048-bit-capable SVE: all 128 half lanes fit under one vl128 predicate.
96; CHECK-LABEL: fadd_v128f16:
97; CHECK:       // %bb.0:
98; CHECK-NEXT:    ptrue p0.h, vl128
99; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
100; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
101; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
102; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
103; CHECK-NEXT:    ret
104  %op1 = load <128 x half>, ptr %a
105  %op2 = load <128 x half>, ptr %b
106  %res = fadd <128 x half> %op1, %op2
107  store <128 x half> %res, ptr %a
108  ret void
109}
110
111; Don't use SVE for 64-bit vectors.
112define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; 64-bit vector: expect a single NEON fadd on .2s lanes, no SVE predication.
113; CHECK-LABEL: fadd_v2f32:
114; CHECK:       // %bb.0:
115; CHECK-NEXT:    fadd v0.2s, v0.2s, v1.2s
116; CHECK-NEXT:    ret
117  %res = fadd <2 x float> %op1, %op2
118  ret <2 x float> %res
119}
120
121; Don't use SVE for 128-bit vectors.
122define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; 128-bit vector: expect a single NEON fadd on .4s lanes, no SVE predication.
123; CHECK-LABEL: fadd_v4f32:
124; CHECK:       // %bb.0:
125; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
126; CHECK-NEXT:    ret
127  %res = fadd <4 x float> %op1, %op2
128  ret <4 x float> %res
129}
130
131define void @fadd_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; 256-bit vector: one predicated SVE fadd under a vl8 .s predicate.
132; CHECK-LABEL: fadd_v8f32:
133; CHECK:       // %bb.0:
134; CHECK-NEXT:    ptrue p0.s, vl8
135; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
136; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
137; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
138; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
139; CHECK-NEXT:    ret
140  %op1 = load <8 x float>, ptr %a
141  %op2 = load <8 x float>, ptr %b
142  %res = fadd <8 x float> %op1, %op2
143  store <8 x float> %res, ptr %a
144  ret void
145}
146
147define void @fadd_v16f32(ptr %a, ptr %b) #0 {
; No vscale_range: 256-bit SVE splits into two halves (x8 = high-half element offset),
; while >=512-bit SVE covers all 16 float lanes with one vl16 predicate.
148; VBITS_GE_256-LABEL: fadd_v16f32:
149; VBITS_GE_256:       // %bb.0:
150; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
151; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
152; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
153; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
154; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
155; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
156; VBITS_GE_256-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
157; VBITS_GE_256-NEXT:    movprfx z1, z2
158; VBITS_GE_256-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
159; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
160; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
161; VBITS_GE_256-NEXT:    ret
162;
163; VBITS_GE_512-LABEL: fadd_v16f32:
164; VBITS_GE_512:       // %bb.0:
165; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
166; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
167; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
168; VBITS_GE_512-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
169; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
170; VBITS_GE_512-NEXT:    ret
171  %op1 = load <16 x float>, ptr %a
172  %op2 = load <16 x float>, ptr %b
173  %res = fadd <16 x float> %op1, %op2
174  store <16 x float> %res, ptr %a
175  ret void
176}
177
178define void @fadd_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; 1024-bit-capable SVE: all 32 float lanes fit under one vl32 predicate.
179; CHECK-LABEL: fadd_v32f32:
180; CHECK:       // %bb.0:
181; CHECK-NEXT:    ptrue p0.s, vl32
182; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
183; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
184; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
185; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
186; CHECK-NEXT:    ret
187  %op1 = load <32 x float>, ptr %a
188  %op2 = load <32 x float>, ptr %b
189  %res = fadd <32 x float> %op1, %op2
190  store <32 x float> %res, ptr %a
191  ret void
192}
193
194define void @fadd_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; 2048-bit-capable SVE: all 64 float lanes fit under one vl64 predicate.
195; CHECK-LABEL: fadd_v64f32:
196; CHECK:       // %bb.0:
197; CHECK-NEXT:    ptrue p0.s, vl64
198; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
199; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
200; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
201; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
202; CHECK-NEXT:    ret
203  %op1 = load <64 x float>, ptr %a
204  %op2 = load <64 x float>, ptr %b
205  %res = fadd <64 x float> %op1, %op2
206  store <64 x float> %res, ptr %a
207  ret void
208}
209
210; Don't use SVE for 64-bit vectors.
211define <1 x double> @fadd_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; Single-element vector lowers to a scalar fadd on d registers.
212; CHECK-LABEL: fadd_v1f64:
213; CHECK:       // %bb.0:
214; CHECK-NEXT:    fadd d0, d0, d1
215; CHECK-NEXT:    ret
216  %res = fadd <1 x double> %op1, %op2
217  ret <1 x double> %res
218}
219
220; Don't use SVE for 128-bit vectors.
221define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; 128-bit vector: expect a single NEON fadd on .2d lanes, no SVE predication.
222; CHECK-LABEL: fadd_v2f64:
223; CHECK:       // %bb.0:
224; CHECK-NEXT:    fadd v0.2d, v0.2d, v1.2d
225; CHECK-NEXT:    ret
226  %res = fadd <2 x double> %op1, %op2
227  ret <2 x double> %res
228}
229
230define void @fadd_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; 256-bit vector: one predicated SVE fadd under a vl4 .d predicate.
231; CHECK-LABEL: fadd_v4f64:
232; CHECK:       // %bb.0:
233; CHECK-NEXT:    ptrue p0.d, vl4
234; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
235; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
236; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
237; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
238; CHECK-NEXT:    ret
239  %op1 = load <4 x double>, ptr %a
240  %op2 = load <4 x double>, ptr %b
241  %res = fadd <4 x double> %op1, %op2
242  store <4 x double> %res, ptr %a
243  ret void
244}
245
246define void @fadd_v8f64(ptr %a, ptr %b) #0 {
; No vscale_range: 256-bit SVE splits into two halves (x8 = high-half element offset),
; while >=512-bit SVE covers all 8 double lanes with one vl8 predicate.
247; VBITS_GE_256-LABEL: fadd_v8f64:
248; VBITS_GE_256:       // %bb.0:
249; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
250; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
251; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
252; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
253; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
254; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
255; VBITS_GE_256-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
256; VBITS_GE_256-NEXT:    movprfx z1, z2
257; VBITS_GE_256-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
258; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
259; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
260; VBITS_GE_256-NEXT:    ret
261;
262; VBITS_GE_512-LABEL: fadd_v8f64:
263; VBITS_GE_512:       // %bb.0:
264; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
265; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
266; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
267; VBITS_GE_512-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
268; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
269; VBITS_GE_512-NEXT:    ret
270  %op1 = load <8 x double>, ptr %a
271  %op2 = load <8 x double>, ptr %b
272  %res = fadd <8 x double> %op1, %op2
273  store <8 x double> %res, ptr %a
274  ret void
275}
276
277define void @fadd_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; 1024-bit-capable SVE: all 16 double lanes fit under one vl16 predicate.
278; CHECK-LABEL: fadd_v16f64:
279; CHECK:       // %bb.0:
280; CHECK-NEXT:    ptrue p0.d, vl16
281; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
282; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
283; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
284; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
285; CHECK-NEXT:    ret
286  %op1 = load <16 x double>, ptr %a
287  %op2 = load <16 x double>, ptr %b
288  %res = fadd <16 x double> %op1, %op2
289  store <16 x double> %res, ptr %a
290  ret void
291}
292
293define void @fadd_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; 2048-bit-capable SVE: all 32 double lanes fit under one vl32 predicate.
294; CHECK-LABEL: fadd_v32f64:
295; CHECK:       // %bb.0:
296; CHECK-NEXT:    ptrue p0.d, vl32
297; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
298; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
299; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
300; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
301; CHECK-NEXT:    ret
302  %op1 = load <32 x double>, ptr %a
303  %op2 = load <32 x double>, ptr %b
304  %res = fadd <32 x double> %op1, %op2
305  store <32 x double> %res, ptr %a
306  ret void
307}
308
309;
310; FDIV
311;
312
313; Don't use SVE for 64-bit vectors.
314define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; 64-bit vector: expect a single NEON fdiv on .4h lanes, no SVE predication.
315; CHECK-LABEL: fdiv_v4f16:
316; CHECK:       // %bb.0:
317; CHECK-NEXT:    fdiv v0.4h, v0.4h, v1.4h
318; CHECK-NEXT:    ret
319  %res = fdiv <4 x half> %op1, %op2
320  ret <4 x half> %res
321}
322
323; Don't use SVE for 128-bit vectors.
324define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; 128-bit vector: expect a single NEON fdiv on .8h lanes, no SVE predication.
325; CHECK-LABEL: fdiv_v8f16:
326; CHECK:       // %bb.0:
327; CHECK-NEXT:    fdiv v0.8h, v0.8h, v1.8h
328; CHECK-NEXT:    ret
329  %res = fdiv <8 x half> %op1, %op2
330  ret <8 x half> %res
331}
332
333define void @fdiv_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; 256-bit vector: one predicated SVE fdiv under a vl16 .h predicate.
334; CHECK-LABEL: fdiv_v16f16:
335; CHECK:       // %bb.0:
336; CHECK-NEXT:    ptrue p0.h, vl16
337; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
338; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
339; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
340; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
341; CHECK-NEXT:    ret
342  %op1 = load <16 x half>, ptr %a
343  %op2 = load <16 x half>, ptr %b
344  %res = fdiv <16 x half> %op1, %op2
345  store <16 x half> %res, ptr %a
346  ret void
347}
348
349define void @fdiv_v32f16(ptr %a, ptr %b) #0 {
; No vscale_range: 256-bit SVE processes the two 16-lane halves with separate
; fdivs (note the reload into z1 rather than movprfx); >=512-bit uses one vl32 op.
350; VBITS_GE_256-LABEL: fdiv_v32f16:
351; VBITS_GE_256:       // %bb.0:
352; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
353; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
354; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
355; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
356; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1]
357; VBITS_GE_256-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
358; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
359; VBITS_GE_256-NEXT:    fdiv z1.h, p0/m, z1.h, z2.h
360; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
361; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
362; VBITS_GE_256-NEXT:    ret
363;
364; VBITS_GE_512-LABEL: fdiv_v32f16:
365; VBITS_GE_512:       // %bb.0:
366; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
367; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
368; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
369; VBITS_GE_512-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
370; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
371; VBITS_GE_512-NEXT:    ret
372  %op1 = load <32 x half>, ptr %a
373  %op2 = load <32 x half>, ptr %b
374  %res = fdiv <32 x half> %op1, %op2
375  store <32 x half> %res, ptr %a
376  ret void
377}
378
379define void @fdiv_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; 1024-bit-capable SVE: all 64 half lanes fit under one vl64 predicate.
380; CHECK-LABEL: fdiv_v64f16:
381; CHECK:       // %bb.0:
382; CHECK-NEXT:    ptrue p0.h, vl64
383; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
384; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
385; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
386; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
387; CHECK-NEXT:    ret
388  %op1 = load <64 x half>, ptr %a
389  %op2 = load <64 x half>, ptr %b
390  %res = fdiv <64 x half> %op1, %op2
391  store <64 x half> %res, ptr %a
392  ret void
393}
394
395define void @fdiv_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; 2048-bit-capable SVE: all 128 half lanes fit under one vl128 predicate.
396; CHECK-LABEL: fdiv_v128f16:
397; CHECK:       // %bb.0:
398; CHECK-NEXT:    ptrue p0.h, vl128
399; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
400; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
401; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
402; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
403; CHECK-NEXT:    ret
404  %op1 = load <128 x half>, ptr %a
405  %op2 = load <128 x half>, ptr %b
406  %res = fdiv <128 x half> %op1, %op2
407  store <128 x half> %res, ptr %a
408  ret void
409}
410
411; Don't use SVE for 64-bit vectors.
412define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; 64-bit vector: expect a single NEON fdiv on .2s lanes, no SVE predication.
413; CHECK-LABEL: fdiv_v2f32:
414; CHECK:       // %bb.0:
415; CHECK-NEXT:    fdiv v0.2s, v0.2s, v1.2s
416; CHECK-NEXT:    ret
417  %res = fdiv <2 x float> %op1, %op2
418  ret <2 x float> %res
419}
420
421; Don't use SVE for 128-bit vectors.
422define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; 128-bit vector: expect a single NEON fdiv on .4s lanes, no SVE predication.
423; CHECK-LABEL: fdiv_v4f32:
424; CHECK:       // %bb.0:
425; CHECK-NEXT:    fdiv v0.4s, v0.4s, v1.4s
426; CHECK-NEXT:    ret
427  %res = fdiv <4 x float> %op1, %op2
428  ret <4 x float> %res
429}
430
431define void @fdiv_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; 256-bit vector: one predicated SVE fdiv under a vl8 .s predicate.
432; CHECK-LABEL: fdiv_v8f32:
433; CHECK:       // %bb.0:
434; CHECK-NEXT:    ptrue p0.s, vl8
435; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
436; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
437; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
438; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
439; CHECK-NEXT:    ret
440  %op1 = load <8 x float>, ptr %a
441  %op2 = load <8 x float>, ptr %b
442  %res = fdiv <8 x float> %op1, %op2
443  store <8 x float> %res, ptr %a
444  ret void
445}
446
447define void @fdiv_v16f32(ptr %a, ptr %b) #0 {
; No vscale_range: 256-bit SVE processes the two 8-lane halves with separate
; fdivs (reload into z1, no movprfx); >=512-bit uses one vl16 op.
448; VBITS_GE_256-LABEL: fdiv_v16f32:
449; VBITS_GE_256:       // %bb.0:
450; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
451; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
452; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
453; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
454; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1]
455; VBITS_GE_256-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
456; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
457; VBITS_GE_256-NEXT:    fdiv z1.s, p0/m, z1.s, z2.s
458; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
459; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
460; VBITS_GE_256-NEXT:    ret
461;
462; VBITS_GE_512-LABEL: fdiv_v16f32:
463; VBITS_GE_512:       // %bb.0:
464; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
465; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
466; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
467; VBITS_GE_512-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
468; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
469; VBITS_GE_512-NEXT:    ret
470  %op1 = load <16 x float>, ptr %a
471  %op2 = load <16 x float>, ptr %b
472  %res = fdiv <16 x float> %op1, %op2
473  store <16 x float> %res, ptr %a
474  ret void
475}
476
477define void @fdiv_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; 1024-bit-capable SVE: all 32 float lanes fit under one vl32 predicate.
478; CHECK-LABEL: fdiv_v32f32:
479; CHECK:       // %bb.0:
480; CHECK-NEXT:    ptrue p0.s, vl32
481; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
482; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
483; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
484; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
485; CHECK-NEXT:    ret
486  %op1 = load <32 x float>, ptr %a
487  %op2 = load <32 x float>, ptr %b
488  %res = fdiv <32 x float> %op1, %op2
489  store <32 x float> %res, ptr %a
490  ret void
491}
492
493define void @fdiv_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; 2048-bit-capable SVE: all 64 float lanes fit under one vl64 predicate.
494; CHECK-LABEL: fdiv_v64f32:
495; CHECK:       // %bb.0:
496; CHECK-NEXT:    ptrue p0.s, vl64
497; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
498; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
499; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
500; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
501; CHECK-NEXT:    ret
502  %op1 = load <64 x float>, ptr %a
503  %op2 = load <64 x float>, ptr %b
504  %res = fdiv <64 x float> %op1, %op2
505  store <64 x float> %res, ptr %a
506  ret void
507}
508
509; Don't use SVE for 64-bit vectors.
510define <1 x double> @fdiv_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; Single-element vector lowers to a scalar fdiv on d registers.
511; CHECK-LABEL: fdiv_v1f64:
512; CHECK:       // %bb.0:
513; CHECK-NEXT:    fdiv d0, d0, d1
514; CHECK-NEXT:    ret
515  %res = fdiv <1 x double> %op1, %op2
516  ret <1 x double> %res
517}
518
519; Don't use SVE for 128-bit vectors.
520define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; 128-bit vector: expect a single NEON fdiv on .2d lanes, no SVE predication.
521; CHECK-LABEL: fdiv_v2f64:
522; CHECK:       // %bb.0:
523; CHECK-NEXT:    fdiv v0.2d, v0.2d, v1.2d
524; CHECK-NEXT:    ret
525  %res = fdiv <2 x double> %op1, %op2
526  ret <2 x double> %res
527}
528
529define void @fdiv_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; 256-bit vector: one predicated SVE fdiv under a vl4 .d predicate.
530; CHECK-LABEL: fdiv_v4f64:
531; CHECK:       // %bb.0:
532; CHECK-NEXT:    ptrue p0.d, vl4
533; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
534; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
535; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
536; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
537; CHECK-NEXT:    ret
538  %op1 = load <4 x double>, ptr %a
539  %op2 = load <4 x double>, ptr %b
540  %res = fdiv <4 x double> %op1, %op2
541  store <4 x double> %res, ptr %a
542  ret void
543}
544
545define void @fdiv_v8f64(ptr %a, ptr %b) #0 {
; No vscale_range: 256-bit SVE processes the two 4-lane halves with separate
; fdivs (reload into z1, no movprfx); >=512-bit uses one vl8 op.
546; VBITS_GE_256-LABEL: fdiv_v8f64:
547; VBITS_GE_256:       // %bb.0:
548; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
549; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
550; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
551; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
552; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
553; VBITS_GE_256-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
554; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
555; VBITS_GE_256-NEXT:    fdiv z1.d, p0/m, z1.d, z2.d
556; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
557; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
558; VBITS_GE_256-NEXT:    ret
559;
560; VBITS_GE_512-LABEL: fdiv_v8f64:
561; VBITS_GE_512:       // %bb.0:
562; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
563; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
564; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
565; VBITS_GE_512-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
566; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
567; VBITS_GE_512-NEXT:    ret
568  %op1 = load <8 x double>, ptr %a
569  %op2 = load <8 x double>, ptr %b
570  %res = fdiv <8 x double> %op1, %op2
571  store <8 x double> %res, ptr %a
572  ret void
573}
574
575define void @fdiv_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; 1024-bit-capable SVE: all 16 double lanes fit under one vl16 predicate.
576; CHECK-LABEL: fdiv_v16f64:
577; CHECK:       // %bb.0:
578; CHECK-NEXT:    ptrue p0.d, vl16
579; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
580; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
581; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
582; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
583; CHECK-NEXT:    ret
584  %op1 = load <16 x double>, ptr %a
585  %op2 = load <16 x double>, ptr %b
586  %res = fdiv <16 x double> %op1, %op2
587  store <16 x double> %res, ptr %a
588  ret void
589}
590
591define void @fdiv_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; 2048-bit-capable SVE: all 32 double lanes fit under one vl32 predicate.
592; CHECK-LABEL: fdiv_v32f64:
593; CHECK:       // %bb.0:
594; CHECK-NEXT:    ptrue p0.d, vl32
595; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
596; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
597; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
598; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
599; CHECK-NEXT:    ret
600  %op1 = load <32 x double>, ptr %a
601  %op2 = load <32 x double>, ptr %b
602  %res = fdiv <32 x double> %op1, %op2
603  store <32 x double> %res, ptr %a
604  ret void
605}
606
607;
608; FMA
609;
610
611; Don't use SVE for 64-bit vectors.
612define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) vscale_range(2,0) #0 {
; 64-bit vector: NEON fmla accumulates into v2, then the result is moved to v0.
613; CHECK-LABEL: fma_v4f16:
614; CHECK:       // %bb.0:
615; CHECK-NEXT:    fmla v2.4h, v1.4h, v0.4h
616; CHECK-NEXT:    fmov d0, d2
617; CHECK-NEXT:    ret
618  %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
619  ret <4 x half> %res
620}
621
622; Don't use SVE for 128-bit vectors.
623define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) vscale_range(2,0) #0 {
; 128-bit vector: NEON fmla accumulates into v2, then the result is moved to v0.
624; CHECK-LABEL: fma_v8f16:
625; CHECK:       // %bb.0:
626; CHECK-NEXT:    fmla v2.8h, v1.8h, v0.8h
627; CHECK-NEXT:    mov v0.16b, v2.16b
628; CHECK-NEXT:    ret
629  %res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
630  ret <8 x half> %res
631}
632
633define void @fma_v16f16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; 256-bit vector: a single predicated SVE fmad (z0 = z0*z1 + z2) under vl16.
634; CHECK-LABEL: fma_v16f16:
635; CHECK:       // %bb.0:
636; CHECK-NEXT:    ptrue p0.h, vl16
637; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
638; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
639; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
640; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
641; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
642; CHECK-NEXT:    ret
643  %op1 = load <16 x half>, ptr %a
644  %op2 = load <16 x half>, ptr %b
645  %op3 = load <16 x half>, ptr %c
646  %res = call <16 x half> @llvm.fma.v16f16(<16 x half> %op1, <16 x half> %op2, <16 x half> %op3)
647  store <16 x half> %res, ptr %a
648  ret void
649}
650
651define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 {
; No vscale_range: 256-bit SVE does the two halves with fmad and a
; movprfx+fmla pair; >=512-bit uses a single vl32 fmad.
652; VBITS_GE_256-LABEL: fma_v32f16:
653; VBITS_GE_256:       // %bb.0:
654; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
655; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
656; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
657; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
658; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x2, x8, lsl #1]
659; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0]
660; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x1]
661; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x2]
662; VBITS_GE_256-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
663; VBITS_GE_256-NEXT:    movprfx z1, z5
664; VBITS_GE_256-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
665; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
666; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
667; VBITS_GE_256-NEXT:    ret
668;
669; VBITS_GE_512-LABEL: fma_v32f16:
670; VBITS_GE_512:       // %bb.0:
671; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
672; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
673; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
674; VBITS_GE_512-NEXT:    ld1h { z2.h }, p0/z, [x2]
675; VBITS_GE_512-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
676; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
677; VBITS_GE_512-NEXT:    ret
678  %op1 = load <32 x half>, ptr %a
679  %op2 = load <32 x half>, ptr %b
680  %op3 = load <32 x half>, ptr %c
681  %res = call <32 x half> @llvm.fma.v32f16(<32 x half> %op1, <32 x half> %op2, <32 x half> %op3)
682  store <32 x half> %res, ptr %a
683  ret void
684}
685
686define void @fma_v64f16(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
; 1024-bit-capable SVE: one vl64 fmad covers all 64 half lanes.
687; CHECK-LABEL: fma_v64f16:
688; CHECK:       // %bb.0:
689; CHECK-NEXT:    ptrue p0.h, vl64
690; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
691; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
692; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
693; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
694; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
695; CHECK-NEXT:    ret
696  %op1 = load <64 x half>, ptr %a
697  %op2 = load <64 x half>, ptr %b
698  %op3 = load <64 x half>, ptr %c
699  %res = call <64 x half> @llvm.fma.v64f16(<64 x half> %op1, <64 x half> %op2, <64 x half> %op3)
700  store <64 x half> %res, ptr %a
701  ret void
702}
703
704define void @fma_v128f16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
; 2048-bit-capable SVE: one vl128 fmad covers all 128 half lanes.
705; CHECK-LABEL: fma_v128f16:
706; CHECK:       // %bb.0:
707; CHECK-NEXT:    ptrue p0.h, vl128
708; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
709; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
710; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
711; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
712; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
713; CHECK-NEXT:    ret
714  %op1 = load <128 x half>, ptr %a
715  %op2 = load <128 x half>, ptr %b
716  %op3 = load <128 x half>, ptr %c
717  %res = call <128 x half> @llvm.fma.v128f16(<128 x half> %op1, <128 x half> %op2, <128 x half> %op3)
718  store <128 x half> %res, ptr %a
719  ret void
720}
721
722; Don't use SVE for 64-bit vectors.
723define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) vscale_range(2,0) #0 {
; 64-bit vector: NEON fmla accumulates into v2, then the result is moved to v0.
724; CHECK-LABEL: fma_v2f32:
725; CHECK:       // %bb.0:
726; CHECK-NEXT:    fmla v2.2s, v1.2s, v0.2s
727; CHECK-NEXT:    fmov d0, d2
728; CHECK-NEXT:    ret
729  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3)
730  ret <2 x float> %res
731}
732
733; Don't use SVE for 128-bit vectors.
734define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) vscale_range(2,0) #0 {
; 128-bit vector: NEON fmla accumulates into v2, then the result is moved to v0.
735; CHECK-LABEL: fma_v4f32:
736; CHECK:       // %bb.0:
737; CHECK-NEXT:    fmla v2.4s, v1.4s, v0.4s
738; CHECK-NEXT:    mov v0.16b, v2.16b
739; CHECK-NEXT:    ret
740  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3)
741  ret <4 x float> %res
742}
743
744define void @fma_v8f32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; 256-bit vector: a single predicated SVE fmad (z0 = z0*z1 + z2) under vl8.
745; CHECK-LABEL: fma_v8f32:
746; CHECK:       // %bb.0:
747; CHECK-NEXT:    ptrue p0.s, vl8
748; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
749; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
750; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
751; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
752; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
753; CHECK-NEXT:    ret
754  %op1 = load <8 x float>, ptr %a
755  %op2 = load <8 x float>, ptr %b
756  %op3 = load <8 x float>, ptr %c
757  %res = call <8 x float> @llvm.fma.v8f32(<8 x float> %op1, <8 x float> %op2, <8 x float> %op3)
758  store <8 x float> %res, ptr %a
759  ret void
760}
761
762define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 {
; No vscale_range: 256-bit SVE does the two halves with fmad and a
; movprfx+fmla pair; >=512-bit uses a single vl16 fmad.
763; VBITS_GE_256-LABEL: fma_v16f32:
764; VBITS_GE_256:       // %bb.0:
765; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
766; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
767; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
768; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
769; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x2, x8, lsl #2]
770; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
771; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
772; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x2]
773; VBITS_GE_256-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
774; VBITS_GE_256-NEXT:    movprfx z1, z5
775; VBITS_GE_256-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
776; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
777; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
778; VBITS_GE_256-NEXT:    ret
779;
780; VBITS_GE_512-LABEL: fma_v16f32:
781; VBITS_GE_512:       // %bb.0:
782; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
783; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
784; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
785; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x2]
786; VBITS_GE_512-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
787; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
788; VBITS_GE_512-NEXT:    ret
789  %op1 = load <16 x float>, ptr %a
790  %op2 = load <16 x float>, ptr %b
791  %op3 = load <16 x float>, ptr %c
792  %res = call <16 x float> @llvm.fma.v16f32(<16 x float> %op1, <16 x float> %op2, <16 x float> %op3)
793  store <16 x float> %res, ptr %a
794  ret void
795}
796
797define void @fma_v32f32(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
; 1024-bit-capable SVE: one vl32 fmad covers all 32 float lanes.
798; CHECK-LABEL: fma_v32f32:
799; CHECK:       // %bb.0:
800; CHECK-NEXT:    ptrue p0.s, vl32
801; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
802; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
803; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
804; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
805; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
806; CHECK-NEXT:    ret
807  %op1 = load <32 x float>, ptr %a
808  %op2 = load <32 x float>, ptr %b
809  %op3 = load <32 x float>, ptr %c
810  %res = call <32 x float> @llvm.fma.v32f32(<32 x float> %op1, <32 x float> %op2, <32 x float> %op3)
811  store <32 x float> %res, ptr %a
812  ret void
813}
814
815define void @fma_v64f32(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
; 2048-bit-capable SVE: one vl64 fmad covers all 64 float lanes.
816; CHECK-LABEL: fma_v64f32:
817; CHECK:       // %bb.0:
818; CHECK-NEXT:    ptrue p0.s, vl64
819; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
820; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
821; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
822; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
823; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
824; CHECK-NEXT:    ret
825  %op1 = load <64 x float>, ptr %a
826  %op2 = load <64 x float>, ptr %b
827  %op3 = load <64 x float>, ptr %c
828  %res = call <64 x float> @llvm.fma.v64f32(<64 x float> %op1, <64 x float> %op2, <64 x float> %op3)
829  store <64 x float> %res, ptr %a
830  ret void
831}
832
833; Don't use SVE for 64-bit vectors.
834define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) vscale_range(2,0) #0 {
; Single-element vector lowers to a scalar fmadd on d registers.
835; CHECK-LABEL: fma_v1f64:
836; CHECK:       // %bb.0:
837; CHECK-NEXT:    fmadd d0, d0, d1, d2
838; CHECK-NEXT:    ret
839  %res = call <1 x double> @llvm.fma.v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3)
840  ret <1 x double> %res
841}
842
843; Don't use SVE for 128-bit vectors.
844define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) vscale_range(2,0) #0 {
; 128-bit vector: NEON fmla accumulates into v2, then the result is moved to v0.
845; CHECK-LABEL: fma_v2f64:
846; CHECK:       // %bb.0:
847; CHECK-NEXT:    fmla v2.2d, v1.2d, v0.2d
848; CHECK-NEXT:    mov v0.16b, v2.16b
849; CHECK-NEXT:    ret
850  %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3)
851  ret <2 x double> %res
852}
853
854define void @fma_v4f64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; 256-bit vector: a single predicated SVE fmad (z0 = z0*z1 + z2) under vl4.
855; CHECK-LABEL: fma_v4f64:
856; CHECK:       // %bb.0:
857; CHECK-NEXT:    ptrue p0.d, vl4
858; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
859; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
860; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
861; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
862; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
863; CHECK-NEXT:    ret
864  %op1 = load <4 x double>, ptr %a
865  %op2 = load <4 x double>, ptr %b
866  %op3 = load <4 x double>, ptr %c
867  %res = call <4 x double> @llvm.fma.v4f64(<4 x double> %op1, <4 x double> %op2, <4 x double> %op3)
868  store <4 x double> %res, ptr %a
869  ret void
870}
871
872define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 {
873; VBITS_GE_256-LABEL: fma_v8f64:
874; VBITS_GE_256:       // %bb.0:
875; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
876; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
877; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
878; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
879; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x2, x8, lsl #3]
880; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
881; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
882; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x2]
883; VBITS_GE_256-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
884; VBITS_GE_256-NEXT:    movprfx z1, z5
885; VBITS_GE_256-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
886; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
887; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
888; VBITS_GE_256-NEXT:    ret
889;
890; VBITS_GE_512-LABEL: fma_v8f64:
891; VBITS_GE_512:       // %bb.0:
892; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
893; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
894; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
895; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x2]
896; VBITS_GE_512-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
897; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
898; VBITS_GE_512-NEXT:    ret
899  %op1 = load <8 x double>, ptr %a
900  %op2 = load <8 x double>, ptr %b
901  %op3 = load <8 x double>, ptr %c
902  %res = call <8 x double> @llvm.fma.v8f64(<8 x double> %op1, <8 x double> %op2, <8 x double> %op3)
903  store <8 x double> %res, ptr %a
904  ret void
905}
906
907define void @fma_v16f64(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
908; CHECK-LABEL: fma_v16f64:
909; CHECK:       // %bb.0:
910; CHECK-NEXT:    ptrue p0.d, vl16
911; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
912; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
913; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
914; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
915; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
916; CHECK-NEXT:    ret
917  %op1 = load <16 x double>, ptr %a
918  %op2 = load <16 x double>, ptr %b
919  %op3 = load <16 x double>, ptr %c
920  %res = call <16 x double> @llvm.fma.v16f64(<16 x double> %op1, <16 x double> %op2, <16 x double> %op3)
921  store <16 x double> %res, ptr %a
922  ret void
923}
924
925define void @fma_v32f64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
926; CHECK-LABEL: fma_v32f64:
927; CHECK:       // %bb.0:
928; CHECK-NEXT:    ptrue p0.d, vl32
929; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
930; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
931; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
932; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
933; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
934; CHECK-NEXT:    ret
935  %op1 = load <32 x double>, ptr %a
936  %op2 = load <32 x double>, ptr %b
937  %op3 = load <32 x double>, ptr %c
938  %res = call <32 x double> @llvm.fma.v32f64(<32 x double> %op1, <32 x double> %op2, <32 x double> %op3)
939  store <32 x double> %res, ptr %a
940  ret void
941}
942
;
; FMUL
;
; Same structure as the FADD tests at the top of the file: NEON is used
; for vectors of 128 bits or fewer, a predicated SVE FMUL otherwise.

; Don't use SVE for 64-bit vectors.
define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmul v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = fmul <4 x half> %op1, %op2
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmul v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = fmul <8 x half> %op1, %op2
  ret <8 x half> %res
}

define void @fmul_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %op2 = load <16 x half>, ptr %b
  %res = fmul <16 x half> %op1, %op2
  store <16 x half> %res, ptr %a
  ret void
}

define void @fmul_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fmul_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fmul_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %op2 = load <32 x half>, ptr %b
  %res = fmul <32 x half> %op1, %op2
  store <32 x half> %res, ptr %a
  ret void
}

define void @fmul_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmul_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x half>, ptr %a
  %op2 = load <64 x half>, ptr %b
  %res = fmul <64 x half> %op1, %op2
  store <64 x half> %res, ptr %a
  ret void
}

define void @fmul_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmul_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x half>, ptr %a
  %op2 = load <128 x half>, ptr %b
  %res = fmul <128 x half> %op1, %op2
  store <128 x half> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = fmul <2 x float> %op1, %op2
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = fmul <4 x float> %op1, %op2
  ret <4 x float> %res
}

define void @fmul_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %op2 = load <8 x float>, ptr %b
  %res = fmul <8 x float> %op1, %op2
  store <8 x float> %res, ptr %a
  ret void
}

define void @fmul_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fmul_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fmul_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %op2 = load <16 x float>, ptr %b
  %res = fmul <16 x float> %op1, %op2
  store <16 x float> %res, ptr %a
  ret void
}

define void @fmul_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmul_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %op2 = load <32 x float>, ptr %b
  %res = fmul <32 x float> %op1, %op2
  store <32 x float> %res, ptr %a
  ret void
}

define void @fmul_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmul_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x float>, ptr %a
  %op2 = load <64 x float>, ptr %b
  %res = fmul <64 x float> %op1, %op2
  store <64 x float> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @fmul_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmul d0, d0, d1
; CHECK-NEXT:    ret
  %res = fmul <1 x double> %op1, %op2
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = fmul <2 x double> %op1, %op2
  ret <2 x double> %res
}

define void @fmul_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %op2 = load <4 x double>, ptr %b
  %res = fmul <4 x double> %op1, %op2
  store <4 x double> %res, ptr %a
  ret void
}

define void @fmul_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fmul_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fmul z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fmul_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %op2 = load <8 x double>, ptr %b
  %res = fmul <8 x double> %op1, %op2
  store <8 x double> %res, ptr %a
  ret void
}

define void @fmul_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmul_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %op2 = load <16 x double>, ptr %b
  %res = fmul <16 x double> %op1, %op2
  store <16 x double> %res, ptr %a
  ret void
}

define void @fmul_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fmul_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %op2 = load <32 x double>, ptr %b
  %res = fmul <32 x double> %op1, %op2
  store <32 x double> %res, ptr %a
  ret void
}
1243
;
; FNEG
;
; Unary negation: a single input vector is loaded, negated in place with a
; predicated SVE FNEG (or a NEON FNEG for <=128-bit vectors), and stored back.

; Don't use SVE for 64-bit vectors.
define <4 x half> @fneg_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fneg v0.4h, v0.4h
; CHECK-NEXT:    ret
  %res = fneg <4 x half> %op
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fneg_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fneg v0.8h, v0.8h
; CHECK-NEXT:    ret
  %res = fneg <8 x half> %op
  ret <8 x half> %res
}

; Note: the %b parameter is declared but never used by this test.
define void @fneg_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x half>, ptr %a
  %res = fneg <16 x half> %op
  store <16 x half> %res, ptr %a
  ret void
}

define void @fneg_v32f16(ptr %a) #0 {
; VBITS_GE_256-LABEL: fneg_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fneg z0.h, p0/m, z0.h
; VBITS_GE_256-NEXT:    fneg z1.h, p0/m, z1.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fneg_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fneg z0.h, p0/m, z0.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x half>, ptr %a
  %res = fneg <32 x half> %op
  store <32 x half> %res, ptr %a
  ret void
}

define void @fneg_v64f16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fneg_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <64 x half>, ptr %a
  %res = fneg <64 x half> %op
  store <64 x half> %res, ptr %a
  ret void
}

define void @fneg_v128f16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fneg_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <128 x half>, ptr %a
  %res = fneg <128 x half> %op
  store <128 x half> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @fneg_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fneg v0.2s, v0.2s
; CHECK-NEXT:    ret
  %res = fneg <2 x float> %op
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fneg_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fneg v0.4s, v0.4s
; CHECK-NEXT:    ret
  %res = fneg <4 x float> %op
  ret <4 x float> %res
}

define void @fneg_v8f32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <8 x float>, ptr %a
  %res = fneg <8 x float> %op
  store <8 x float> %res, ptr %a
  ret void
}

define void @fneg_v16f32(ptr %a) #0 {
; VBITS_GE_256-LABEL: fneg_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fneg z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT:    fneg z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fneg_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fneg z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x float>, ptr %a
  %res = fneg <16 x float> %op
  store <16 x float> %res, ptr %a
  ret void
}

define void @fneg_v32f32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fneg_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x float>, ptr %a
  %res = fneg <32 x float> %op
  store <32 x float> %res, ptr %a
  ret void
}

define void @fneg_v64f32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fneg_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <64 x float>, ptr %a
  %res = fneg <64 x float> %op
  store <64 x float> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @fneg_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fneg d0, d0
; CHECK-NEXT:    ret
  %res = fneg <1 x double> %op
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fneg_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fneg v0.2d, v0.2d
; CHECK-NEXT:    ret
  %res = fneg <2 x double> %op
  ret <2 x double> %res
}

define void @fneg_v4f64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fneg_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fneg z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <4 x double>, ptr %a
  %res = fneg <4 x double> %op
  store <4 x double> %res, ptr %a
  ret void
}

define void @fneg_v8f64(ptr %a) #0 {
; VBITS_GE_256-LABEL: fneg_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fneg z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT:    fneg z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fneg_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fneg z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x double>, ptr %a
  %res = fneg <8 x double> %op
  store <8 x double> %res, ptr %a
  ret void
}

define void @fneg_v16f64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fneg_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fneg z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x double>, ptr %a
  %res = fneg <16 x double> %op
  store <16 x double> %res, ptr %a
  ret void
}

define void @fneg_v32f64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fneg_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fneg z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x double>, ptr %a
  %res = fneg <32 x double> %op
  store <32 x double> %res, ptr %a
  ret void
}
1511
1512;
1513; FSQRT
1514;
1515
1516; Don't use SVE for 64-bit vectors.
1517define <4 x half> @fsqrt_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
1518; CHECK-LABEL: fsqrt_v4f16:
1519; CHECK:       // %bb.0:
1520; CHECK-NEXT:    fsqrt v0.4h, v0.4h
1521; CHECK-NEXT:    ret
1522  %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op)
1523  ret <4 x half> %res
1524}
1525
1526; Don't use SVE for 128-bit vectors.
1527define <8 x half> @fsqrt_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
1528; CHECK-LABEL: fsqrt_v8f16:
1529; CHECK:       // %bb.0:
1530; CHECK-NEXT:    fsqrt v0.8h, v0.8h
1531; CHECK-NEXT:    ret
1532  %res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op)
1533  ret <8 x half> %res
1534}
1535
1536define void @fsqrt_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
1537; CHECK-LABEL: fsqrt_v16f16:
1538; CHECK:       // %bb.0:
1539; CHECK-NEXT:    ptrue p0.h, vl16
1540; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1541; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
1542; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
1543; CHECK-NEXT:    ret
1544  %op = load <16 x half>, ptr %a
1545  %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op)
1546  store <16 x half> %res, ptr %a
1547  ret void
1548}
1549
1550define void @fsqrt_v32f16(ptr %a) #0 {
1551; VBITS_GE_256-LABEL: fsqrt_v32f16:
1552; VBITS_GE_256:       // %bb.0:
1553; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
1554; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
1555; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1556; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
1557; VBITS_GE_256-NEXT:    fsqrt z0.h, p0/m, z0.h
1558; VBITS_GE_256-NEXT:    fsqrt z1.h, p0/m, z1.h
1559; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
1560; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
1561; VBITS_GE_256-NEXT:    ret
1562;
1563; VBITS_GE_512-LABEL: fsqrt_v32f16:
1564; VBITS_GE_512:       // %bb.0:
1565; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
1566; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
1567; VBITS_GE_512-NEXT:    fsqrt z0.h, p0/m, z0.h
1568; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
1569; VBITS_GE_512-NEXT:    ret
1570  %op = load <32 x half>, ptr %a
1571  %res = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %op)
1572  store <32 x half> %res, ptr %a
1573  ret void
1574}
1575
1576define void @fsqrt_v64f16(ptr %a) vscale_range(8,0) #0 {
1577; CHECK-LABEL: fsqrt_v64f16:
1578; CHECK:       // %bb.0:
1579; CHECK-NEXT:    ptrue p0.h, vl64
1580; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1581; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
1582; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
1583; CHECK-NEXT:    ret
1584  %op = load <64 x half>, ptr %a
1585  %res = call <64 x half> @llvm.sqrt.v64f16(<64 x half> %op)
1586  store <64 x half> %res, ptr %a
1587  ret void
1588}
1589
1590define void @fsqrt_v128f16(ptr %a) vscale_range(16,0) #0 {
1591; CHECK-LABEL: fsqrt_v128f16:
1592; CHECK:       // %bb.0:
1593; CHECK-NEXT:    ptrue p0.h, vl128
1594; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1595; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
1596; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
1597; CHECK-NEXT:    ret
1598  %op = load <128 x half>, ptr %a
1599  %res = call <128 x half> @llvm.sqrt.v128f16(<128 x half> %op)
1600  store <128 x half> %res, ptr %a
1601  ret void
1602}
1603
1604; Don't use SVE for 64-bit vectors.
1605define <2 x float> @fsqrt_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
1606; CHECK-LABEL: fsqrt_v2f32:
1607; CHECK:       // %bb.0:
1608; CHECK-NEXT:    fsqrt v0.2s, v0.2s
1609; CHECK-NEXT:    ret
1610  %res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op)
1611  ret <2 x float> %res
1612}
1613
1614; Don't use SVE for 128-bit vectors.
1615define <4 x float> @fsqrt_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
1616; CHECK-LABEL: fsqrt_v4f32:
1617; CHECK:       // %bb.0:
1618; CHECK-NEXT:    fsqrt v0.4s, v0.4s
1619; CHECK-NEXT:    ret
1620  %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op)
1621  ret <4 x float> %res
1622}
1623
1624define void @fsqrt_v8f32(ptr %a) vscale_range(2,0) #0 {
1625; CHECK-LABEL: fsqrt_v8f32:
1626; CHECK:       // %bb.0:
1627; CHECK-NEXT:    ptrue p0.s, vl8
1628; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1629; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
1630; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
1631; CHECK-NEXT:    ret
1632  %op = load <8 x float>, ptr %a
1633  %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op)
1634  store <8 x float> %res, ptr %a
1635  ret void
1636}
1637
1638define void @fsqrt_v16f32(ptr %a) #0 {
1639; VBITS_GE_256-LABEL: fsqrt_v16f32:
1640; VBITS_GE_256:       // %bb.0:
1641; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
1642; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
1643; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1644; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
1645; VBITS_GE_256-NEXT:    fsqrt z0.s, p0/m, z0.s
1646; VBITS_GE_256-NEXT:    fsqrt z1.s, p0/m, z1.s
1647; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
1648; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
1649; VBITS_GE_256-NEXT:    ret
1650;
1651; VBITS_GE_512-LABEL: fsqrt_v16f32:
1652; VBITS_GE_512:       // %bb.0:
1653; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
1654; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
1655; VBITS_GE_512-NEXT:    fsqrt z0.s, p0/m, z0.s
1656; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
1657; VBITS_GE_512-NEXT:    ret
1658  %op = load <16 x float>, ptr %a
1659  %res = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %op)
1660  store <16 x float> %res, ptr %a
1661  ret void
1662}
1663
1664define void @fsqrt_v32f32(ptr %a) vscale_range(8,0) #0 {
1665; CHECK-LABEL: fsqrt_v32f32:
1666; CHECK:       // %bb.0:
1667; CHECK-NEXT:    ptrue p0.s, vl32
1668; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1669; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
1670; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
1671; CHECK-NEXT:    ret
1672  %op = load <32 x float>, ptr %a
1673  %res = call <32 x float> @llvm.sqrt.v32f32(<32 x float> %op)
1674  store <32 x float> %res, ptr %a
1675  ret void
1676}
1677
1678define void @fsqrt_v64f32(ptr %a) vscale_range(16,0) #0 {
1679; CHECK-LABEL: fsqrt_v64f32:
1680; CHECK:       // %bb.0:
1681; CHECK-NEXT:    ptrue p0.s, vl64
1682; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1683; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
1684; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
1685; CHECK-NEXT:    ret
1686  %op = load <64 x float>, ptr %a
1687  %res = call <64 x float> @llvm.sqrt.v64f32(<64 x float> %op)
1688  store <64 x float> %res, ptr %a
1689  ret void
1690}
1691
1692; Don't use SVE for 64-bit vectors.
1693define <1 x double> @fsqrt_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
1694; CHECK-LABEL: fsqrt_v1f64:
1695; CHECK:       // %bb.0:
1696; CHECK-NEXT:    fsqrt d0, d0
1697; CHECK-NEXT:    ret
1698  %res = call <1 x double> @llvm.sqrt.v1f64(<1 x double> %op)
1699  ret <1 x double> %res
1700}
1701
1702; Don't use SVE for 128-bit vectors.
1703define <2 x double> @fsqrt_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
1704; CHECK-LABEL: fsqrt_v2f64:
1705; CHECK:       // %bb.0:
1706; CHECK-NEXT:    fsqrt v0.2d, v0.2d
1707; CHECK-NEXT:    ret
1708  %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op)
1709  ret <2 x double> %res
1710}
1711
1712define void @fsqrt_v4f64(ptr %a) vscale_range(2,0) #0 {
1713; CHECK-LABEL: fsqrt_v4f64:
1714; CHECK:       // %bb.0:
1715; CHECK-NEXT:    ptrue p0.d, vl4
1716; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1717; CHECK-NEXT:    fsqrt z0.d, p0/m, z0.d
1718; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
1719; CHECK-NEXT:    ret
1720  %op = load <4 x double>, ptr %a
1721  %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op)
1722  store <4 x double> %res, ptr %a
1723  ret void
1724}

define void @fsqrt_v8f64(ptr %a) #0 {
; VBITS_GE_256-LABEL: fsqrt_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fsqrt z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT:    fsqrt z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fsqrt_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fsqrt z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x double>, ptr %a
  %res = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %op)
  store <8 x double> %res, ptr %a
  ret void
}

define void @fsqrt_v16f64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fsqrt_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fsqrt z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x double>, ptr %a
  %res = call <16 x double> @llvm.sqrt.v16f64(<16 x double> %op)
  store <16 x double> %res, ptr %a
  ret void
}

define void @fsqrt_v32f64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fsqrt_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fsqrt z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x double>, ptr %a
  %res = call <32 x double> @llvm.sqrt.v32f64(<32 x double> %op)
  store <32 x double> %res, ptr %a
  ret void
}

;
; FSUB
;

; Don't use SVE for 64-bit vectors.
define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fsub v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = fsub <4 x half> %op1, %op2
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fsub v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = fsub <8 x half> %op1, %op2
  ret <8 x half> %res
}

define void @fsub_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %op2 = load <16 x half>, ptr %b
  %res = fsub <16 x half> %op1, %op2
  store <16 x half> %res, ptr %a
  ret void
}

define void @fsub_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fsub_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fsub z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fsub_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %op2 = load <32 x half>, ptr %b
  %res = fsub <32 x half> %op1, %op2
  store <32 x half> %res, ptr %a
  ret void
}

define void @fsub_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fsub_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x half>, ptr %a
  %op2 = load <64 x half>, ptr %b
  %res = fsub <64 x half> %op1, %op2
  store <64 x half> %res, ptr %a
  ret void
}

define void @fsub_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fsub_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x half>, ptr %a
  %op2 = load <128 x half>, ptr %b
  %res = fsub <128 x half> %op1, %op2
  store <128 x half> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fsub v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = fsub <2 x float> %op1, %op2
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fsub v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = fsub <4 x float> %op1, %op2
  ret <4 x float> %res
}

define void @fsub_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %op2 = load <8 x float>, ptr %b
  %res = fsub <8 x float> %op1, %op2
  store <8 x float> %res, ptr %a
  ret void
}

define void @fsub_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fsub_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fsub z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fsub_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %op2 = load <16 x float>, ptr %b
  %res = fsub <16 x float> %op1, %op2
  store <16 x float> %res, ptr %a
  ret void
}

define void @fsub_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fsub_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %op2 = load <32 x float>, ptr %b
  %res = fsub <32 x float> %op1, %op2
  store <32 x float> %res, ptr %a
  ret void
}

define void @fsub_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fsub_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x float>, ptr %a
  %op2 = load <64 x float>, ptr %b
  %res = fsub <64 x float> %op1, %op2
  store <64 x float> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @fsub_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fsub d0, d0, d1
; CHECK-NEXT:    ret
  %res = fsub <1 x double> %op1, %op2
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fsub v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = fsub <2 x double> %op1, %op2
  ret <2 x double> %res
}

define void @fsub_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %op2 = load <4 x double>, ptr %b
  %res = fsub <4 x double> %op1, %op2
  store <4 x double> %res, ptr %a
  ret void
}

define void @fsub_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fsub_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fsub z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fsub_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %op2 = load <8 x double>, ptr %b
  %res = fsub <8 x double> %op1, %op2
  store <8 x double> %res, ptr %a
  ret void
}

define void @fsub_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fsub_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %op2 = load <16 x double>, ptr %b
  %res = fsub <16 x double> %op1, %op2
  store <16 x double> %res, ptr %a
  ret void
}

define void @fsub_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fsub_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %op2 = load <32 x double>, ptr %b
  %res = fsub <32 x double> %op1, %op2
  store <32 x double> %res, ptr %a
  ret void
}

;
; FABS
;

; Don't use SVE for 64-bit vectors.
define <4 x half> @fabs_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabs v0.4h, v0.4h
; CHECK-NEXT:    ret
  %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op)
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fabs_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabs v0.8h, v0.8h
; CHECK-NEXT:    ret
  %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op)
  ret <8 x half> %res
}

define void @fabs_v16f16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x half>, ptr %a
  %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op)
  store <16 x half> %res, ptr %a
  ret void
}

define void @fabs_v32f16(ptr %a) #0 {
; VBITS_GE_256-LABEL: fabs_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fabs z0.h, p0/m, z0.h
; VBITS_GE_256-NEXT:    fabs z1.h, p0/m, z1.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fabs_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fabs z0.h, p0/m, z0.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x half>, ptr %a
  %res = call <32 x half> @llvm.fabs.v32f16(<32 x half> %op)
  store <32 x half> %res, ptr %a
  ret void
}

define void @fabs_v64f16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fabs_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <64 x half>, ptr %a
  %res = call <64 x half> @llvm.fabs.v64f16(<64 x half> %op)
  store <64 x half> %res, ptr %a
  ret void
}

define void @fabs_v128f16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fabs_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <128 x half>, ptr %a
  %res = call <128 x half> @llvm.fabs.v128f16(<128 x half> %op)
  store <128 x half> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @fabs_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabs v0.2s, v0.2s
; CHECK-NEXT:    ret
  %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op)
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fabs_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabs v0.4s, v0.4s
; CHECK-NEXT:    ret
  %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op)
  ret <4 x float> %res
}

define void @fabs_v8f32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <8 x float>, ptr %a
  %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op)
  store <8 x float> %res, ptr %a
  ret void
}

define void @fabs_v16f32(ptr %a) #0 {
; VBITS_GE_256-LABEL: fabs_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fabs z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT:    fabs z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fabs_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fabs z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x float>, ptr %a
  %res = call <16 x float> @llvm.fabs.v16f32(<16 x float> %op)
  store <16 x float> %res, ptr %a
  ret void
}

define void @fabs_v32f32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fabs_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x float>, ptr %a
  %res = call <32 x float> @llvm.fabs.v32f32(<32 x float> %op)
  store <32 x float> %res, ptr %a
  ret void
}

define void @fabs_v64f32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fabs_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <64 x float>, ptr %a
  %res = call <64 x float> @llvm.fabs.v64f32(<64 x float> %op)
  store <64 x float> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @fabs_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabs d0, d0
; CHECK-NEXT:    ret
  %res = call <1 x double> @llvm.fabs.v1f64(<1 x double> %op)
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fabs_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabs v0.2d, v0.2d
; CHECK-NEXT:    ret
  %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op)
  ret <2 x double> %res
}

define void @fabs_v4f64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <4 x double>, ptr %a
  %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op)
  store <4 x double> %res, ptr %a
  ret void
}

define void @fabs_v8f64(ptr %a) #0 {
; VBITS_GE_256-LABEL: fabs_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fabs z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT:    fabs z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fabs_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fabs z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x double>, ptr %a
  %res = call <8 x double> @llvm.fabs.v8f64(<8 x double> %op)
  store <8 x double> %res, ptr %a
  ret void
}

define void @fabs_v16f64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fabs_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x double>, ptr %a
  %res = call <16 x double> @llvm.fabs.v16f64(<16 x double> %op)
  store <16 x double> %res, ptr %a
  ret void
}

define void @fabs_v32f64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fabs_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x double>, ptr %a
  %res = call <32 x double> @llvm.fabs.v32f64(<32 x double> %op)
  store <32 x double> %res, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }

declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>)
declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)
declare <64 x half> @llvm.fma.v64f16(<64 x half>, <64 x half>, <64 x half>)
declare <128 x half> @llvm.fma.v128f16(<128 x half>, <128 x half>, <128 x half>)
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>)
declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)
declare <64 x float> @llvm.fma.v64f32(<64 x float>, <64 x float>, <64 x float>)
declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>)
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
declare <16 x double> @llvm.fma.v16f64(<16 x double>, <16 x double>, <16 x double>)
declare <32 x double> @llvm.fma.v32f64(<32 x double>, <32 x double>, <32 x double>)

declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)
declare <16 x half> @llvm.sqrt.v16f16(<16 x half>)
declare <32 x half> @llvm.sqrt.v32f16(<32 x half>)
declare <64 x half> @llvm.sqrt.v64f16(<64 x half>)
declare <128 x half> @llvm.sqrt.v128f16(<128 x half>)
declare <2 x float> @llvm.sqrt.v2f32(<2 x float>)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
declare <32 x float> @llvm.sqrt.v32f32(<32 x float>)
declare <64 x float> @llvm.sqrt.v64f32(<64 x float>)
declare <1 x double> @llvm.sqrt.v1f64(<1 x double>)
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
declare <16 x double> @llvm.sqrt.v16f64(<16 x double>)
declare <32 x double> @llvm.sqrt.v32f64(<32 x double>)

declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
declare <16 x half> @llvm.fabs.v16f16(<16 x half>)
declare <32 x half> @llvm.fabs.v32f16(<32 x half>)
declare <64 x half> @llvm.fabs.v64f16(<64 x half>)
declare <128 x half> @llvm.fabs.v128f16(<128 x half>)
declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
declare <16 x float> @llvm.fabs.v16f32(<16 x float>)
declare <32 x float> @llvm.fabs.v32f32(<32 x float>)
declare <64 x float> @llvm.fabs.v64f32(<64 x float>)
declare <1 x double> @llvm.fabs.v1f64(<1 x double>)
declare <2 x double> @llvm.fabs.v2f64(<2 x double>)
declare <4 x double> @llvm.fabs.v4f64(<4 x double>)
declare <8 x double> @llvm.fabs.v8f64(<8 x double>)
declare <16 x double> @llvm.fabs.v16f64(<16 x double>)
declare <32 x double> @llvm.fabs.v32f64(<32 x double>)
