; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -O3 -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -O3 -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
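; The 2048-bit run reuses the VBITS_GE_512 prefixes: every fixed-length type
; below already fits in a single register at 512 bits, so a larger minimum
; vector length is expected to produce identical code.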

target triple = "aarch64-unknown-linux-gnu"

;
; FMA
;

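; Each test multiplies and adds with the contract fast-math flag, which
; permits the backend to fuse the fmul/fadd pair into a single multiply-add
; (fmla/fmad for vectors, fmadd for scalars).
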
; Don't use SVE for 64-bit vectors.
define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.4h, v0.4h, v1.4h
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
  %mul = fmul contract <4 x half> %op1, %op2
  %res = fadd contract <4 x half> %mul, %op3
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.8h, v0.8h, v1.8h
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %mul = fmul contract <8 x half> %op1, %op2
  %res = fadd contract <8 x half> %mul, %op3
  ret <8 x half> %res
}

define void @fma_v16f16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %op2 = load <16 x half>, ptr %b
  %op3 = load <16 x half>, ptr %c
  %mul = fmul contract <16 x half> %op1, %op2
  %res = fadd contract <16 x half> %mul, %op3
  store <16 x half> %res, ptr %a
  ret void
}

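; With VBITS_GE_256 the operation below is split into two halves; the high
; half is addressed via a register offset scaled by the element size
; (lsl #1 for halfwords).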
define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 {
; VBITS_GE_256-LABEL: fma_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x2]
; VBITS_GE_256-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; VBITS_GE_256-NEXT:    movprfx z1, z5
; VBITS_GE_256-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fma_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1h { z2.h }, p0/z, [x2]
; VBITS_GE_512-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %op2 = load <32 x half>, ptr %b
  %op3 = load <32 x half>, ptr %c
  %mul = fmul contract <32 x half> %op1, %op2
  %res = fadd contract <32 x half> %mul, %op3
  store <32 x half> %res, ptr %a
  ret void
}

define void @fma_v64f16(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x half>, ptr %a
  %op2 = load <64 x half>, ptr %b
  %op3 = load <64 x half>, ptr %c
  %mul = fmul contract <64 x half> %op1, %op2
  %res = fadd contract <64 x half> %mul, %op3
  store <64 x half> %res, ptr %a
  ret void
}

define void @fma_v128f16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x half>, ptr %a
  %op2 = load <128 x half>, ptr %b
  %op3 = load <128 x half>, ptr %c
  %mul = fmul contract <128 x half> %op1, %op2
  %res = fadd contract <128 x half> %mul, %op3
  store <128 x half> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.2s, v0.2s, v1.2s
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
  %mul = fmul contract <2 x float> %op1, %op2
  %res = fadd contract <2 x float> %mul, %op3
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %mul = fmul contract <4 x float> %op1, %op2
  %res = fadd contract <4 x float> %mul, %op3
  ret <4 x float> %res
}

define void @fma_v8f32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %op2 = load <8 x float>, ptr %b
  %op3 = load <8 x float>, ptr %c
  %mul = fmul contract <8 x float> %op1, %op2
  %res = fadd contract <8 x float> %mul, %op3
  store <8 x float> %res, ptr %a
  ret void
}

define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 {
; VBITS_GE_256-LABEL: fma_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x2]
; VBITS_GE_256-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT:    movprfx z1, z5
; VBITS_GE_256-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fma_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x2]
; VBITS_GE_512-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %op2 = load <16 x float>, ptr %b
  %op3 = load <16 x float>, ptr %c
  %mul = fmul contract <16 x float> %op1, %op2
  %res = fadd contract <16 x float> %mul, %op3
  store <16 x float> %res, ptr %a
  ret void
}

define void @fma_v32f32(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %op2 = load <32 x float>, ptr %b
  %op3 = load <32 x float>, ptr %c
  %mul = fmul contract <32 x float> %op1, %op2
  %res = fadd contract <32 x float> %mul, %op3
  store <32 x float> %res, ptr %a
  ret void
}

define void @fma_v64f32(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x float>, ptr %a
  %op2 = load <64 x float>, ptr %b
  %op3 = load <64 x float>, ptr %c
  %mul = fmul contract <64 x float> %op1, %op2
  %res = fadd contract <64 x float> %mul, %op3
  store <64 x float> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
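; Note that the single-element case is scalarised to an fmadd rather than
; using a NEON fmla.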
define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmadd d0, d0, d1, d2
; CHECK-NEXT:    ret
  %mul = fmul contract <1 x double> %op1, %op2
  %res = fadd contract <1 x double> %mul, %op3
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.2d, v0.2d, v1.2d
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %mul = fmul contract <2 x double> %op1, %op2
  %res = fadd contract <2 x double> %mul, %op3
  ret <2 x double> %res
}

define void @fma_v4f64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %op2 = load <4 x double>, ptr %b
  %op3 = load <4 x double>, ptr %c
  %mul = fmul contract <4 x double> %op1, %op2
  %res = fadd contract <4 x double> %mul, %op3
  store <4 x double> %res, ptr %a
  ret void
}

define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 {
; VBITS_GE_256-LABEL: fma_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x2]
; VBITS_GE_256-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT:    movprfx z1, z5
; VBITS_GE_256-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fma_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x2]
; VBITS_GE_512-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %op2 = load <8 x double>, ptr %b
  %op3 = load <8 x double>, ptr %c
  %mul = fmul contract <8 x double> %op1, %op2
  %res = fadd contract <8 x double> %mul, %op3
  store <8 x double> %res, ptr %a
  ret void
}

define void @fma_v16f64(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %op2 = load <16 x double>, ptr %b
  %op3 = load <16 x double>, ptr %c
  %mul = fmul contract <16 x double> %op1, %op2
  %res = fadd contract <16 x double> %mul, %op3
  store <16 x double> %res, ptr %a
  ret void
}

define void @fma_v32f64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %op2 = load <32 x double>, ptr %b
  %op3 = load <32 x double>, ptr %c
  %mul = fmul contract <32 x double> %op1, %op2
  %res = fadd contract <32 x double> %mul, %op3
  store <32 x double> %res, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }