xref: /llvm-project/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll (revision 6e1ea7e5a7b6e581bf9a030b98a7f63ee2833278)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
3
4; This file tests the following combinations related to streaming-enabled functions:
5; [ ] N  ->  S    (Normal -> Streaming)
6; [ ] S  ->  N    (Streaming -> Normal)
7; [ ] S  ->  S    (Streaming -> Streaming)
8; [ ] S  ->  SC   (Streaming -> Streaming-compatible)
9;
10; The following combination is tested in sme-streaming-compatible-interface.ll
11; [ ] SC ->  S    (Streaming-compatible -> Streaming)
12
; External callees used by the tests below. They differ only in their
; streaming-mode attribute: none (normal), "aarch64_pstate_sm_enabled"
; (streaming) and "aarch64_pstate_sm_compatible" (streaming-compatible).
13declare void @normal_callee()
14declare void @streaming_callee() "aarch64_pstate_sm_enabled"
15declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"
16
17; [x] N  ->  S
18; [ ] S  ->  N
19; [ ] S  ->  S
20; [ ] S  ->  SC
; The caller has no streaming attribute and calls @streaming_callee, which is
; declared "aarch64_pstate_sm_enabled". The expected output spills d8-d15 and
; x30 (with the cntd result in x9 stored next to x30) and brackets the call
; with "smstart sm" before and "smstop sm" after.
21define void @normal_caller_streaming_callee() nounwind {
22; CHECK-LABEL: normal_caller_streaming_callee:
23; CHECK:       // %bb.0:
24; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
25; CHECK-NEXT:    cntd x9
26; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
27; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
28; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
29; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
30; CHECK-NEXT:    smstart sm
31; CHECK-NEXT:    bl streaming_callee
32; CHECK-NEXT:    smstop sm
33; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
34; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
35; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
36; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
37; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
38; CHECK-NEXT:    ret
39  call void @streaming_callee()
40  ret void;
41}
42
43; [ ] N  ->  S
44; [x] S  ->  N
45; [ ] S  ->  S
46; [ ] S  ->  SC
; The caller is "aarch64_pstate_sm_enabled" and calls @normal_callee, which has
; no streaming attribute. The expected output is the mirror image of the
; N -> S case: "smstop sm" before the call and "smstart sm" after it, with
; d8-d15, x30 and the cntd result in x9 spilled around the sequence.
47define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enabled" {
48; CHECK-LABEL: streaming_caller_normal_callee:
49; CHECK:       // %bb.0:
50; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
51; CHECK-NEXT:    cntd x9
52; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
53; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
54; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
55; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
56; CHECK-NEXT:    smstop sm
57; CHECK-NEXT:    bl normal_callee
58; CHECK-NEXT:    smstart sm
59; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
60; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
61; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
62; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
63; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
64; CHECK-NEXT:    ret
65  call void @normal_callee()
66  ret void;
67}
68
69; [ ] N  ->  S
70; [ ] S  ->  N
71; [x] S  ->  S
72; [ ] S  ->  SC
; Caller and callee are both "aarch64_pstate_sm_enabled". The expected output
; contains no smstart/smstop and no FP register spills: only x30 is saved
; around a plain "bl".
73define void @streaming_caller_streaming_callee() nounwind "aarch64_pstate_sm_enabled" {
74; CHECK-LABEL: streaming_caller_streaming_callee:
75; CHECK:       // %bb.0:
76; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
77; CHECK-NEXT:    bl streaming_callee
78; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
79; CHECK-NEXT:    ret
80  call void @streaming_callee()
81  ret void;
82}
83
84; [ ] N  ->  S
85; [ ] S  ->  N
86; [ ] S  ->  S
87; [x] S  ->  SC
; The caller is "aarch64_pstate_sm_enabled" and the callee is
; "aarch64_pstate_sm_compatible". As in the S -> S case, the expected output
; has no smstart/smstop: only x30 is saved around a plain "bl".
88define void @streaming_caller_streaming_compatible_callee() nounwind "aarch64_pstate_sm_enabled" {
89; CHECK-LABEL: streaming_caller_streaming_compatible_callee:
90; CHECK:       // %bb.0:
91; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
92; CHECK-NEXT:    bl streaming_compatible_callee
93; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
94; CHECK-NEXT:    ret
95  call void @streaming_compatible_callee()
96  ret void;
97}
98
99;
100; Handle special cases here.
101;
102
103; Call through a function pointer (streaming attribute on the call site)
; The caller has no streaming attribute; the indirect call through %p carries
; "aarch64_pstate_sm_enabled" on the call instruction itself. The expected
; output matches the direct N -> S case, with "blr x0" between "smstart sm"
; and "smstop sm".
104define void @call_to_function_pointer_streaming_enabled(ptr %p) nounwind {
105; CHECK-LABEL: call_to_function_pointer_streaming_enabled:
106; CHECK:       // %bb.0:
107; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
108; CHECK-NEXT:    cntd x9
109; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
110; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
111; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
112; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
113; CHECK-NEXT:    smstart sm
114; CHECK-NEXT:    blr x0
115; CHECK-NEXT:    smstop sm
116; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
117; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
118; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
119; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
120; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
121; CHECK-NEXT:    ret
122  call void %p() "aarch64_pstate_sm_enabled"
123  ret void
124}
125
126; Ensure NEON registers are preserved correctly.
; %x is a <4 x i32> argument that is returned unchanged after a call to the
; streaming callee. The expected output stores q0 to the stack before
; "smstart sm" and reloads it after "smstop sm", in addition to the d8-d15,
; x30 and x9 spills seen in the plain N -> S case.
127define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind {
128; CHECK-LABEL: smstart_clobber_simdfp:
129; CHECK:       // %bb.0:
130; CHECK-NEXT:    sub sp, sp, #96
131; CHECK-NEXT:    cntd x9
132; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
133; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
134; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
135; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
136; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
137; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
138; CHECK-NEXT:    smstart sm
139; CHECK-NEXT:    bl streaming_callee
140; CHECK-NEXT:    smstop sm
141; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
142; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
143; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
144; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
145; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
146; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
147; CHECK-NEXT:    add sp, sp, #96
148; CHECK-NEXT:    ret
149  call void @streaming_callee()
150  ret <4 x i32> %x;
151}
152
153; Ensure SVE registers are preserved correctly.
; %x is a scalable <vscale x 4 x i32> argument that is returned unchanged after
; a call to the streaming callee. The expected output:
;   - saves x29/x30 and the cntd result (x9),
;   - reserves 18 vector-length-sized slots (addvl #-18) and spills p4-p15 and
;     z8-z23 into them,
;   - reserves one more slot (addvl #-1) and spills z0 (the live value of %x)
;     before "smstart sm", reloading it after "smstop sm",
;   - restores everything in reverse order.
154define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
155; CHECK-LABEL: smstart_clobber_sve:
156; CHECK:       // %bb.0:
157; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
158; CHECK-NEXT:    cntd x9
159; CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
160; CHECK-NEXT:    addvl sp, sp, #-18
161; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
162; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
163; CHECK-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
164; CHECK-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
165; CHECK-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
166; CHECK-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
167; CHECK-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
168; CHECK-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
169; CHECK-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
170; CHECK-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
171; CHECK-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
172; CHECK-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
173; CHECK-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
174; CHECK-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
175; CHECK-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
176; CHECK-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
177; CHECK-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
178; CHECK-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
179; CHECK-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
180; CHECK-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
181; CHECK-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
182; CHECK-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
183; CHECK-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
184; CHECK-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
185; CHECK-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
186; CHECK-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
187; CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
188; CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
189; CHECK-NEXT:    addvl sp, sp, #-1
190; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
191; CHECK-NEXT:    smstart sm
192; CHECK-NEXT:    bl streaming_callee
193; CHECK-NEXT:    smstop sm
194; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
195; CHECK-NEXT:    addvl sp, sp, #1
196; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
197; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
198; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
199; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
200; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
201; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
202; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
203; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
204; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
205; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
206; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
207; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
208; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
209; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
210; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
211; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
212; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
213; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
214; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
215; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
216; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
217; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
218; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
219; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
220; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
221; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
222; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
223; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
224; CHECK-NEXT:    addvl sp, sp, #18
225; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
226; CHECK-NEXT:    ret
227  call void @streaming_callee()
228  ret <vscale x 4 x i32> %x;
229}
230
231; Call streaming callee twice; there should be no spills/fills between the two
232; calls since the registers should have already been clobbered.
; Same setup as @smstart_clobber_sve, but with two consecutive calls to the
; streaming callee. The expected output keeps a single "smstart sm" /
; "smstop sm" pair around both "bl" instructions, with z0 spilled once before
; the pair and reloaded once after it; nothing is emitted between the calls.
233define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x) nounwind {
234; CHECK-LABEL: smstart_clobber_sve_duplicate:
235; CHECK:       // %bb.0:
236; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
237; CHECK-NEXT:    cntd x9
238; CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
239; CHECK-NEXT:    addvl sp, sp, #-18
240; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
241; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
242; CHECK-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
243; CHECK-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
244; CHECK-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
245; CHECK-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
246; CHECK-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
247; CHECK-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
248; CHECK-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
249; CHECK-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
250; CHECK-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
251; CHECK-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
252; CHECK-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
253; CHECK-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
254; CHECK-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
255; CHECK-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
256; CHECK-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
257; CHECK-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
258; CHECK-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
259; CHECK-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
260; CHECK-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
261; CHECK-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
262; CHECK-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
263; CHECK-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
264; CHECK-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
265; CHECK-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
266; CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
267; CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
268; CHECK-NEXT:    addvl sp, sp, #-1
269; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
270; CHECK-NEXT:    smstart sm
271; CHECK-NEXT:    bl streaming_callee
272; CHECK-NEXT:    bl streaming_callee
273; CHECK-NEXT:    smstop sm
274; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
275; CHECK-NEXT:    addvl sp, sp, #1
276; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
277; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
278; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
279; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
280; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
281; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
282; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
283; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
284; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
285; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
286; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
287; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
288; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
289; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
290; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
291; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
292; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
293; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
294; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
295; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
296; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
297; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
298; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
299; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
300; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
301; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
302; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
303; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
304; CHECK-NEXT:    addvl sp, sp, #18
305; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
306; CHECK-NEXT:    ret
307  call void @streaming_callee()
308  call void @streaming_callee()
309  ret <vscale x 4 x i32> %x;
310}
311
312; Ensure the smstart after the call is not removed, because the call to llvm.cos is not part of a chain.
; A streaming ("aarch64_pstate_sm_enabled") caller invokes the llvm.cos.f64
; intrinsic, which the expected output shows as a "bl cos" library call. The
; call is bracketed by "smstop sm" before and "smstart sm" after. %x (d0) is
; stored to the stack before smstop and reloaded after it for the call; the
; call result is stored before smstart and reloaded together with %x for the
; final fadd.
313define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_enabled" {
314; CHECK-LABEL: call_to_intrinsic_without_chain:
315; CHECK:       // %bb.0: // %entry
316; CHECK-NEXT:    sub sp, sp, #96
317; CHECK-NEXT:    cntd x9
318; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
319; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
320; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
321; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
322; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
323; CHECK-NEXT:    stp d0, d0, [sp] // 16-byte Folded Spill
324; CHECK-NEXT:    smstop sm
325; CHECK-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
326; CHECK-NEXT:    bl cos
327; CHECK-NEXT:    str d0, [sp] // 8-byte Folded Spill
328; CHECK-NEXT:    smstart sm
329; CHECK-NEXT:    ldp d1, d0, [sp] // 16-byte Folded Reload
330; CHECK-NEXT:    fadd d0, d1, d0
331; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
332; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
333; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
334; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
335; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
336; CHECK-NEXT:    add sp, sp, #96
337; CHECK-NEXT:    ret
338entry:
339  %res = call fast double @llvm.cos.f64(double %x)
340  %res.fadd = fadd fast double %res, %x
341  ret double %res.fadd
342}
343
; Intrinsic called by @call_to_intrinsic_without_chain.
344declare double @llvm.cos.f64(double)
345
346; Ensure that tail call optimization is disabled when the streaming mode
347; doesn't match.
; The IR marks the call to the streaming callee as a "tail call" from a caller
; without a streaming attribute. The expected output is still a regular
; "bl" followed by "ret", bracketed by "smstart sm" / "smstop sm", rather than
; a tail-call branch.
348define void @disable_tailcallopt() nounwind {
349; CHECK-LABEL: disable_tailcallopt:
350; CHECK:       // %bb.0:
351; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
352; CHECK-NEXT:    cntd x9
353; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
354; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
355; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
356; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
357; CHECK-NEXT:    smstart sm
358; CHECK-NEXT:    bl streaming_callee
359; CHECK-NEXT:    smstop sm
360; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
361; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
362; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
363; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
364; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
365; CHECK-NEXT:    ret
366  tail call void @streaming_callee()
367  ret void;
368}
369
; A streaming caller (attribute group #0) passes the addresses of three
; scalable-vector stack objects, plus the result of llvm.aarch64.sme.cntsb, to
; @foo, which is declared without a streaming attribute. The expected output
; reserves three vector-length-sized slots (addvl #-3), sets up x0-x3
; (addvl/mov for the addresses, "rdsvl x3, #1" for the cntsb value) before
; "smstop sm", and calls foo between "smstop sm" and "smstart sm". After the
; call it loads %Data1 and returns its element 0.
370define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone %ptr) #0 {
371; CHECK-LABEL: call_to_non_streaming_pass_sve_objects:
372; CHECK:       // %bb.0: // %entry
373; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
374; CHECK-NEXT:    cntd x9
375; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
376; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
377; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
378; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
379; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
380; CHECK-NEXT:    addvl sp, sp, #-3
381; CHECK-NEXT:    rdsvl x3, #1
382; CHECK-NEXT:    addvl x0, sp, #2
383; CHECK-NEXT:    addvl x1, sp, #1
384; CHECK-NEXT:    mov x2, sp
385; CHECK-NEXT:    smstop sm
386; CHECK-NEXT:    bl foo
387; CHECK-NEXT:    smstart sm
388; CHECK-NEXT:    ptrue p0.b
389; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp, #2, mul vl]
390; CHECK-NEXT:    fmov w0, s0
391; CHECK-NEXT:    addvl sp, sp, #3
392; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
393; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
394; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
395; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
396; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
397; CHECK-NEXT:    ret
398entry:
399  %Data1 = alloca <vscale x 16 x i8>, align 16
400  %Data2 = alloca <vscale x 16 x i8>, align 16
401  %Data3 = alloca <vscale x 16 x i8>, align 16
402  %0 = tail call i64 @llvm.aarch64.sme.cntsb()
403  call void @foo(ptr noundef nonnull %Data1, ptr noundef nonnull %Data2, ptr noundef nonnull %Data3, i64 noundef %0)
404  %1 = load <vscale x 16 x i8>, ptr %Data1, align 16
405  %vecext = extractelement <vscale x 16 x i8> %1, i64 0
406  ret i8 %vecext
407}
408
; A streaming caller (attribute group #0) forwards all of its arguments
; (pointer, integers, floats and doubles) to @bar, which is declared without a
; streaming attribute. The expected output stores the floating-point arguments
; (s0, s1, d2, d3) to the stack before "smstop sm" and reloads them after it,
; immediately before "bl bar"; "smstart sm" follows the call. No corresponding
; stores or reloads are expected for the integer/pointer arguments.
409define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) #0 {
410; CHECK-LABEL: call_to_non_streaming_pass_args:
411; CHECK:       // %bb.0: // %entry
412; CHECK-NEXT:    sub sp, sp, #112
413; CHECK-NEXT:    cntd x9
414; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
415; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
416; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
417; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
418; CHECK-NEXT:    stp x30, x9, [sp, #96] // 16-byte Folded Spill
419; CHECK-NEXT:    stp d2, d3, [sp, #16] // 16-byte Folded Spill
420; CHECK-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
421; CHECK-NEXT:    smstop sm
422; CHECK-NEXT:    ldp s0, s1, [sp, #8] // 8-byte Folded Reload
423; CHECK-NEXT:    ldp d2, d3, [sp, #16] // 16-byte Folded Reload
424; CHECK-NEXT:    bl bar
425; CHECK-NEXT:    smstart sm
426; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
427; CHECK-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
428; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
429; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
430; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
431; CHECK-NEXT:    add sp, sp, #112
432; CHECK-NEXT:    ret
433entry:
434  call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
435  ret void
436}
437
; Intrinsic used by @call_to_non_streaming_pass_sve_objects to produce the
; i64 value passed as the last argument to @foo.
438declare i64 @llvm.aarch64.sme.cntsb()
439
; External callees declared without any streaming attribute; they are called
; from the streaming functions that use attribute group #0.
440declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef)
441declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef)
442
; Attribute group shared by @call_to_non_streaming_pass_sve_objects and
; @call_to_non_streaming_pass_args: streaming caller, nounwind, and a vscale
; range of 1 to 16.
443attributes #0 = { nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" }
444