xref: /llvm-project/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll (revision 6e1ea7e5a7b6e581bf9a030b98a7f63ee2833278)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -fast-isel=true -aarch64-streaming-hazard-size=0 -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 < %s \
3; RUN:     | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-FISEL
4; RUN: llc -fast-isel=false -aarch64-streaming-hazard-size=0 -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 < %s \
5; RUN:     | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-GISEL
6
7
; External callees used as call targets by the functions below.
; @streaming_callee is declared with "aarch64_pstate_sm_enabled";
; @normal_callee is declared without any SME attribute.
8declare double @streaming_callee(double) "aarch64_pstate_sm_enabled"
9declare double @normal_callee(double)
10
; Caller without SME attributes calling @streaming_callee, with
; "aarch64_pstate_sm_enabled" on the call site.
; Expected for both RUN lines: d8-d15 are saved/restored in the
; prologue/epilogue, the argument in d0 is spilled before `smstart sm` and
; reloaded after it, and the call result is spilled before `smstop sm` and
; reloaded after it.
; The two expectation sets differ only in how the constant 42.0
; (0x4045000000000000) is materialized: a constant-pool load (adrp + ldr) in
; the FastISel run versus mov + fmov in the GlobalISel run.
11define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline optnone {
12; CHECK-FISEL-LABEL: nonstreaming_caller_streaming_callee:
13; CHECK-FISEL:       // %bb.0: // %entry
14; CHECK-FISEL-NEXT:    sub sp, sp, #96
15; CHECK-FISEL-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
16; CHECK-FISEL-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
17; CHECK-FISEL-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
18; CHECK-FISEL-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
19; CHECK-FISEL-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
20; CHECK-FISEL-NEXT:    cntd x9
21; CHECK-FISEL-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
22; CHECK-FISEL-NEXT:    str d0, [sp] // 8-byte Folded Spill
23; CHECK-FISEL-NEXT:    smstart sm
24; CHECK-FISEL-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
25; CHECK-FISEL-NEXT:    bl streaming_callee
26; CHECK-FISEL-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
27; CHECK-FISEL-NEXT:    smstop sm
28; CHECK-FISEL-NEXT:    ldr d1, [sp, #8] // 8-byte Folded Reload
29; CHECK-FISEL-NEXT:    adrp x8, .LCPI0_0
30; CHECK-FISEL-NEXT:    ldr d0, [x8, :lo12:.LCPI0_0]
31; CHECK-FISEL-NEXT:    fadd d0, d1, d0
32; CHECK-FISEL-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
33; CHECK-FISEL-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
34; CHECK-FISEL-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
35; CHECK-FISEL-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
36; CHECK-FISEL-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
37; CHECK-FISEL-NEXT:    add sp, sp, #96
38; CHECK-FISEL-NEXT:    ret
39;
40; CHECK-GISEL-LABEL: nonstreaming_caller_streaming_callee:
41; CHECK-GISEL:       // %bb.0: // %entry
42; CHECK-GISEL-NEXT:    sub sp, sp, #96
43; CHECK-GISEL-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
44; CHECK-GISEL-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
45; CHECK-GISEL-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
46; CHECK-GISEL-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
47; CHECK-GISEL-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
48; CHECK-GISEL-NEXT:    cntd x9
49; CHECK-GISEL-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
50; CHECK-GISEL-NEXT:    str d0, [sp] // 8-byte Folded Spill
51; CHECK-GISEL-NEXT:    smstart sm
52; CHECK-GISEL-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
53; CHECK-GISEL-NEXT:    bl streaming_callee
54; CHECK-GISEL-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
55; CHECK-GISEL-NEXT:    smstop sm
56; CHECK-GISEL-NEXT:    ldr d1, [sp, #8] // 8-byte Folded Reload
57; CHECK-GISEL-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
58; CHECK-GISEL-NEXT:    fmov d0, x8
59; CHECK-GISEL-NEXT:    fadd d0, d1, d0
60; CHECK-GISEL-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
61; CHECK-GISEL-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
62; CHECK-GISEL-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
63; CHECK-GISEL-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
64; CHECK-GISEL-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
65; CHECK-GISEL-NEXT:    add sp, sp, #96
66; CHECK-GISEL-NEXT:    ret
67entry:
68  %call = call double @streaming_callee(double %x) "aarch64_pstate_sm_enabled"
69  %add = fadd double %call, 4.200000e+01
70  ret double %add
71}
72
73
; Caller marked "aarch64_pstate_sm_enabled" calling @normal_callee, which has
; no SME attributes.
; Expected (one shared set of expectations for both RUN lines): `smstop sm`
; before the call and `smstart sm` after it, with d0 spilled to the stack
; before each mode switch and reloaded after it; d8-d15 are saved/restored in
; the prologue/epilogue.
74define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_enabled" {
75; CHECK-COMMON-LABEL: streaming_caller_nonstreaming_callee:
76; CHECK-COMMON:       // %bb.0: // %entry
77; CHECK-COMMON-NEXT:    sub sp, sp, #96
78; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
79; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
80; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
81; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
82; CHECK-COMMON-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
83; CHECK-COMMON-NEXT:    cntd x9
84; CHECK-COMMON-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
85; CHECK-COMMON-NEXT:    str d0, [sp] // 8-byte Folded Spill
86; CHECK-COMMON-NEXT:    smstop sm
87; CHECK-COMMON-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
88; CHECK-COMMON-NEXT:    bl normal_callee
89; CHECK-COMMON-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
90; CHECK-COMMON-NEXT:    smstart sm
91; CHECK-COMMON-NEXT:    ldr d1, [sp, #8] // 8-byte Folded Reload
92; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
93; CHECK-COMMON-NEXT:    fmov d0, x8
94; CHECK-COMMON-NEXT:    fadd d0, d1, d0
95; CHECK-COMMON-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
96; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
97; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
98; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
99; CHECK-COMMON-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
100; CHECK-COMMON-NEXT:    add sp, sp, #96
101; CHECK-COMMON-NEXT:    ret
102entry:
103  %call = call double @normal_callee(double %x)
104  %add = fadd double %call, 4.200000e+01
105  ret double %add
106}
107
; Function marked "aarch64_pstate_sm_body" calling @normal_callee, which has
; no SME attributes.
; Expected (shared by both RUN lines), four mode switches in total:
;   - `smstart sm` after the prologue,
;   - `smstop sm` before and `smstart sm` after the call,
;   - `smstop sm` before the epilogue.
; d0 is spilled before and reloaded after every one of these switches.
; The prologue also stores two values to the frame: `rdsvl x9, #1` shifted
; right by 3, and `cntd x9`.
108define double @locally_streaming_caller_normal_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_body" {
109; CHECK-COMMON-LABEL: locally_streaming_caller_normal_callee:
110; CHECK-COMMON:       // %bb.0:
111; CHECK-COMMON-NEXT:    sub sp, sp, #128
112; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
113; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
114; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
115; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
116; CHECK-COMMON-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
117; CHECK-COMMON-NEXT:    rdsvl x9, #1
118; CHECK-COMMON-NEXT:    lsr x9, x9, #3
119; CHECK-COMMON-NEXT:    str x9, [sp, #104] // 8-byte Folded Spill
120; CHECK-COMMON-NEXT:    cntd x9
121; CHECK-COMMON-NEXT:    str x9, [sp, #112] // 8-byte Folded Spill
122; CHECK-COMMON-NEXT:    str d0, [sp, #24] // 8-byte Folded Spill
123; CHECK-COMMON-NEXT:    smstart sm
124; CHECK-COMMON-NEXT:    ldr d0, [sp, #24] // 8-byte Folded Reload
125; CHECK-COMMON-NEXT:    str d0, [sp, #24] // 8-byte Folded Spill
126; CHECK-COMMON-NEXT:    smstop sm
127; CHECK-COMMON-NEXT:    ldr d0, [sp, #24] // 8-byte Folded Reload
128; CHECK-COMMON-NEXT:    bl normal_callee
129; CHECK-COMMON-NEXT:    str d0, [sp, #16] // 8-byte Folded Spill
130; CHECK-COMMON-NEXT:    smstart sm
131; CHECK-COMMON-NEXT:    ldr d1, [sp, #16] // 8-byte Folded Reload
132; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
133; CHECK-COMMON-NEXT:    fmov d0, x8
134; CHECK-COMMON-NEXT:    fadd d0, d1, d0
135; CHECK-COMMON-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
136; CHECK-COMMON-NEXT:    smstop sm
137; CHECK-COMMON-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
138; CHECK-COMMON-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
139; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
140; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
141; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
142; CHECK-COMMON-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
143; CHECK-COMMON-NEXT:    add sp, sp, #128
144; CHECK-COMMON-NEXT:    ret
145  %call = call double  @normal_callee(double %x);
146  %add = fadd double %call, 4.200000e+01
147  ret double %add;
148}
149
; Caller without SME attributes calling
; @locally_streaming_caller_normal_callee; the call site carries
; "aarch64_pstate_sm_body".
; Expected for both RUN lines: a plain `bl` with no smstart/smstop in the
; caller, and only x30 saved/restored.
; The two expectation sets differ only in how 42.0 is materialized
; (constant-pool load for FastISel, mov + fmov for GlobalISel).
150define double @normal_caller_to_locally_streaming_callee(double %x) nounwind noinline optnone {
151; CHECK-FISEL-LABEL: normal_caller_to_locally_streaming_callee:
152; CHECK-FISEL:       // %bb.0:
153; CHECK-FISEL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
154; CHECK-FISEL-NEXT:    bl locally_streaming_caller_normal_callee
155; CHECK-FISEL-NEXT:    adrp x8, .LCPI3_0
156; CHECK-FISEL-NEXT:    ldr d1, [x8, :lo12:.LCPI3_0]
157; CHECK-FISEL-NEXT:    fadd d0, d0, d1
158; CHECK-FISEL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
159; CHECK-FISEL-NEXT:    ret
160;
161; CHECK-GISEL-LABEL: normal_caller_to_locally_streaming_callee:
162; CHECK-GISEL:       // %bb.0:
163; CHECK-GISEL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
164; CHECK-GISEL-NEXT:    bl locally_streaming_caller_normal_callee
165; CHECK-GISEL-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
166; CHECK-GISEL-NEXT:    fmov d1, x8
167; CHECK-GISEL-NEXT:    fadd d0, d0, d1
168; CHECK-GISEL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
169; CHECK-GISEL-NEXT:    ret
170  %call = call double  @locally_streaming_caller_normal_callee(double %x) "aarch64_pstate_sm_body";
171  %add = fadd double %call, 4.200000e+01
172  ret double %add;
173}
174
175; Check attribute in the call itself
176
; Function marked "aarch64_pstate_sm_body" making an indirect call through %p;
; the streaming requirement is expressed only by the call-site attribute
; "aarch64_pstate_sm_enabled".
; Expected (shared by both RUN lines): exactly one `smstart sm` / `blr x0` /
; `smstop sm` sequence, i.e. no additional mode switch around the call itself.
; The prologue stores `rdsvl x9, #1` shifted right by 3 and `cntd x9` to the
; frame, and d8-d15 are saved/restored.
177define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noinline optnone "aarch64_pstate_sm_body" {
178; CHECK-COMMON-LABEL: locally_streaming_caller_streaming_callee_ptr:
179; CHECK-COMMON:       // %bb.0:
180; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
181; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
182; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
183; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
184; CHECK-COMMON-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
185; CHECK-COMMON-NEXT:    rdsvl x9, #1
186; CHECK-COMMON-NEXT:    lsr x9, x9, #3
187; CHECK-COMMON-NEXT:    str x9, [sp, #72] // 8-byte Folded Spill
188; CHECK-COMMON-NEXT:    cntd x9
189; CHECK-COMMON-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
190; CHECK-COMMON-NEXT:    smstart sm
191; CHECK-COMMON-NEXT:    blr x0
192; CHECK-COMMON-NEXT:    smstop sm
193; CHECK-COMMON-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
194; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
195; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
196; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
197; CHECK-COMMON-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
198; CHECK-COMMON-NEXT:    ret
199 call void %p() "aarch64_pstate_sm_enabled"
200  ret void
201}
202
; Caller without SME attributes making an indirect call through %p with
; "aarch64_pstate_sm_enabled" on the call site.
; Expected (shared by both RUN lines): `smstart sm` / `blr x0` / `smstop sm`,
; with d8-d15 saved/restored and the `cntd x9` value stored in the frame.
203define void @normal_call_to_streaming_callee_ptr(ptr %p) nounwind noinline optnone {
204; CHECK-COMMON-LABEL: normal_call_to_streaming_callee_ptr:
205; CHECK-COMMON:       // %bb.0:
206; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
207; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
208; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
209; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
210; CHECK-COMMON-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
211; CHECK-COMMON-NEXT:    cntd x9
212; CHECK-COMMON-NEXT:    str x9, [sp, #72] // 8-byte Folded Spill
213; CHECK-COMMON-NEXT:    smstart sm
214; CHECK-COMMON-NEXT:    blr x0
215; CHECK-COMMON-NEXT:    smstop sm
216; CHECK-COMMON-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
217; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
218; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
219; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
220; CHECK-COMMON-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
221; CHECK-COMMON-NEXT:    ret
222  call void %p() "aarch64_pstate_sm_enabled"
223  ret void
224}
225
226;
227; Check ZA state
228;
229
; External callee declared with "aarch64_inout_za"; it is the call target of
; @za_new_caller_to_za_shared_callee.
230declare double @za_shared_callee(double) "aarch64_inout_za"
231
; Function marked "aarch64_new_za" calling @za_shared_callee
; ("aarch64_inout_za").
; Expected (shared by both RUN lines):
;   - a %prelude block reads TPIDR2_EL0; when it is non-zero the %save.za
;     block calls __arm_tpidr2_save and then writes xzr to TPIDR2_EL0;
;   - %entry executes `smstart za` and `zero {za}` before the call;
;   - `smstop za` is executed after the fadd, before returning.
; No smstart/smstop for ZA is expected immediately around the `bl` itself.
232define double  @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{
233; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee:
234; CHECK-COMMON:       // %bb.0: // %prelude
235; CHECK-COMMON-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
236; CHECK-COMMON-NEXT:    rdsvl x8, #1
237; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
238; CHECK-COMMON-NEXT:    cbz x8, .LBB6_2
239; CHECK-COMMON-NEXT:    b .LBB6_1
240; CHECK-COMMON-NEXT:  .LBB6_1: // %save.za
241; CHECK-COMMON-NEXT:    bl __arm_tpidr2_save
242; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
243; CHECK-COMMON-NEXT:    b .LBB6_2
244; CHECK-COMMON-NEXT:  .LBB6_2: // %entry
245; CHECK-COMMON-NEXT:    smstart za
246; CHECK-COMMON-NEXT:    zero {za}
247; CHECK-COMMON-NEXT:    bl za_shared_callee
248; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
249; CHECK-COMMON-NEXT:    fmov d1, x8
250; CHECK-COMMON-NEXT:    fadd d0, d0, d1
251; CHECK-COMMON-NEXT:    smstop za
252; CHECK-COMMON-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
253; CHECK-COMMON-NEXT:    ret
254entry:
255  %call = call double @za_shared_callee(double %x)
256  %add = fadd double %call, 4.200000e+01
257  ret double %add;
258}
259
; Function marked "aarch64_inout_za" calling @normal_callee, which has no ZA
; attributes.
; Expected (shared by both RUN lines):
;   - sp is lowered by x8 * x8, where x8 comes from `rdsvl x8, #1`
;     (`msub x9, x8, x8, x9`), and the new sp is stored at [x29, #-16];
;   - the remaining fields of the 16-byte block at x29 - 16 are written and
;     the block's address is placed in TPIDR2_EL0 before the call;
;   - after the call: `smstart za`, TPIDR2_EL0 is read back, and
;     __arm_tpidr2_restore is called (x0 = x29 - 16) only when it reads as
;     zero; TPIDR2_EL0 is then set to xzr.
260define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline optnone "aarch64_inout_za"{
261; CHECK-COMMON-LABEL: za_shared_caller_to_za_none_callee:
262; CHECK-COMMON:       // %bb.0: // %entry
263; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
264; CHECK-COMMON-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
265; CHECK-COMMON-NEXT:    mov x29, sp
266; CHECK-COMMON-NEXT:    sub sp, sp, #16
267; CHECK-COMMON-NEXT:    rdsvl x8, #1
268; CHECK-COMMON-NEXT:    mov x9, sp
269; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
270; CHECK-COMMON-NEXT:    mov sp, x9
271; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
272; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
273; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
274; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
275; CHECK-COMMON-NEXT:    sub x8, x29, #16
276; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x8
277; CHECK-COMMON-NEXT:    bl normal_callee
278; CHECK-COMMON-NEXT:    smstart za
279; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
280; CHECK-COMMON-NEXT:    sub x0, x29, #16
281; CHECK-COMMON-NEXT:    cbz x8, .LBB7_1
282; CHECK-COMMON-NEXT:    b .LBB7_2
283; CHECK-COMMON-NEXT:  .LBB7_1: // %entry
284; CHECK-COMMON-NEXT:    bl __arm_tpidr2_restore
285; CHECK-COMMON-NEXT:    b .LBB7_2
286; CHECK-COMMON-NEXT:  .LBB7_2: // %entry
287; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
288; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
289; CHECK-COMMON-NEXT:    fmov d1, x8
290; CHECK-COMMON-NEXT:    fadd d0, d0, d1
291; CHECK-COMMON-NEXT:    mov sp, x29
292; CHECK-COMMON-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
293; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
294; CHECK-COMMON-NEXT:    ret
295entry:
296  %call = call double @normal_callee(double %x)
297  %add = fadd double %call, 4.200000e+01
298  ret double %add;
299}
300
301; Ensure we set up and restore the lazy save correctly for instructions which are lowered to lib calls.
; The function is marked "aarch64_inout_za" and its only operation is an
; fp128 fadd. The expected output (shared by both RUN lines) contains a call
; to __addtf3, preceded by the write of x29 - 16 to TPIDR2_EL0 and followed
; by `smstart za`, a TPIDR2_EL0 read, a call to __arm_tpidr2_restore that is
; skipped when the value read is non-zero (cbnz), and a final write of xzr to
; TPIDR2_EL0.
302define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
303; CHECK-COMMON-LABEL: f128_call_za:
304; CHECK-COMMON:       // %bb.0:
305; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
306; CHECK-COMMON-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
307; CHECK-COMMON-NEXT:    mov x29, sp
308; CHECK-COMMON-NEXT:    sub sp, sp, #16
309; CHECK-COMMON-NEXT:    rdsvl x8, #1
310; CHECK-COMMON-NEXT:    mov x9, sp
311; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
312; CHECK-COMMON-NEXT:    mov sp, x9
313; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
314; CHECK-COMMON-NEXT:    sub x9, x29, #16
315; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
316; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
317; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
318; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
319; CHECK-COMMON-NEXT:    bl __addtf3
320; CHECK-COMMON-NEXT:    smstart za
321; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
322; CHECK-COMMON-NEXT:    sub x0, x29, #16
323; CHECK-COMMON-NEXT:    cbnz x8, .LBB8_2
324; CHECK-COMMON-NEXT:  // %bb.1:
325; CHECK-COMMON-NEXT:    bl __arm_tpidr2_restore
326; CHECK-COMMON-NEXT:  .LBB8_2:
327; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
328; CHECK-COMMON-NEXT:    mov sp, x29
329; CHECK-COMMON-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
330; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
331; CHECK-COMMON-NEXT:    ret
332  %res = fadd fp128 %a, %b
333  ret fp128 %res
334}
335
336
337; Ensure we fall back to SelectionDAG isel here so that we temporarily disable streaming mode to lower the fadd (with function calls).
; The function is marked "aarch64_pstate_sm_enabled". The expected output
; (shared by both RUN lines) spills q0/q1 before `smstop sm`, reloads them,
; calls __addtf3, spills the q0 result before `smstart sm` and reloads it
; afterwards; d8-d15 are saved/restored in the prologue/epilogue.
338define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounwind {
339; CHECK-COMMON-LABEL: f128_call_sm:
340; CHECK-COMMON:       // %bb.0:
341; CHECK-COMMON-NEXT:    sub sp, sp, #112
342; CHECK-COMMON-NEXT:    cntd x9
343; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
344; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
345; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
346; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
347; CHECK-COMMON-NEXT:    stp x30, x9, [sp, #96] // 16-byte Folded Spill
348; CHECK-COMMON-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
349; CHECK-COMMON-NEXT:    smstop sm
350; CHECK-COMMON-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
351; CHECK-COMMON-NEXT:    bl __addtf3
352; CHECK-COMMON-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
353; CHECK-COMMON-NEXT:    smstart sm
354; CHECK-COMMON-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
355; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
356; CHECK-COMMON-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
357; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
358; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
359; CHECK-COMMON-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
360; CHECK-COMMON-NEXT:    add sp, sp, #112
361; CHECK-COMMON-NEXT:    ret
362  %res = fadd fp128 %a, %b
363  ret fp128 %res
364}
365
366; As above this should use Selection DAG to make sure the libcall is lowered correctly.
; The function is marked "aarch64_inout_za" and performs a double frem. The
; expected output (shared by both RUN lines) calls fmod, with the same
; TPIDR2_EL0 setup before the call and conditional __arm_tpidr2_restore
; after it as in f128_call_za.
367define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
368; CHECK-COMMON-LABEL: frem_call_za:
369; CHECK-COMMON:       // %bb.0:
370; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
371; CHECK-COMMON-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
372; CHECK-COMMON-NEXT:    mov x29, sp
373; CHECK-COMMON-NEXT:    sub sp, sp, #16
374; CHECK-COMMON-NEXT:    rdsvl x8, #1
375; CHECK-COMMON-NEXT:    mov x9, sp
376; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
377; CHECK-COMMON-NEXT:    mov sp, x9
378; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
379; CHECK-COMMON-NEXT:    sub x9, x29, #16
380; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
381; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
382; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
383; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
384; CHECK-COMMON-NEXT:    bl fmod
385; CHECK-COMMON-NEXT:    smstart za
386; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
387; CHECK-COMMON-NEXT:    sub x0, x29, #16
388; CHECK-COMMON-NEXT:    cbnz x8, .LBB10_2
389; CHECK-COMMON-NEXT:  // %bb.1:
390; CHECK-COMMON-NEXT:    bl __arm_tpidr2_restore
391; CHECK-COMMON-NEXT:  .LBB10_2:
392; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
393; CHECK-COMMON-NEXT:    mov sp, x29
394; CHECK-COMMON-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
395; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
396; CHECK-COMMON-NEXT:    ret
397  %res = frem double %a, %b
398  ret double %res
399}
400
401; As above this should use Selection DAG to make sure the libcall is lowered correctly.
; The function is marked "aarch64_pstate_sm_enabled" and performs a float
; frem. The expected output (shared by both RUN lines) spills s0/s1 before
; `smstop sm`, reloads them, calls fmodf, then spills the s0 result before
; `smstart sm` and reloads it afterwards.
402define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounwind {
403; CHECK-COMMON-LABEL: frem_call_sm:
404; CHECK-COMMON:       // %bb.0:
405; CHECK-COMMON-NEXT:    sub sp, sp, #96
406; CHECK-COMMON-NEXT:    cntd x9
407; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
408; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
409; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
410; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
411; CHECK-COMMON-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
412; CHECK-COMMON-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
413; CHECK-COMMON-NEXT:    smstop sm
414; CHECK-COMMON-NEXT:    ldp s0, s1, [sp, #8] // 8-byte Folded Reload
415; CHECK-COMMON-NEXT:    bl fmodf
416; CHECK-COMMON-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
417; CHECK-COMMON-NEXT:    smstart sm
418; CHECK-COMMON-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
419; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
420; CHECK-COMMON-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
421; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
422; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
423; CHECK-COMMON-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
424; CHECK-COMMON-NEXT:    add sp, sp, #96
425; CHECK-COMMON-NEXT:    ret
426  %res = frem float %a, %b
427  ret float %res
428}
429
430; As above this should use Selection DAG to make sure the libcall is lowered correctly.
; The function is marked "aarch64_pstate_sm_compatible" and performs a float
; frem. The expected output (shared by both RUN lines) calls
; __arm_sme_state, keeps bit 0 of the x0 result in x19, and makes both mode
; switches conditional on that bit: `smstop sm` before the fmodf call and
; `smstart sm` after it are each skipped by a `tbz w19, #0` when the bit is
; clear.
431define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compatible" nounwind {
432; CHECK-COMMON-LABEL: frem_call_sm_compat:
433; CHECK-COMMON:       // %bb.0:
434; CHECK-COMMON-NEXT:    sub sp, sp, #112
435; CHECK-COMMON-NEXT:    cntd x9
436; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
437; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
438; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
439; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
440; CHECK-COMMON-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
441; CHECK-COMMON-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
442; CHECK-COMMON-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
443; CHECK-COMMON-NEXT:    bl __arm_sme_state
444; CHECK-COMMON-NEXT:    and x19, x0, #0x1
445; CHECK-COMMON-NEXT:    tbz w19, #0, .LBB12_2
446; CHECK-COMMON-NEXT:  // %bb.1:
447; CHECK-COMMON-NEXT:    smstop sm
448; CHECK-COMMON-NEXT:  .LBB12_2:
449; CHECK-COMMON-NEXT:    ldp s0, s1, [sp, #8] // 8-byte Folded Reload
450; CHECK-COMMON-NEXT:    bl fmodf
451; CHECK-COMMON-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
452; CHECK-COMMON-NEXT:    tbz w19, #0, .LBB12_4
453; CHECK-COMMON-NEXT:  // %bb.3:
454; CHECK-COMMON-NEXT:    smstart sm
455; CHECK-COMMON-NEXT:  .LBB12_4:
456; CHECK-COMMON-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
457; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
458; CHECK-COMMON-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
459; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
460; CHECK-COMMON-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
461; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
462; CHECK-COMMON-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
463; CHECK-COMMON-NEXT:    add sp, sp, #112
464; CHECK-COMMON-NEXT:    ret
465  %res = frem float %a, %b
466  ret float %res
467}
468
469;
470; Check ZT0 State
471;
472
; External callee declared with "aarch64_inout_zt0"; it is the call target of
; @zt0_new_caller_to_zt0_shared_callee.
473declare double @zt0_shared_callee(double) "aarch64_inout_zt0"
474
; Function marked "aarch64_new_zt0" calling @zt0_shared_callee
; ("aarch64_inout_zt0").
; Expected (shared by both RUN lines):
;   - a %prelude block reads TPIDR2_EL0; when it is non-zero the %save.za
;     block stores zt0 to the stack, calls __arm_tpidr2_save, reloads zt0 and
;     writes xzr to TPIDR2_EL0;
;   - %entry executes `smstart za` and `zero { zt0 }` before the call;
;   - `smstop za` is executed after the fadd, before returning.
475define double  @zt0_new_caller_to_zt0_shared_callee(double %x) nounwind noinline optnone "aarch64_new_zt0" {
476; CHECK-COMMON-LABEL: zt0_new_caller_to_zt0_shared_callee:
477; CHECK-COMMON:       // %bb.0: // %prelude
478; CHECK-COMMON-NEXT:    sub sp, sp, #80
479; CHECK-COMMON-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
480; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
481; CHECK-COMMON-NEXT:    cbz x8, .LBB13_2
482; CHECK-COMMON-NEXT:    b .LBB13_1
483; CHECK-COMMON-NEXT:  .LBB13_1: // %save.za
484; CHECK-COMMON-NEXT:    mov x8, sp
485; CHECK-COMMON-NEXT:    str zt0, [x8]
486; CHECK-COMMON-NEXT:    bl __arm_tpidr2_save
487; CHECK-COMMON-NEXT:    ldr zt0, [x8]
488; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
489; CHECK-COMMON-NEXT:    b .LBB13_2
490; CHECK-COMMON-NEXT:  .LBB13_2: // %entry
491; CHECK-COMMON-NEXT:    smstart za
492; CHECK-COMMON-NEXT:    zero { zt0 }
493; CHECK-COMMON-NEXT:    bl zt0_shared_callee
494; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
495; CHECK-COMMON-NEXT:    fmov d1, x8
496; CHECK-COMMON-NEXT:    fadd d0, d0, d1
497; CHECK-COMMON-NEXT:    smstop za
498; CHECK-COMMON-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
499; CHECK-COMMON-NEXT:    add sp, sp, #80
500; CHECK-COMMON-NEXT:    ret
501entry:
502  %call = call double @zt0_shared_callee(double %x)
503  %add = fadd double %call, 4.200000e+01
504  ret double %add;
505}
506
; Function marked "aarch64_inout_zt0" calling @normal_callee, which has no
; SME attributes.
; Expected (shared by both RUN lines): zt0 is stored to a stack slot addressed
; through x19, `smstop za` precedes the call, and after the call
; `smstart za` is followed by reloading zt0 from the same slot. x19 is
; saved/restored together with x30.
507define double  @zt0_shared_caller_to_normal_callee(double %x) nounwind noinline optnone "aarch64_inout_zt0" {
508; CHECK-COMMON-LABEL: zt0_shared_caller_to_normal_callee:
509; CHECK-COMMON:       // %bb.0: // %entry
510; CHECK-COMMON-NEXT:    sub sp, sp, #80
511; CHECK-COMMON-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
512; CHECK-COMMON-NEXT:    mov x19, sp
513; CHECK-COMMON-NEXT:    str zt0, [x19]
514; CHECK-COMMON-NEXT:    smstop za
515; CHECK-COMMON-NEXT:    bl normal_callee
516; CHECK-COMMON-NEXT:    smstart za
517; CHECK-COMMON-NEXT:    ldr zt0, [x19]
518; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
519; CHECK-COMMON-NEXT:    fmov d1, x8
520; CHECK-COMMON-NEXT:    fadd d0, d0, d1
521; CHECK-COMMON-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
522; CHECK-COMMON-NEXT:    add sp, sp, #80
523; CHECK-COMMON-NEXT:    ret
524entry:
525  %call = call double @normal_callee(double %x)
526  %add = fadd double %call, 4.200000e+01
527  ret double %add;
528}
529
; Function marked "aarch64_za_state_agnostic" making an indirect call through
; %ptr with no attributes on the call site.
; Expected (shared by both RUN lines):
;   - the incoming pointer is moved from x0 to x8 before any helper call;
;   - __arm_sme_state_size is called and sp is lowered by its x0 result;
;   - __arm_sme_save is called with the new sp (kept in x20) as its argument;
;   - the indirect call is `blr x8`;
;   - __arm_sme_restore is called with the same x20 buffer address;
;   - sp is restored from x29 in the epilogue.
530define void @agnostic_za_function(ptr %ptr) nounwind "aarch64_za_state_agnostic" {
531; CHECK-COMMON-LABEL: agnostic_za_function:
532; CHECK-COMMON:       // %bb.0:
533; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
534; CHECK-COMMON-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
535; CHECK-COMMON-NEXT:    mov x29, sp
536; CHECK-COMMON-NEXT:    mov x8, x0
537; CHECK-COMMON-NEXT:    bl __arm_sme_state_size
538; CHECK-COMMON-NEXT:    sub sp, sp, x0
539; CHECK-COMMON-NEXT:    mov x20, sp
540; CHECK-COMMON-NEXT:    mov x0, x20
541; CHECK-COMMON-NEXT:    bl __arm_sme_save
542; CHECK-COMMON-NEXT:    blr x8
543; CHECK-COMMON-NEXT:    mov x0, x20
544; CHECK-COMMON-NEXT:    bl __arm_sme_restore
545; CHECK-COMMON-NEXT:    mov sp, x29
546; CHECK-COMMON-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
547; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
548; CHECK-COMMON-NEXT:    ret
549  call void %ptr()
550  ret void
551}
552
553