; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -verify-machineinstrs < %s | FileCheck %s

; FP is in CSR range, modified.
; Attribute #1 forces "frame-pointer"="all", so s33 is saved (to s4),
; set up from s32, and restored before returning.
define hidden fastcc void @callee_has_fp() #1 {
; CHECK-LABEL: callee_has_fp:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_add_i32 s32, s32, 0x200
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; Volatile store keeps the alloca (and therefore the stack frame) alive.
  %alloca = alloca i32, addrspace(5)
  store volatile i32 1, ptr addrspace(5) %alloca
  ret void
}

; Has no stack objects, but introduces them due to the CSR spill. We
; see the FP modified in the callee with IPRA. We should not have
; redundant spills of s33 or assert.
define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-LABEL: csr_vgpr_spill_fp_callee:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s18, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
; CHECK-NEXT: s_add_i32 s32, s32, 0x400
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v1, s30, 0
; CHECK-NEXT: v_writelane_b32 v1, s31, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, callee_has_fp@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; clobber csr v40
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_readlane_b32 s31, v1, 1
; CHECK-NEXT: v_readlane_b32 s30, v1, 0
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_mov_b32 s33, s18
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
bb:
  call fastcc void @callee_has_fp()
  ; The inline asm clobbering v40 forces a CSR VGPR spill/reload in this
  ; otherwise stack-object-free function.
  call void asm sideeffect "; clobber csr v40", "~{v40}"()
  ret void
}

; Kernel entry point; reaches csr_vgpr_spill_fp_callee (and transitively
; callee_has_fp) so the CSR spill path is exercised from a kernel frame.
define amdgpu_kernel void @kernel_call() {
; CHECK-LABEL: kernel_call:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, csr_vgpr_spill_fp_callee@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, csr_vgpr_spill_fp_callee@rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b32 s15, 20
; CHECK-NEXT: v_lshlrev_b32_e64 v2, s15, v2
; CHECK-NEXT: s_mov_b32 s15, 10
; CHECK-NEXT: v_lshlrev_b32_e64 v1, s15, v1
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT: ; implicit-def: $sgpr15
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_endpgm
bb:
  tail call fastcc void @csr_vgpr_spill_fp_callee()
  ret void
}

; Same, except with a tail call.
define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 {
; CHECK-LABEL: csr_vgpr_spill_fp_tailcall_callee:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v1, s33, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; clobber csr v40
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, callee_has_fp@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12
; CHECK-NEXT: v_readlane_b32 s33, v1, 0
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: s_xor_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: s_setpc_b64 s[16:17]
bb:
  ; v40 clobber forces a CSR spill; the outgoing call is a tail call, so the
  ; epilogue runs before jumping to callee_has_fp via s[16:17].
  call void asm sideeffect "; clobber csr v40", "~{v40}"()
  tail call fastcc void @callee_has_fp()
  ret void
}

; Kernel entry point for the tail-call variant above.
define amdgpu_kernel void @kernel_tailcall() {
; CHECK-LABEL: kernel_tailcall:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b32 s15, 20
; CHECK-NEXT: v_lshlrev_b32_e64 v2, s15, v2
; CHECK-NEXT: s_mov_b32 s15, 10
; CHECK-NEXT: v_lshlrev_b32_e64 v1, s15, v1
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT: ; implicit-def: $sgpr15
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_endpgm
bb:
  tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee()
  ret void
}

; Trivial leaf returning 0; attribute #1 ("frame-pointer"="all") still makes
; it save/restore the FP (s33 via s4) around the body.
define hidden i32 @tail_call() #1 {
; CHECK-LABEL: tail_call:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
  ret i32 0
}

; Caller that must preserve the return address (s30/s31) across the call;
; it is written to VGPR lanes of v1, which is itself spilled around the call.
define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s18, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
; CHECK-NEXT: s_add_i32 s32, s32, 0x400
; CHECK-NEXT: v_writelane_b32 v1, s30, 0
; CHECK-NEXT: v_writelane_b32 v1, s31, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, tail_call@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, tail_call@rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_readlane_b32 s31, v1, 1
; CHECK-NEXT: v_readlane_b32 s30, v1, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_mov_b32 s33, s18
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
  %call = call i32 @tail_call()
  ret i32 %call
}

; Same shape one level up: s30/s31 are held in lanes of v2 across the call
; to caller_save_vgpr_spill_fp_tail_call.
define hidden i32 @caller_save_vgpr_spill_fp() #0 {
; CHECK-LABEL: caller_save_vgpr_spill_fp:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s19, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
; CHECK-NEXT: s_add_i32 s32, s32, 0x400
; CHECK-NEXT: v_writelane_b32 v2, s30, 0
; CHECK-NEXT: v_writelane_b32 v2, s31, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, caller_save_vgpr_spill_fp_tail_call@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, caller_save_vgpr_spill_fp_tail_call@rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_readlane_b32 s31, v2, 1
; CHECK-NEXT: v_readlane_b32 s30, v2, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_mov_b32 s33, s19
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
  %call = call i32 @caller_save_vgpr_spill_fp_tail_call()
  ret i32 %call
}

; Kernel entry for the caller-save VGPR spill chain above.
define protected amdgpu_kernel void @kernel() {
; CHECK-LABEL: kernel:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, caller_save_vgpr_spill_fp@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, caller_save_vgpr_spill_fp@rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b32 s15, 20
; CHECK-NEXT: v_lshlrev_b32_e64 v2, s15, v2
; CHECK-NEXT: s_mov_b32 s15, 10
; CHECK-NEXT: v_lshlrev_b32_e64 v1, s15, v1
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT: ; implicit-def: $sgpr15
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_endpgm
entry:
  %call = call i32 @caller_save_vgpr_spill_fp()
  ret void
}

; #0: no frame pointer requested; #1: frame pointer always kept.
attributes #0 = { "frame-pointer"="none" noinline }
attributes #1 = { "frame-pointer"="all" noinline }

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}