; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX1100 %s

; Checks the code generated for amdgpu_kernel entry points that use no stack,
; a stack object, a call, or both, first with the default attributes and then
; with "frame-pointer"="all", followed by a kernel whose spill slot offset has
; to be placed in an SGPR. Every kernel is checked on the four subtargets named
; in the RUN lines above.

; Kernel with neither a stack object nor a call.
define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_empty:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_empty:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_empty:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_kern_empty:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    s_endpgm
entry:
  ret void
}

; Kernel with one volatile store to a private (addrspace(5)) alloca, no call.
define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_stack:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_stack:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_kern_stack:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v0, 0
; GFX1100-NEXT:    scratch_store_b32 off, v0, off dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, ptr addrspace(5) %x, align 4
  ret void
}

; Kernel that only calls the external function @ex; no stack object.
define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_mov_b32 s32, 0
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_mov_b32 s32, 0
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_kern_call:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
; GFX1100-NEXT:    s_mov_b32 s12, s13
; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT:    s_mov_b32 s13, s14
; GFX1100-NEXT:    s_mov_b32 s14, s15
; GFX1100-NEXT:    s_mov_b32 s32, 0
; GFX1100-NEXT:    s_getpc_b64 s[16:17]
; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT:    s_endpgm

entry:
  tail call void @ex() #0
  ret void
}

; Kernel with both a volatile store to a private alloca and a call to @ex.
define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    v_mov_b32_e32 v3, 0
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_movk_i32 s32, 0x400
; GFX803-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_stack_and_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_movk_i32 s32, 0x400
; GFX900-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_stack_and_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_movk_i32 s32, 0x200
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_kern_stack_and_call:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
; GFX1100-NEXT:    s_mov_b32 s12, s13
; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT:    s_mov_b32 s13, s14
; GFX1100-NEXT:    s_mov_b32 s14, s15
; GFX1100-NEXT:    s_mov_b32 s32, 16
; GFX1100-NEXT:    scratch_store_b32 off, v1, off dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_getpc_b64 s[16:17]
; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT:    s_endpgm

entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, ptr addrspace(5) %x, align 4
  tail call void @ex() #0
  ret void
}

; Same body as @test_kern_empty, but built with "frame-pointer"="all" (#2);
; the expected output additionally sets s33 to 0.
define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_empty:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_empty:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_empty:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_empty:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    s_mov_b32 s33, 0
; GFX1100-NEXT:    s_endpgm

entry:
  ret void
}

; Same body as @test_kern_stack, but built with "frame-pointer"="all" (#2);
; the expected stores address the stack object through s33.
define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s33
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_stack:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v0, 0
; GFX1100-NEXT:    s_mov_b32 s33, 0
; GFX1100-NEXT:    scratch_store_b32 off, v0, s33 dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, ptr addrspace(5) %x, align 4
  ret void
}

; Same body as @test_kern_call, but built with "frame-pointer"="all" (#2).
define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_mov_b32 s32, 0
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_mov_b32 s32, 0
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_call:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
; GFX1100-NEXT:    s_mov_b32 s12, s13
; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT:    s_mov_b32 s13, s14
; GFX1100-NEXT:    s_mov_b32 s14, s15
; GFX1100-NEXT:    s_mov_b32 s33, 0
; GFX1100-NEXT:    s_mov_b32 s32, 0
; GFX1100-NEXT:    s_getpc_b64 s[16:17]
; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT:    s_endpgm
entry:
  tail call void @ex() #2
  ret void
}

; Same body as @test_kern_stack_and_call, but built with
; "frame-pointer"="all" (#2).
define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    v_mov_b32_e32 v3, 0
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_movk_i32 s32, 0x400
; GFX803-NEXT:    buffer_store_dword v3, off, s[0:3], s33
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_movk_i32 s32, 0x400
; GFX900-NEXT:    buffer_store_dword v3, off, s[0:3], s33
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_movk_i32 s32, 0x200
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], s33
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_stack_and_call:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
; GFX1100-NEXT:    s_mov_b32 s33, 0
; GFX1100-NEXT:    s_mov_b32 s12, s13
; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT:    s_mov_b32 s13, s14
; GFX1100-NEXT:    s_mov_b32 s14, s15
; GFX1100-NEXT:    s_mov_b32 s32, 16
; GFX1100-NEXT:    scratch_store_b32 off, v1, s33 dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_getpc_b64 s[16:17]
; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, ptr addrspace(5) %x, align 4
  tail call void @ex() #2
  ret void
}

; Kernel limited to 8 VGPRs (#1) in which %a is forced to spill across an
; inline asm that clobbers v0-v7, next to a large private alloca.
define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX803-LABEL: test_sgpr_offset_kernel:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_mov_b32 s4, 0x40000
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX803-NEXT:    ;;#ASMSTART
; GFX803-NEXT:    ;;#ASMEND
; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_sgpr_offset_kernel:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_mov_b32 s4, 0x40000
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_sgpr_offset_kernel:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s4, 0x20000
; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX1010-NEXT:    ;;#ASMSTART
; GFX1010-NEXT:    ;;#ASMEND
; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_sgpr_offset_kernel:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    scratch_load_b32 v0, off, off offset:8 glc dlc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    s_movk_i32 s0, 0x1000
; GFX1100-NEXT:    scratch_store_b32 off, v0, s0 ; 4-byte Folded Spill
; GFX1100-NEXT:    ;;#ASMSTART
; GFX1100-NEXT:    ;;#ASMEND
; GFX1100-NEXT:    scratch_load_b32 v0, off, s0 ; 4-byte Folded Reload
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    scratch_store_b32 off, v0, off offset:8 dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_endpgm
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)

  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; The autogenerated assertions above cover the SGPR that receives this
  ; offset and the "4-byte Folded Spill" store that uses it.
  %a = load volatile i32, ptr addrspace(5) %aptr

  ; Force %a to spill
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
  store volatile i32 %a, ptr addrspace(5) %outptr

  ret void
}

declare hidden void @ex() local_unnamed_addr #0

attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
attributes #2 = { nounwind "frame-pointer"="all" }

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}