1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefix=MUBUF %s 3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=FLATSCR %s 4 5; Test that the VGPR spiller correctly switches to SGPR offsets when the 6; instruction offset field would overflow, and that it accounts for memory 7; swizzling. 8 9define amdgpu_kernel void @test_inst_offset_kernel() { 10; MUBUF-LABEL: test_inst_offset_kernel: 11; MUBUF: ; %bb.0: ; %entry 12; MUBUF-NEXT: s_add_u32 s0, s0, s17 13; MUBUF-NEXT: s_addc_u32 s1, s1, 0 14; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc 15; MUBUF-NEXT: s_waitcnt vmcnt(0) 16; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill 17; MUBUF-NEXT: ;;#ASMSTART 18; MUBUF-NEXT: ;;#ASMEND 19; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload 20; MUBUF-NEXT: s_waitcnt vmcnt(0) 21; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 22; MUBUF-NEXT: s_waitcnt vmcnt(0) 23; MUBUF-NEXT: s_endpgm 24; 25; FLATSCR-LABEL: test_inst_offset_kernel: 26; FLATSCR: ; %bb.0: ; %entry 27; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 28; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 29; FLATSCR-NEXT: s_mov_b32 s0, 0 30; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4 glc 31; FLATSCR-NEXT: s_waitcnt vmcnt(0) 32; FLATSCR-NEXT: s_movk_i32 s0, 0xff8 33; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill 34; FLATSCR-NEXT: ;;#ASMSTART 35; FLATSCR-NEXT: ;;#ASMEND 36; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload 37; FLATSCR-NEXT: s_mov_b32 s0, 0 38; FLATSCR-NEXT: s_waitcnt vmcnt(0) 39; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:4 40; FLATSCR-NEXT: s_waitcnt vmcnt(0) 41; 
FLATSCR-NEXT: s_endpgm 42entry: 43 ; Occupy 4088 bytes of scratch, so the offset of the spill of %a (4088) still fits in 44 ; the instruction offset field. 45 %alloca = alloca i8, i32 4088, align 4, addrspace(5) 46 47 %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 48 49 50 %a = load volatile i32, ptr addrspace(5) %aptr 51 52 ; Force %a to spill. 53 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 54 55 %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 56 store volatile i32 %a, ptr addrspace(5) %outptr 57 58 ret void 59} 60 61define amdgpu_kernel void @test_sgpr_offset_kernel() { 62; MUBUF-LABEL: test_sgpr_offset_kernel: 63; MUBUF: ; %bb.0: ; %entry 64; MUBUF-NEXT: s_add_u32 s0, s0, s17 65; MUBUF-NEXT: s_addc_u32 s1, s1, 0 66; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc 67; MUBUF-NEXT: s_waitcnt vmcnt(0) 68; MUBUF-NEXT: s_mov_b32 s4, 0x40000 69; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill 70; MUBUF-NEXT: ;;#ASMSTART 71; MUBUF-NEXT: ;;#ASMEND 72; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload 73; MUBUF-NEXT: s_waitcnt vmcnt(0) 74; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 75; MUBUF-NEXT: s_waitcnt vmcnt(0) 76; MUBUF-NEXT: s_endpgm 77; 78; FLATSCR-LABEL: test_sgpr_offset_kernel: 79; FLATSCR: ; %bb.0: ; %entry 80; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 81; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 82; FLATSCR-NEXT: s_mov_b32 s0, 0 83; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc 84; FLATSCR-NEXT: s_waitcnt vmcnt(0) 85; FLATSCR-NEXT: s_movk_i32 s0, 0x1000 86; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill 87; FLATSCR-NEXT: ;;#ASMSTART 88; FLATSCR-NEXT: ;;#ASMEND 89; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload 90; FLATSCR-NEXT: s_mov_b32 s0, 0 91; FLATSCR-NEXT: s_waitcnt vmcnt(0) 92; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:8 93; FLATSCR-NEXT:
s_waitcnt vmcnt(0) 94; FLATSCR-NEXT: s_endpgm 95entry: 96 ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not 97 ; fit in the instruction, and has to live in the SGPR offset. 98 %alloca = alloca i8, i32 4092, align 4, addrspace(5) 99 100 %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 101 ; 0x40000 / 64 = 4096 (for wave64) 102 %a = load volatile i32, ptr addrspace(5) %aptr 103 ; Force %a to spill 104 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 105 106 %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 107 store volatile i32 %a, ptr addrspace(5) %outptr 108 109 ret void 110} 111 112define void @test_sgpr_offset_function_scavenge_fail_func() #2 { 113; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_func: 114; MUBUF: ; %bb.0: ; %entry 115; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 116; MUBUF-NEXT: ;;#ASMSTART 117; MUBUF-NEXT: ;;#ASMEND 118; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc 119; MUBUF-NEXT: s_waitcnt vmcnt(0) 120; MUBUF-NEXT: s_add_i32 s10, s32, 0x40100 121; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill 122; MUBUF-NEXT: ;;#ASMSTART 123; MUBUF-NEXT: ;;#ASMEND 124; MUBUF-NEXT: ;;#ASMSTART 125; MUBUF-NEXT: ;;#ASMEND 126; MUBUF-NEXT: ;;#ASMSTART 127; MUBUF-NEXT: ;;#ASMEND 128; MUBUF-NEXT: s_add_i32 s10, s32, 0x40100 129; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload 130; MUBUF-NEXT: s_waitcnt vmcnt(0) 131; MUBUF-NEXT: ;;#ASMSTART 132; MUBUF-NEXT: ;;#ASMEND 133; MUBUF-NEXT: s_setpc_b64 s[30:31] 134; 135; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_func: 136; FLATSCR: ; %bb.0: ; %entry 137; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 138; FLATSCR-NEXT: ;;#ASMSTART 139; FLATSCR-NEXT: ;;#ASMEND 140; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:8 glc 141; FLATSCR-NEXT: s_waitcnt vmcnt(0) 142; FLATSCR-NEXT: s_add_i32 s8, s32, 0x1004 143; FLATSCR-NEXT: scratch_store_dword 
off, v0, s8 ; 4-byte Folded Spill 144; FLATSCR-NEXT: ;;#ASMSTART 145; FLATSCR-NEXT: ;;#ASMEND 146; FLATSCR-NEXT: ;;#ASMSTART 147; FLATSCR-NEXT: ;;#ASMEND 148; FLATSCR-NEXT: ;;#ASMSTART 149; FLATSCR-NEXT: ;;#ASMEND 150; FLATSCR-NEXT: s_add_i32 s8, s32, 0x1004 151; FLATSCR-NEXT: scratch_load_dword v0, off, s8 ; 4-byte Folded Reload 152; FLATSCR-NEXT: s_waitcnt vmcnt(0) 153; FLATSCR-NEXT: ;;#ASMSTART 154; FLATSCR-NEXT: ;;#ASMEND 155; FLATSCR-NEXT: s_setpc_b64 s[30:31] 156entry: 157 ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not 158 ; fit in the instruction, and has to live in the SGPR offset. 159 %alloca = alloca i8, i32 4096, align 4, addrspace(5) 160 161 %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 162 163 %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() 164 %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0 165 %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1 166 %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2 167 %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3 168 %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4 169 %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5 170 %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6 171 %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7 172 173 ; 0x40000 / 64 = 4096 (for wave64) 174 %a = load volatile i32, ptr addrspace(5) %aptr 175 call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a) 176 177 %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() 178 %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0 179 %asm1 = extractvalue { i32, i32, i32, i32, i32, 
i32, i32, i32 } %asm, 1 180 %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2 181 %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3 182 %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4 183 %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5 184 %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6 185 %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7 186 187 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 188 ; Force %a to spill with no free SGPRs 189 call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a) 190 ret void 191} 192 193define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { 194; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_kernel: 195; MUBUF: ; %bb.0: ; %entry 196; MUBUF-NEXT: s_add_u32 s0, s0, s17 197; MUBUF-NEXT: s_addc_u32 s1, s1, 0 198; MUBUF-NEXT: ;;#ASMSTART 199; MUBUF-NEXT: ;;#ASMEND 200; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc 201; MUBUF-NEXT: s_waitcnt vmcnt(0) 202; MUBUF-NEXT: s_mov_b32 s10, 0x40100 203; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill 204; MUBUF-NEXT: ;;#ASMSTART 205; MUBUF-NEXT: ;;#ASMEND 206; MUBUF-NEXT: ;;#ASMSTART 207; MUBUF-NEXT: ;;#ASMEND 208; MUBUF-NEXT: ;;#ASMSTART 209; MUBUF-NEXT: ;;#ASMEND 210; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload 211; MUBUF-NEXT: s_waitcnt vmcnt(0) 212; MUBUF-NEXT: ;;#ASMSTART 213; MUBUF-NEXT: ;;#ASMEND 214; MUBUF-NEXT: s_endpgm 215; 216; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_kernel: 217; FLATSCR: ; %bb.0: ; %entry 218; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 219; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 220; FLATSCR-NEXT: s_mov_b32 s8, 0 221; FLATSCR-NEXT: ;;#ASMSTART 222; FLATSCR-NEXT: ;;#ASMEND 223; 
FLATSCR-NEXT: scratch_load_dword v0, off, s8 offset:8 glc 224; FLATSCR-NEXT: s_waitcnt vmcnt(0) 225; FLATSCR-NEXT: s_movk_i32 s8, 0x1004 226; FLATSCR-NEXT: scratch_store_dword off, v0, s8 ; 4-byte Folded Spill 227; FLATSCR-NEXT: ;;#ASMSTART 228; FLATSCR-NEXT: ;;#ASMEND 229; FLATSCR-NEXT: ;;#ASMSTART 230; FLATSCR-NEXT: ;;#ASMEND 231; FLATSCR-NEXT: ;;#ASMSTART 232; FLATSCR-NEXT: ;;#ASMEND 233; FLATSCR-NEXT: scratch_load_dword v0, off, s8 ; 4-byte Folded Reload 234; FLATSCR-NEXT: s_waitcnt vmcnt(0) 235; FLATSCR-NEXT: ;;#ASMSTART 236; FLATSCR-NEXT: ;;#ASMEND 237; FLATSCR-NEXT: s_endpgm 238entry: 239 ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not 240 ; fit in the instruction, and has to live in the SGPR offset. 241 %alloca = alloca i8, i32 4096, align 4, addrspace(5) 242 243 %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 244 245 %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() 246 %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0 247 %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1 248 %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2 249 %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3 250 %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4 251 %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5 252 %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6 253 %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7 254 255 ; 0x40000 / 64 = 4096 (for wave64) 256 %a = load volatile i32, ptr addrspace(5) %aptr 257 call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a) 258 259 %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() 260 %asm0 = 
extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0 261 %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1 262 %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2 263 %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3 264 %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4 265 %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5 266 %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6 267 %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7 268 269 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 270 ; Force %a to spill with no free SGPRs 271 call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a) 272 ret void 273} 274 275define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { 276; MUBUF-LABEL: test_sgpr_offset_subregs_kernel: 277; MUBUF: ; %bb.0: ; %entry 278; MUBUF-NEXT: s_add_u32 s0, s0, s17 279; MUBUF-NEXT: s_addc_u32 s1, s1, 0 280; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc 281; MUBUF-NEXT: s_waitcnt vmcnt(0) 282; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12 glc 283; MUBUF-NEXT: s_waitcnt vmcnt(0) 284; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Spill 285; MUBUF-NEXT: s_nop 0 286; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill 287; MUBUF-NEXT: ;;#ASMSTART 288; MUBUF-NEXT: ;;#ASMEND 289; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc 290; MUBUF-NEXT: s_waitcnt vmcnt(0) 291; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Reload 292; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload 293; MUBUF-NEXT: s_waitcnt vmcnt(0) 294; MUBUF-NEXT: ;;#ASMSTART 295; MUBUF-NEXT: ; v[0:1] 296; MUBUF-NEXT: ;;#ASMEND 297; 
MUBUF-NEXT: s_endpgm 298; 299; FLATSCR-LABEL: test_sgpr_offset_subregs_kernel: 300; FLATSCR: ; %bb.0: ; %entry 301; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 302; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 303; FLATSCR-NEXT: s_mov_b32 s0, 0 304; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8 glc 305; FLATSCR-NEXT: s_waitcnt vmcnt(0) 306; FLATSCR-NEXT: s_movk_i32 s0, 0xff4 307; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill 308; FLATSCR-NEXT: s_mov_b32 s0, 0 309; FLATSCR-NEXT: ;;#ASMSTART 310; FLATSCR-NEXT: ;;#ASMEND 311; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4 glc 312; FLATSCR-NEXT: s_waitcnt vmcnt(0) 313; FLATSCR-NEXT: s_movk_i32 s0, 0xff4 314; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload 315; FLATSCR-NEXT: s_waitcnt vmcnt(0) 316; FLATSCR-NEXT: ;;#ASMSTART 317; FLATSCR-NEXT: ; v[0:1] 318; FLATSCR-NEXT: ;;#ASMEND 319; FLATSCR-NEXT: s_endpgm 320entry: 321 ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a 322 ; still fits below offset 4096 (4084 + 8 - 4 = 4088), and can be placed in 323 ; the instruction offset field. 324 %alloca = alloca i8, i32 4084, align 4, addrspace(5) 325 %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1 326 %a = load volatile <2 x i32>, ptr addrspace(5) %aptr 327 328 ; Force %a to spill. 329 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 330 331 ; Ensure the alloca sticks around. 332 %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 333 %b = load volatile i32, ptr addrspace(5) %bptr 334 335 ; Ensure the spill is of the full super-reg.
336 call void asm sideeffect "; $0", "r"(<2 x i32> %a) 337 338 ret void 339} 340 341define amdgpu_kernel void @test_inst_offset_subregs_kernel() { 342; MUBUF-LABEL: test_inst_offset_subregs_kernel: 343; MUBUF: ; %bb.0: ; %entry 344; MUBUF-NEXT: s_add_u32 s0, s0, s17 345; MUBUF-NEXT: s_addc_u32 s1, s1, 0 346; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc 347; MUBUF-NEXT: s_waitcnt vmcnt(0) 348; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 glc 349; MUBUF-NEXT: s_waitcnt vmcnt(0) 350; MUBUF-NEXT: s_mov_b32 s4, 0x3ff00 351; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill 352; MUBUF-NEXT: s_nop 0 353; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill 354; MUBUF-NEXT: ;;#ASMSTART 355; MUBUF-NEXT: ;;#ASMEND 356; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc 357; MUBUF-NEXT: s_waitcnt vmcnt(0) 358; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload 359; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload 360; MUBUF-NEXT: s_waitcnt vmcnt(0) 361; MUBUF-NEXT: ;;#ASMSTART 362; MUBUF-NEXT: ; v[0:1] 363; MUBUF-NEXT: ;;#ASMEND 364; MUBUF-NEXT: s_endpgm 365; 366; FLATSCR-LABEL: test_inst_offset_subregs_kernel: 367; FLATSCR: ; %bb.0: ; %entry 368; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 369; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 370; FLATSCR-NEXT: s_mov_b32 s0, 0 371; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc 372; FLATSCR-NEXT: s_waitcnt vmcnt(0) 373; FLATSCR-NEXT: s_movk_i32 s0, 0xffc 374; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill 375; FLATSCR-NEXT: s_mov_b32 s0, 0 376; FLATSCR-NEXT: ;;#ASMSTART 377; FLATSCR-NEXT: ;;#ASMEND 378; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc 379; FLATSCR-NEXT: s_waitcnt vmcnt(0) 380; FLATSCR-NEXT: s_movk_i32 s0, 0xffc 381; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload 382; 
FLATSCR-NEXT: s_waitcnt vmcnt(0) 383; FLATSCR-NEXT: ;;#ASMSTART 384; FLATSCR-NEXT: ; v[0:1] 385; FLATSCR-NEXT: ;;#ASMEND 386; FLATSCR-NEXT: s_endpgm 387entry: 388 ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a 389 ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live 390 ; in the SGPR offset. 391 %alloca = alloca i8, i32 4088, align 4, addrspace(5) 392 393 ; 0x3ff00 / 64 = 4092 (for wave64) 394 %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1 395 %a = load volatile <2 x i32>, ptr addrspace(5) %aptr 396 397 ; Force %a to spill. 398 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 399 400 ; Ensure the alloca sticks around. 401 %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 402 %b = load volatile i32, ptr addrspace(5) %bptr 403 404 ; Ensure the spill is of the full super-reg. 405 call void asm sideeffect "; $0", "r"(<2 x i32> %a) 406 407 ret void 408} 409 410define void @test_inst_offset_function() { 411; MUBUF-LABEL: test_inst_offset_function: 412; MUBUF: ; %bb.0: ; %entry 413; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 414; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc 415; MUBUF-NEXT: s_waitcnt vmcnt(0) 416; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill 417; MUBUF-NEXT: ;;#ASMSTART 418; MUBUF-NEXT: ;;#ASMEND 419; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload 420; MUBUF-NEXT: s_waitcnt vmcnt(0) 421; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 422; MUBUF-NEXT: s_waitcnt vmcnt(0) 423; MUBUF-NEXT: s_setpc_b64 s[30:31] 424; 425; FLATSCR-LABEL: test_inst_offset_function: 426; FLATSCR: ; %bb.0: ; %entry 427; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 428; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:4 glc 429; FLATSCR-NEXT: s_waitcnt vmcnt(0) 430; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4088 ; 4-byte Folded Spill 
431; FLATSCR-NEXT: ;;#ASMSTART 432; FLATSCR-NEXT: ;;#ASMEND 433; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:4088 ; 4-byte Folded Reload 434; FLATSCR-NEXT: s_waitcnt vmcnt(0) 435; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4 436; FLATSCR-NEXT: s_waitcnt vmcnt(0) 437; FLATSCR-NEXT: s_setpc_b64 s[30:31] 438entry: 439 ; Occupy enough bytes of scratch, so the offset of the spill of %a 440 ; just fits in the instruction offset field when the emergency stack 441 ; slot is added. It's hard to hit the actual limit since we're also 442 ; going to insert the emergency stack slot for large frames. 443 %alloca = alloca i8, i32 4088, align 4, addrspace(5) 444 445 %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 446 447 448 %a = load volatile i32, ptr addrspace(5) %aptr 449 450 ; Force %a to spill. 451 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 452 453 %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 454 store volatile i32 %a, ptr addrspace(5) %outptr 455 456 ret void 457} 458 459define void @test_sgpr_offset_function() { 460; MUBUF-LABEL: test_sgpr_offset_function: 461; MUBUF: ; %bb.0: ; %entry 462; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 463; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc 464; MUBUF-NEXT: s_waitcnt vmcnt(0) 465; MUBUF-NEXT: s_add_i32 s4, s32, 0x40100 466; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill 467; MUBUF-NEXT: ;;#ASMSTART 468; MUBUF-NEXT: ;;#ASMEND 469; MUBUF-NEXT: s_add_i32 s4, s32, 0x40100 470; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload 471; MUBUF-NEXT: s_waitcnt vmcnt(0) 472; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 473; MUBUF-NEXT: s_waitcnt vmcnt(0) 474; MUBUF-NEXT: s_setpc_b64 s[30:31] 475; 476; FLATSCR-LABEL: test_sgpr_offset_function: 477; FLATSCR: ; %bb.0: ; %entry 478; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 479; FLATSCR-NEXT: scratch_load_dword 
v0, off, s32 offset:8 glc 480; FLATSCR-NEXT: s_waitcnt vmcnt(0) 481; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1004 482; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill 483; FLATSCR-NEXT: ;;#ASMSTART 484; FLATSCR-NEXT: ;;#ASMEND 485; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1004 486; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload 487; FLATSCR-NEXT: s_waitcnt vmcnt(0) 488; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:8 489; FLATSCR-NEXT: s_waitcnt vmcnt(0) 490; FLATSCR-NEXT: s_setpc_b64 s[30:31] 491entry: 492 ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not 493 ; fit in the instruction, and has to live in the SGPR offset. 494 %alloca = alloca i8, i32 4096, align 4, addrspace(5) 495 496 %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 497 ; 0x40000 / 64 = 4096 (for wave64) 498 %a = load volatile i32, ptr addrspace(5) %aptr 499 500 ; Force %a to spill 501 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 502 503 %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 504 store volatile i32 %a, ptr addrspace(5) %outptr 505 506 ret void 507} 508 509define void @test_sgpr_offset_subregs_function() { 510; MUBUF-LABEL: test_sgpr_offset_subregs_function: 511; MUBUF: ; %bb.0: ; %entry 512; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 513; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc 514; MUBUF-NEXT: s_waitcnt vmcnt(0) 515; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 glc 516; MUBUF-NEXT: s_waitcnt vmcnt(0) 517; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Spill 518; MUBUF-NEXT: s_nop 0 519; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill 520; MUBUF-NEXT: ;;#ASMSTART 521; MUBUF-NEXT: ;;#ASMEND 522; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc 523; MUBUF-NEXT: s_waitcnt vmcnt(0) 524; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:4084 ; 4-byte Folded Reload 525; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload 526; MUBUF-NEXT: s_waitcnt vmcnt(0) 527; MUBUF-NEXT: ;;#ASMSTART 528; MUBUF-NEXT: ; v[0:1] 529; MUBUF-NEXT: ;;#ASMEND 530; MUBUF-NEXT: s_setpc_b64 s[30:31] 531; 532; FLATSCR-LABEL: test_sgpr_offset_subregs_function: 533; FLATSCR: ; %bb.0: ; %entry 534; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 535; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:8 glc 536; FLATSCR-NEXT: s_waitcnt vmcnt(0) 537; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:4084 ; 8-byte Folded Spill 538; FLATSCR-NEXT: ;;#ASMSTART 539; FLATSCR-NEXT: ;;#ASMEND 540; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:4 glc 541; FLATSCR-NEXT: s_waitcnt vmcnt(0) 542; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:4084 ; 8-byte Folded Reload 543; FLATSCR-NEXT: s_waitcnt vmcnt(0) 544; FLATSCR-NEXT: ;;#ASMSTART 545; FLATSCR-NEXT: ; v[0:1] 546; FLATSCR-NEXT: ;;#ASMEND 547; FLATSCR-NEXT: s_setpc_b64 s[30:31] 548entry: 549 ; We want to test the spill of the last subreg of %a is the highest 550 ; valid value for the immediate offset. We enable the emergency 551 ; stack slot for large frames, so it's hard to get the frame layout 552 ; exactly as we want to test it. 553 ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a 554 ; still fits below offset 4096 (4084 + 8 - 4 = 4088), and can be placed in 555 ; the instruction offset field. 556 %alloca = alloca i8, i32 4084, align 4, addrspace(5) 557 %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1 558 %a = load volatile <2 x i32>, ptr addrspace(5) %aptr 559 560 ; Force %a to spill. 561 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 562 563 ; Ensure the alloca sticks around.
564 %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 565 %b = load volatile i32, ptr addrspace(5) %bptr 566 567 ; Ensure the spill is of the full super-reg. 568 call void asm sideeffect "; $0", "r"(<2 x i32> %a) 569 570 ret void 571} 572 573define void @test_inst_offset_subregs_function() { 574; MUBUF-LABEL: test_inst_offset_subregs_function: 575; MUBUF: ; %bb.0: ; %entry 576; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 577; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 glc 578; MUBUF-NEXT: s_waitcnt vmcnt(0) 579; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 glc 580; MUBUF-NEXT: s_waitcnt vmcnt(0) 581; MUBUF-NEXT: s_add_i32 s4, s32, 0x3ff00 582; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill 583; MUBUF-NEXT: s_nop 0 584; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill 585; MUBUF-NEXT: ;;#ASMSTART 586; MUBUF-NEXT: ;;#ASMEND 587; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc 588; MUBUF-NEXT: s_waitcnt vmcnt(0) 589; MUBUF-NEXT: s_add_i32 s4, s32, 0x3ff00 590; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload 591; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload 592; MUBUF-NEXT: s_waitcnt vmcnt(0) 593; MUBUF-NEXT: ;;#ASMSTART 594; MUBUF-NEXT: ; v[0:1] 595; MUBUF-NEXT: ;;#ASMEND 596; MUBUF-NEXT: s_setpc_b64 s[30:31] 597; 598; FLATSCR-LABEL: test_inst_offset_subregs_function: 599; FLATSCR: ; %bb.0: ; %entry 600; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 601; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:12 glc 602; FLATSCR-NEXT: s_waitcnt vmcnt(0) 603; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:4092 ; 8-byte Folded Spill 604; FLATSCR-NEXT: ;;#ASMSTART 605; FLATSCR-NEXT: ;;#ASMEND 606; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:8 glc 607; FLATSCR-NEXT: s_waitcnt vmcnt(0) 608; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:4092 ; 
8-byte Folded Reload 609; FLATSCR-NEXT: s_waitcnt vmcnt(0) 610; FLATSCR-NEXT: ;;#ASMSTART 611; FLATSCR-NEXT: ; v[0:1] 612; FLATSCR-NEXT: ;;#ASMEND 613; FLATSCR-NEXT: s_setpc_b64 s[30:31] 614entry: 615 ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a 616 ; does not fit below offset 4096 (4088 + 4 + 8 - 4 = 4096), and has to live 617 ; in the SGPR offset. 618 %alloca = alloca i8, i32 4088, align 4, addrspace(5) 619 620 ; 0x3ff00 / 64 = 4092 (for wave64) 621 %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1 622 %a = load volatile <2 x i32>, ptr addrspace(5) %aptr 623 624 ; Force %a to spill. 625 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 626 627 ; Ensure the alloca sticks around. 628 %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1 629 %b = load volatile i32, ptr addrspace(5) %bptr 630 631 ; Ensure the spill is of the full super-reg. 632 call void asm sideeffect "; $0", "r"(<2 x i32> %a) 633 634 ret void 635} 636 637attributes #0 = { nounwind } 638attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" } 639attributes #2 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } 640attributes #3 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" } 641