xref: /llvm-project/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefix=MUBUF %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=FLATSCR %s
4
5; Test that the VGPR spiller correctly switches to SGPR offsets when the
6; instruction offset field would overflow, and that it accounts for memory
7; swizzling.
8
; Kernel case: a single dword (%a) is spilled right behind a 4088-byte alloca.
9define amdgpu_kernel void @test_inst_offset_kernel() {
10; MUBUF-LABEL: test_inst_offset_kernel:
11; MUBUF:       ; %bb.0: ; %entry
12; MUBUF-NEXT:    s_add_u32 s0, s0, s17
13; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
14; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 glc
15; MUBUF-NEXT:    s_waitcnt vmcnt(0)
16; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
17; MUBUF-NEXT:    ;;#ASMSTART
18; MUBUF-NEXT:    ;;#ASMEND
19; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
20; MUBUF-NEXT:    s_waitcnt vmcnt(0)
21; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
22; MUBUF-NEXT:    s_waitcnt vmcnt(0)
23; MUBUF-NEXT:    s_endpgm
24;
25; FLATSCR-LABEL: test_inst_offset_kernel:
26; FLATSCR:       ; %bb.0: ; %entry
27; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
28; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
29; FLATSCR-NEXT:    s_mov_b32 s0, 0
30; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
31; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
32; FLATSCR-NEXT:    s_movk_i32 s0, 0xff8
33; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
34; FLATSCR-NEXT:    ;;#ASMSTART
35; FLATSCR-NEXT:    ;;#ASMEND
36; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
37; FLATSCR-NEXT:    s_mov_b32 s0, 0
38; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
39; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 offset:4
40; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
41; FLATSCR-NEXT:    s_endpgm
42entry:
43  ; Occupy 4088 bytes of scratch, so the offset of the spill of %a (4088, see
44  ; the MUBUF checks above) still fits in the instruction offset field.
  ; The FLATSCR checks carry the same offset (0xff8 = 4088) in an SGPR instead.
45  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
46
47  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
48
49
50  %a = load volatile i32, ptr addrspace(5) %aptr
51
52  ; Force %a to spill.
53  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
54
55  %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
56  store volatile i32 %a, ptr addrspace(5) %outptr
57
58  ret void
59}
60
; Kernel case: the spill of %a lands at offset 4096 and is addressed via an SGPR.
61define amdgpu_kernel void @test_sgpr_offset_kernel() {
62; MUBUF-LABEL: test_sgpr_offset_kernel:
63; MUBUF:       ; %bb.0: ; %entry
64; MUBUF-NEXT:    s_add_u32 s0, s0, s17
65; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
66; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
67; MUBUF-NEXT:    s_waitcnt vmcnt(0)
68; MUBUF-NEXT:    s_mov_b32 s4, 0x40000
69; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
70; MUBUF-NEXT:    ;;#ASMSTART
71; MUBUF-NEXT:    ;;#ASMEND
72; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
73; MUBUF-NEXT:    s_waitcnt vmcnt(0)
74; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
75; MUBUF-NEXT:    s_waitcnt vmcnt(0)
76; MUBUF-NEXT:    s_endpgm
77;
78; FLATSCR-LABEL: test_sgpr_offset_kernel:
79; FLATSCR:       ; %bb.0: ; %entry
80; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
81; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
82; FLATSCR-NEXT:    s_mov_b32 s0, 0
83; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:8 glc
84; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
85; FLATSCR-NEXT:    s_movk_i32 s0, 0x1000
86; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
87; FLATSCR-NEXT:    ;;#ASMSTART
88; FLATSCR-NEXT:    ;;#ASMEND
89; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
90; FLATSCR-NEXT:    s_mov_b32 s0, 0
91; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
92; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 offset:8
93; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
94; FLATSCR-NEXT:    s_endpgm
95entry:
96  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
97  ; fit in the instruction, and has to live in the SGPR offset.
  ; (The alloca itself is 4092 bytes; the checks access its element 1 at
  ; offset 8, so it starts at offset 4 and ends at offset 4096.)
98  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
99
100  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
101  ; 0x40000 / 64 = 4096 (for wave64); FLATSCR uses the byte offset 0x1000 directly.
102  %a = load volatile i32, ptr addrspace(5) %aptr
103  ; Force %a to spill
104  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
105
106  %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
107  store volatile i32 %a, ptr addrspace(5) %outptr
108
109  ret void
110}
111
; Function case with a restricted SGPR budget (attribute group #2): eight SGPR
; values are live around the spill of %a, which needs an SGPR for its offset.
112define void @test_sgpr_offset_function_scavenge_fail_func() #2 {
113; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_func:
114; MUBUF:       ; %bb.0: ; %entry
115; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116; MUBUF-NEXT:    ;;#ASMSTART
117; MUBUF-NEXT:    ;;#ASMEND
118; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
119; MUBUF-NEXT:    s_waitcnt vmcnt(0)
120; MUBUF-NEXT:    s_add_i32 s10, s32, 0x40100
121; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill
122; MUBUF-NEXT:    ;;#ASMSTART
123; MUBUF-NEXT:    ;;#ASMEND
124; MUBUF-NEXT:    ;;#ASMSTART
125; MUBUF-NEXT:    ;;#ASMEND
126; MUBUF-NEXT:    ;;#ASMSTART
127; MUBUF-NEXT:    ;;#ASMEND
128; MUBUF-NEXT:    s_add_i32 s10, s32, 0x40100
129; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload
130; MUBUF-NEXT:    s_waitcnt vmcnt(0)
131; MUBUF-NEXT:    ;;#ASMSTART
132; MUBUF-NEXT:    ;;#ASMEND
133; MUBUF-NEXT:    s_setpc_b64 s[30:31]
134;
135; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_func:
136; FLATSCR:       ; %bb.0: ; %entry
137; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
138; FLATSCR-NEXT:    ;;#ASMSTART
139; FLATSCR-NEXT:    ;;#ASMEND
140; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:8 glc
141; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
142; FLATSCR-NEXT:    s_add_i32 s8, s32, 0x1004
143; FLATSCR-NEXT:    scratch_store_dword off, v0, s8 ; 4-byte Folded Spill
144; FLATSCR-NEXT:    ;;#ASMSTART
145; FLATSCR-NEXT:    ;;#ASMEND
146; FLATSCR-NEXT:    ;;#ASMSTART
147; FLATSCR-NEXT:    ;;#ASMEND
148; FLATSCR-NEXT:    ;;#ASMSTART
149; FLATSCR-NEXT:    ;;#ASMEND
150; FLATSCR-NEXT:    s_add_i32 s8, s32, 0x1004
151; FLATSCR-NEXT:    scratch_load_dword v0, off, s8 ; 4-byte Folded Reload
152; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
153; FLATSCR-NEXT:    ;;#ASMSTART
154; FLATSCR-NEXT:    ;;#ASMEND
155; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
156entry:
157  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
158  ; fit in the instruction, and has to live in the SGPR offset.
159  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
160
161  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
162
  ; First batch of eight SGPR values, produced by an empty asm and consumed
  ; together with %a by the asm that follows the load.
163  %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
164  %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
165  %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
166  %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
167  %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
168  %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
169  %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
170  %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
171  %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
172
173  ; 0x40100 / 64 = 4100 = 0x1004 (for wave64); see the spill offsets in the checks.
174  %a = load volatile i32, ptr addrspace(5) %aptr
175  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
176
  ; Second batch of eight SGPR values; they stay live across the VGPR clobber below.
177  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
178  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
179  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
180  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
181  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
182  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
183  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
184  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
185  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
186
187  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
188   ; Force %a to spill with no free SGPRs
189  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
190  ret void
191}
192
; Kernel variant of the restricted-SGPR test (attribute group #3): eight SGPR
; values are live around the spill of %a, which needs an SGPR for its offset.
193define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 {
194; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_kernel:
195; MUBUF:       ; %bb.0: ; %entry
196; MUBUF-NEXT:    s_add_u32 s0, s0, s17
197; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
198; MUBUF-NEXT:    ;;#ASMSTART
199; MUBUF-NEXT:    ;;#ASMEND
200; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
201; MUBUF-NEXT:    s_waitcnt vmcnt(0)
202; MUBUF-NEXT:    s_mov_b32 s10, 0x40100
203; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill
204; MUBUF-NEXT:    ;;#ASMSTART
205; MUBUF-NEXT:    ;;#ASMEND
206; MUBUF-NEXT:    ;;#ASMSTART
207; MUBUF-NEXT:    ;;#ASMEND
208; MUBUF-NEXT:    ;;#ASMSTART
209; MUBUF-NEXT:    ;;#ASMEND
210; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload
211; MUBUF-NEXT:    s_waitcnt vmcnt(0)
212; MUBUF-NEXT:    ;;#ASMSTART
213; MUBUF-NEXT:    ;;#ASMEND
214; MUBUF-NEXT:    s_endpgm
215;
216; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_kernel:
217; FLATSCR:       ; %bb.0: ; %entry
218; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
219; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
220; FLATSCR-NEXT:    s_mov_b32 s8, 0
221; FLATSCR-NEXT:    ;;#ASMSTART
222; FLATSCR-NEXT:    ;;#ASMEND
223; FLATSCR-NEXT:    scratch_load_dword v0, off, s8 offset:8 glc
224; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
225; FLATSCR-NEXT:    s_movk_i32 s8, 0x1004
226; FLATSCR-NEXT:    scratch_store_dword off, v0, s8 ; 4-byte Folded Spill
227; FLATSCR-NEXT:    ;;#ASMSTART
228; FLATSCR-NEXT:    ;;#ASMEND
229; FLATSCR-NEXT:    ;;#ASMSTART
230; FLATSCR-NEXT:    ;;#ASMEND
231; FLATSCR-NEXT:    ;;#ASMSTART
232; FLATSCR-NEXT:    ;;#ASMEND
233; FLATSCR-NEXT:    scratch_load_dword v0, off, s8 ; 4-byte Folded Reload
234; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
235; FLATSCR-NEXT:    ;;#ASMSTART
236; FLATSCR-NEXT:    ;;#ASMEND
237; FLATSCR-NEXT:    s_endpgm
238entry:
239  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
240  ; fit in the instruction, and has to live in the SGPR offset.
241  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
242
243  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
244
  ; First batch of eight SGPR values, produced by an empty asm and consumed
  ; together with %a by the asm that follows the load.
245  %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
246  %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
247  %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
248  %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
249  %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
250  %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
251  %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
252  %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
253  %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
254
255  ; 0x40100 / 64 = 4100 = 0x1004 (for wave64); see the spill offsets in the checks.
256  %a = load volatile i32, ptr addrspace(5) %aptr
257  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
258
  ; Second batch of eight SGPR values; they stay live across the VGPR clobber below.
259  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
260  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
261  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
262  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
263  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
264  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
265  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
266  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
267  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
268
269  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
270   ; Force %a to spill with no free SGPRs
271  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
272  ret void
273}
274
; Kernel case: a two-dword value (%a) is spilled with both halves still
; reachable through immediate offsets in the MUBUF checks (4084 and 4088).
275define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
276; MUBUF-LABEL: test_sgpr_offset_subregs_kernel:
277; MUBUF:       ; %bb.0: ; %entry
278; MUBUF-NEXT:    s_add_u32 s0, s0, s17
279; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
280; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
281; MUBUF-NEXT:    s_waitcnt vmcnt(0)
282; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:12 glc
283; MUBUF-NEXT:    s_waitcnt vmcnt(0)
284; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Spill
285; MUBUF-NEXT:    s_nop 0
286; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
287; MUBUF-NEXT:    ;;#ASMSTART
288; MUBUF-NEXT:    ;;#ASMEND
289; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 glc
290; MUBUF-NEXT:    s_waitcnt vmcnt(0)
291; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Reload
292; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
293; MUBUF-NEXT:    s_waitcnt vmcnt(0)
294; MUBUF-NEXT:    ;;#ASMSTART
295; MUBUF-NEXT:    ; v[0:1]
296; MUBUF-NEXT:    ;;#ASMEND
297; MUBUF-NEXT:    s_endpgm
298;
299; FLATSCR-LABEL: test_sgpr_offset_subregs_kernel:
300; FLATSCR:       ; %bb.0: ; %entry
301; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
302; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
303; FLATSCR-NEXT:    s_mov_b32 s0, 0
304; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:8 glc
305; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
306; FLATSCR-NEXT:    s_movk_i32 s0, 0xff4
307; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill
308; FLATSCR-NEXT:    s_mov_b32 s0, 0
309; FLATSCR-NEXT:    ;;#ASMSTART
310; FLATSCR-NEXT:    ;;#ASMEND
311; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
312; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
313; FLATSCR-NEXT:    s_movk_i32 s0, 0xff4
314; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
315; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
316; FLATSCR-NEXT:    ;;#ASMSTART
317; FLATSCR-NEXT:    ; v[0:1]
318; FLATSCR-NEXT:    ;;#ASMEND
319; FLATSCR-NEXT:    s_endpgm
320entry:
321  ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a
322  ; still fits below offset 4096 (4084 + 8 - 4 = 4088), and can be placed in
323  ; the instruction offset field.
324  %alloca = alloca i8, i32 4084, align 4, addrspace(5)
325  %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
326  %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
327
328  ; Force %a to spill.
329  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
330
331  ; Ensure the alloca sticks around.
332  %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
333  %b = load volatile i32, ptr addrspace(5) %bptr
334
335  ; Ensure the spill is of the full super-reg.
336  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
337
338  ret void
339}
340
; Kernel case: a two-dword value (%a) is spilled where its last dword would sit
; at offset 4096, so the MUBUF checks address the spill through an SGPR (s4).
341define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
342; MUBUF-LABEL: test_inst_offset_subregs_kernel:
343; MUBUF:       ; %bb.0: ; %entry
344; MUBUF-NEXT:    s_add_u32 s0, s0, s17
345; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
346; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:12 glc
347; MUBUF-NEXT:    s_waitcnt vmcnt(0)
348; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16 glc
349; MUBUF-NEXT:    s_waitcnt vmcnt(0)
350; MUBUF-NEXT:    s_mov_b32 s4, 0x3ff00
351; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
352; MUBUF-NEXT:    s_nop 0
353; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill
354; MUBUF-NEXT:    ;;#ASMSTART
355; MUBUF-NEXT:    ;;#ASMEND
356; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
357; MUBUF-NEXT:    s_waitcnt vmcnt(0)
358; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
359; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
360; MUBUF-NEXT:    s_waitcnt vmcnt(0)
361; MUBUF-NEXT:    ;;#ASMSTART
362; MUBUF-NEXT:    ; v[0:1]
363; MUBUF-NEXT:    ;;#ASMEND
364; MUBUF-NEXT:    s_endpgm
365;
366; FLATSCR-LABEL: test_inst_offset_subregs_kernel:
367; FLATSCR:       ; %bb.0: ; %entry
368; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
369; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
370; FLATSCR-NEXT:    s_mov_b32 s0, 0
371; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc
372; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
373; FLATSCR-NEXT:    s_movk_i32 s0, 0xffc
374; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill
375; FLATSCR-NEXT:    s_mov_b32 s0, 0
376; FLATSCR-NEXT:    ;;#ASMSTART
377; FLATSCR-NEXT:    ;;#ASMEND
378; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:8 glc
379; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
380; FLATSCR-NEXT:    s_movk_i32 s0, 0xffc
381; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
382; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
383; FLATSCR-NEXT:    ;;#ASMSTART
384; FLATSCR-NEXT:    ; v[0:1]
385; FLATSCR-NEXT:    ;;#ASMEND
386; FLATSCR-NEXT:    s_endpgm
387entry:
388  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
389  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
390  ; in the SGPR offset.
  ; (The alloca itself is 4088 bytes; the checks access its element 1 at
  ; offset 12, so it starts at offset 4 and ends at offset 4092.)
391  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
392
393  ; 0x3ff00 / 64 = 4092 (for wave64)
394  %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
395  %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
396
397  ; Force %a to spill.
398  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
399
400  ; Ensure the alloca sticks around.
401  %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
402  %b = load volatile i32, ptr addrspace(5) %bptr
403
404  ; Ensure the spill is of the full super-reg.
405  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
406
407  ret void
408}
409
; Function case (scratch addressed relative to s32): one spilled dword (%a)
; that both check prefixes reach with an immediate offset of 4088.
410define void @test_inst_offset_function() {
411; MUBUF-LABEL: test_inst_offset_function:
412; MUBUF:       ; %bb.0: ; %entry
413; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
414; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
415; MUBUF-NEXT:    s_waitcnt vmcnt(0)
416; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill
417; MUBUF-NEXT:    ;;#ASMSTART
418; MUBUF-NEXT:    ;;#ASMEND
419; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
420; MUBUF-NEXT:    s_waitcnt vmcnt(0)
421; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
422; MUBUF-NEXT:    s_waitcnt vmcnt(0)
423; MUBUF-NEXT:    s_setpc_b64 s[30:31]
424;
425; FLATSCR-LABEL: test_inst_offset_function:
426; FLATSCR:       ; %bb.0: ; %entry
427; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
428; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
429; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
430; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:4088 ; 4-byte Folded Spill
431; FLATSCR-NEXT:    ;;#ASMSTART
432; FLATSCR-NEXT:    ;;#ASMEND
433; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4088 ; 4-byte Folded Reload
434; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
435; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:4
436; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
437; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
438entry:
439  ; Occupy enough bytes of scratch, so the offset of the spill of %a
440  ; just fits in the instruction offset field when the emergency stack
441  ; slot is added. It's hard to hit the actual limit since we're also
442  ; going to insert the emergency stack slot for large frames.
443  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
444
445  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
446
447
448  %a = load volatile i32, ptr addrspace(5) %aptr
449
450  ; Force %a to spill.
451  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
452
453  %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
454  store volatile i32 %a, ptr addrspace(5) %outptr
455
456  ret void
457}
458
; Function case: the spill of %a lies past the immediate range, so both check
; prefixes compute its address into an SGPR from s32 before each access.
459define void @test_sgpr_offset_function() {
460; MUBUF-LABEL: test_sgpr_offset_function:
461; MUBUF:       ; %bb.0: ; %entry
462; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
463; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
464; MUBUF-NEXT:    s_waitcnt vmcnt(0)
465; MUBUF-NEXT:    s_add_i32 s4, s32, 0x40100
466; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
467; MUBUF-NEXT:    ;;#ASMSTART
468; MUBUF-NEXT:    ;;#ASMEND
469; MUBUF-NEXT:    s_add_i32 s4, s32, 0x40100
470; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
471; MUBUF-NEXT:    s_waitcnt vmcnt(0)
472; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
473; MUBUF-NEXT:    s_waitcnt vmcnt(0)
474; MUBUF-NEXT:    s_setpc_b64 s[30:31]
475;
476; FLATSCR-LABEL: test_sgpr_offset_function:
477; FLATSCR:       ; %bb.0: ; %entry
478; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
479; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:8 glc
480; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
481; FLATSCR-NEXT:    s_add_i32 s0, s32, 0x1004
482; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
483; FLATSCR-NEXT:    ;;#ASMSTART
484; FLATSCR-NEXT:    ;;#ASMEND
485; FLATSCR-NEXT:    s_add_i32 s0, s32, 0x1004
486; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
487; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
488; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:8
489; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
490; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
491entry:
492  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
493  ; fit in the instruction, and has to live in the SGPR offset.
494  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
495
496  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
497  ; 0x40100 / 64 = 4100 = 0x1004 (for wave64); see the spill offsets in the checks.
498  %a = load volatile i32, ptr addrspace(5) %aptr
499
500  ; Force %a to spill
501  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
502
503  %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
504  store volatile i32 %a, ptr addrspace(5) %outptr
505
506  ret void
507}
508
; Function case: a two-dword value (%a) is spilled with both halves reachable
; through immediate offsets from s32 (4084 and 4088 in the MUBUF checks).
509define void @test_sgpr_offset_subregs_function() {
510; MUBUF-LABEL: test_sgpr_offset_subregs_function:
511; MUBUF:       ; %bb.0: ; %entry
512; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
513; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
514; MUBUF-NEXT:    s_waitcnt vmcnt(0)
515; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 glc
516; MUBUF-NEXT:    s_waitcnt vmcnt(0)
517; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Spill
518; MUBUF-NEXT:    s_nop 0
519; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill
520; MUBUF-NEXT:    ;;#ASMSTART
521; MUBUF-NEXT:    ;;#ASMEND
522; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
523; MUBUF-NEXT:    s_waitcnt vmcnt(0)
524; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Reload
525; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
526; MUBUF-NEXT:    s_waitcnt vmcnt(0)
527; MUBUF-NEXT:    ;;#ASMSTART
528; MUBUF-NEXT:    ; v[0:1]
529; MUBUF-NEXT:    ;;#ASMEND
530; MUBUF-NEXT:    s_setpc_b64 s[30:31]
531;
532; FLATSCR-LABEL: test_sgpr_offset_subregs_function:
533; FLATSCR:       ; %bb.0: ; %entry
534; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
535; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:8 glc
536; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
537; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:4084 ; 8-byte Folded Spill
538; FLATSCR-NEXT:    ;;#ASMSTART
539; FLATSCR-NEXT:    ;;#ASMEND
540; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
541; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
542; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:4084 ; 8-byte Folded Reload
543; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
544; FLATSCR-NEXT:    ;;#ASMSTART
545; FLATSCR-NEXT:    ; v[0:1]
546; FLATSCR-NEXT:    ;;#ASMEND
547; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
548entry:
549  ; We want to test the spill of the last subreg of %a is the highest
550  ; valid value for the immediate offset. We enable the emergency
551  ; stack slot for large frames, so it's hard to get the frame layout
552  ; exactly as we want to test it.
553  ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a
554  ; still fits below offset 4096 (4084 + 8 - 4 = 4088), and can be placed in
555  ; the instruction offset field.
556  %alloca = alloca i8, i32 4084, align 4, addrspace(5)
557  %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
558  %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
559
560  ; Force %a to spill.
561  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
562
563  ; Ensure the alloca sticks around.
564  %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
565  %b = load volatile i32, ptr addrspace(5) %bptr
566
567  ; Ensure the spill is of the full super-reg.
568  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
569
570  ret void
571}
572
; Function case: a two-dword value (%a) is spilled where its last dword sits at
; offset 4096 from s32, so the MUBUF checks compute the address into an SGPR.
; The FLATSCR checks still use an immediate (offset:4092) for the 8-byte access.
573define void @test_inst_offset_subregs_function() {
574; MUBUF-LABEL: test_inst_offset_subregs_function:
575; MUBUF:       ; %bb.0: ; %entry
576; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
577; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 glc
578; MUBUF-NEXT:    s_waitcnt vmcnt(0)
579; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 glc
580; MUBUF-NEXT:    s_waitcnt vmcnt(0)
581; MUBUF-NEXT:    s_add_i32 s4, s32, 0x3ff00
582; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
583; MUBUF-NEXT:    s_nop 0
584; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill
585; MUBUF-NEXT:    ;;#ASMSTART
586; MUBUF-NEXT:    ;;#ASMEND
587; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
588; MUBUF-NEXT:    s_waitcnt vmcnt(0)
589; MUBUF-NEXT:    s_add_i32 s4, s32, 0x3ff00
590; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
591; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
592; MUBUF-NEXT:    s_waitcnt vmcnt(0)
593; MUBUF-NEXT:    ;;#ASMSTART
594; MUBUF-NEXT:    ; v[0:1]
595; MUBUF-NEXT:    ;;#ASMEND
596; MUBUF-NEXT:    s_setpc_b64 s[30:31]
597;
598; FLATSCR-LABEL: test_inst_offset_subregs_function:
599; FLATSCR:       ; %bb.0: ; %entry
600; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
601; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:12 glc
602; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
603; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:4092 ; 8-byte Folded Spill
604; FLATSCR-NEXT:    ;;#ASMSTART
605; FLATSCR-NEXT:    ;;#ASMEND
606; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:8 glc
607; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
608; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:4092 ; 8-byte Folded Reload
609; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
610; FLATSCR-NEXT:    ;;#ASMSTART
611; FLATSCR-NEXT:    ; v[0:1]
612; FLATSCR-NEXT:    ;;#ASMEND
613; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
614entry:
615  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
616  ; does not fit below offset 4096 (4088 + 4 + 8 - 4 = 4096), and has to live
617  ; in the SGPR offset.
618  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
619
620  ; 0x3ff00 / 64 = 4092 (for wave64)
621  %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
622  %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
623
624  ; Force %a to spill.
625  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
626
627  ; Ensure the alloca sticks around.
628  %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
629  %b = load volatile i32, ptr addrspace(5) %bptr
630
631  ; Ensure the spill is of the full super-reg.
632  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
633
634  ret void
635}
636
; Attribute groups. #0 is attached to the VGPR-clobbering asm call sites in the
; two scavenge_fail tests; #2 and #3 are the function attributes of
; test_sgpr_offset_function_scavenge_fail_func and
; test_sgpr_offset_function_scavenge_fail_kernel respectively, capping the
; SGPR/VGPR counts. #1 has no user in the portion of the file shown here.
637attributes #0 = { nounwind }
638attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
639attributes #2 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" }
640attributes #3 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" }
641