xref: /llvm-project/llvm/test/CodeGen/AMDGPU/scratch-simple.ll (revision 1bf385f10291101163a346c8f075d56e1578351b)
1; RUN: llc -mtriple=amdgcn-- -mcpu=verde -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,SIVI,MUBUF %s
2; RUN: llc -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,SIVI,MUBUF %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX9-MUBUF,GFX9_10-MUBUF %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -filetype=obj -amdgpu-use-divergent-register-indexing < %s | llvm-readobj -r - | FileCheck --check-prefix=RELS %s
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX10_W32-MUBUF,GFX9_10-MUBUF %s
6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX10_W64-MUBUF,GFX9_10-MUBUF %s
7; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX9-FLATSCR %s
8; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1030 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX10-FLATSCR %s
9; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX9-FLATSCR-PAL %s
10; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX10-FLATSCR-PAL %s
11; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,GFX11-FLATSCR %s
12; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,GFX11-FLATSCR %s
13
14; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD0
15; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD1
16
17; This used to fail due to a v_add_i32 instruction with an illegal immediate
18; operand that was created during Local Stack Slot Allocation. Test case derived
19; from https://bugs.freedesktop.org/show_bug.cgi?id=96602
20;
21; GCN-LABEL: {{^}}ps_main:
22
23; GFX9-FLATSCR-DAG: s_add_u32 flat_scratch_lo, s0, s2
24; GFX9-FLATSCR-DAG: s_addc_u32 flat_scratch_hi, s1, 0
25; GFX9-FLATSCR-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
26
27; GFX10-FLATSCR: s_add_u32 s0, s0, s2
28; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
29; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
30; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
31
32; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
33; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
34; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x0
35; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
36; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
37; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
38; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
39; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
40; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
41; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
42; GFX9-FLATSCR-PAL-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
43
44; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
45; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
46; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x0
47; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
48; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
49; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
50; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
51; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
52; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
53
54; SIVI-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
55; SIVI-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
56; SIVI-DAG: s_mov_b32 s6, -1
57
58; GFX9-MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
59; GFX9-MUBUF-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
60; GFX9-MUBUF-DAG: s_mov_b32 s2, -1
61
62; SI-DAG: s_mov_b32 s7, 0xe8f000
63; VI-DAG: s_mov_b32 s7, 0xe80000
64; GFX9-MUBUF-DAG: s_mov_b32 s3, 0xe00000
65; GFX10_W32-MUBUF-DAG: s_mov_b32 s3, 0x31c16000
66; GFX10_W64-MUBUF-DAG: s_mov_b32 s3, 0x31e16000
67
68; FLATSCR-NOT: SCRATCH_RSRC_DWORD
69
70; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0
71; GFX9-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SP]] offset:
72
73; GFX10-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], off offset:
74
75; MUBUF-DAG:     v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
76; MUBUF-DAG:     v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
77; GFX10-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
78; GFX10-FLATSCR-PAL: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
79; GFX11-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
80
81; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x200, [[CLAMP_IDX]]
82; FLATSCR:  v_mov_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]]
83
84; MUBUF: buffer_load_dword {{v[0-9]+}}, [[CLAMP_IDX]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
85; MUBUF: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
86; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off
87; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, [[CLAMP_IDX]], off{{$}}
88define amdgpu_ps float @ps_main(i32 %idx) {
89  %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
90  %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
91  %r = fadd float %v1, %v2
92  ret float %r
93}
94
95; GCN-LABEL: {{^}}vs_main:
96; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s2
97; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0
98
99; GFX10-FLATSCR: s_add_u32 s0, s0, s2
100; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
101; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
102; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
103
104; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
105; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
106; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x0
107; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
108; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
109; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
110; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
111; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
112; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
113; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
114
115; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
116; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
117; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x0
118; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
119; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
120; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
121; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
122; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
123; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
124
125; MUBUF-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
126
127; FLATSCR-NOT: SCRATCH_RSRC_DWORD
128
129; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
130; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
131
132; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0
133; GFX9-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SP]] offset:
134
135; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
136; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
137
138; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
139; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
140
141define amdgpu_vs float @vs_main(i32 %idx) {
142  %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
143  %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
144  %r = fadd float %v1, %v2
145  ret float %r
146}
147
148; GCN-LABEL: {{^}}cs_main:
149; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s2
150; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0
151
152; GFX10-FLATSCR: s_add_u32 s0, s0, s2
153; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
154; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
155; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
156
157; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
158; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
159; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x10
160; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
161; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
162; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
163; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
164; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
165; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
166; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
167
168; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
169; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
170; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x10
171; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
172; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
173; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
174; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
175; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
176; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
177
178; MUBUF-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
179
180; FLATSCR-NOT: SCRATCH_RSRC_DWORD
181
182; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
183; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
184
185; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
186; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
187
188; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
189; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
190
191define amdgpu_cs float @cs_main(i32 %idx) {
192  %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
193  %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
194  %r = fadd float %v1, %v2
195  ret float %r
196}
197
198; GCN-LABEL: {{^}}hs_main:
199; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5
200; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0
201
202; GFX10-FLATSCR: s_add_u32 s0, s0, s5
203; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
204; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
205; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
206
207; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
208; SIVI-NOT: s_mov_b32 s4
209; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
210; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
211
212; GFX9_10-MUBUF: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
213; GFX9PLUS-NOT:   s_mov_b32 s5
214; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
215; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
216
217; FLATSCR-NOT: SCRATCH_RSRC_DWORD
218; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
219; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
220
221; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
222; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
223define amdgpu_hs float @hs_main(i32 %idx) {
224  %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
225  %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
226  %r = fadd float %v1, %v2
227  ret float %r
228}
229
230; GCN-LABEL: {{^}}gs_main:
231; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5
232; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0
233
234; GFX10-FLATSCR: s_add_u32 s0, s0, s5
235; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
236; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
237; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
238
239; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
240; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
241; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
242; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
243; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
244; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
245; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
246; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
247; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
248; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
249
250; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
251; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
252; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
253; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
254; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
255; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
256; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
257; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
258; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
259
260; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
261; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
262; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
263
264; GFX9_10-MUBUF: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
265; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
266; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
267
268; FLATSCR-NOT: SCRATCH_RSRC_DWORD
269; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
270; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
271
272; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
273; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
274define amdgpu_gs float @gs_main(i32 %idx) {
275  %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
276  %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
277  %r = fadd float %v1, %v2
278  ret float %r
279}
280
281; Mesa GS and HS shaders have the preloaded scratch wave offset SGPR fixed at
282; SGPR5, and the inreg implementation is used to reference it in the IR. The
283; following tests confirm the shader and anything inserted after the return
284; (i.e. SI_RETURN_TO_EPILOG) can access the scratch wave offset.
285
286; GCN-LABEL: {{^}}hs_ir_uses_scratch_offset:
287; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5
288; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0
289
290; GFX10-FLATSCR: s_add_u32 s0, s0, s5
291; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
292; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
293; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
294
295; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
296; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
297; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
298; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
299; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
300; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
301; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
302; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
303; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
304; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
305
306; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
307; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
308; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
309; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
310; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
311; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
312; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
313; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
314; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
315
316; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
317; FLATSCR-NOT: SCRATCH_RSRC_DWORD
318
319; SIVI-NOT: s_mov_b32 s6
320; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
321; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
322
323; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
324; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
325
326; MUBUF-DAG: s_mov_b32 s2, s5
327
328; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
329; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
330
331; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
332; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
333define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) {
334  %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
335  %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
336  %f = fadd float %v1, %v2
337  %r1 = insertvalue <{i32, i32, i32, float}> undef, i32 %swo, 2
338  %r2 = insertvalue <{i32, i32, i32, float}> %r1, float %f, 3
339  ret <{i32, i32, i32, float}> %r2
340}
341
342; GCN-LABEL: {{^}}gs_ir_uses_scratch_offset:
343; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5
344; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0
345
346; GFX10-FLATSCR: s_add_u32 s0, s0, s5
347; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
348; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
349; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
350
351; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
352; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
353; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
354; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
355; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
356; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
357; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
358; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
359; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
360; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
361
362; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
363; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
364; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
365; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
366; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
367; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
368; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
369; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
370; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
371
372; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
373; FLATSCR-NOT: SCRATCH_RSRC_DWORD
374
375; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
376; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
377
378; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
379; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
380
381; MUBUF-DAG: s_mov_b32 s2, s5
382
383; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
384; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
385
386; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
387; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
388define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) {
389  %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
390  %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
391  %f = fadd float %v1, %v2
392  %r1 = insertvalue <{i32, i32, i32, float}> undef, i32 %swo, 2
393  %r2 = insertvalue <{i32, i32, i32, float}> %r1, float %f, 3
394  ret <{i32, i32, i32, float}> %r2
395}
396