; RUN: llc -mtriple=amdgcn -mcpu=tahiti  -verify-machineinstrs -show-mc-encoding < %s | FileCheck --check-prefixes=SI,GCN,SICIVI,SICI,SIVIGFX9_10 %s
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck --check-prefixes=CI,GCN,SICIVI,SICI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga   -verify-machineinstrs -show-mc-encoding < %s | FileCheck --check-prefixes=VI,GCN,SICIVI,VIGFX9_10,SIVIGFX9_10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900  -verify-machineinstrs -show-mc-encoding < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10  %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck --check-prefixes=GFX10,GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10  %s

; SMRD load with an immediate offset.
; GCN-LABEL: {{^}}smrd0:
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
; VIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
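; Note that SI/CI encode the SMRD immediate offset in dwords, while VI and
; later encode the SMEM offset in bytes, so the same dword-1 access shows up
; as 0x1 on SICI and 0x4 on VIGFX9_10.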
define amdgpu_kernel void @smrd0(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
entry:
  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 1
  %tmp1 = load i32, ptr addrspace(4) %tmp
  store i32 %tmp1, ptr addrspace(1) %out
  ret void
}

; SMRD load with the largest possible immediate offset.
; GCN-LABEL: {{^}}smrd1:
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
; VIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
define amdgpu_kernel void @smrd1(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
entry:
  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 255
  %tmp1 = load i32, ptr addrspace(4) %tmp
  store i32 %tmp1, ptr addrspace(1) %out
  ret void
}

; SMRD load with an offset greater than the largest possible immediate.
; GCN-LABEL: {{^}}smrd2:
; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
; VIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
; GCN: s_endpgm
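; SI's SMRD immediate is an 8-bit dword offset (at most 0xff), so a dword
; offset of 0x100 must be materialized in an SGPR; CI instead accepts it as a
; 32-bit literal dword offset.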
define amdgpu_kernel void @smrd2(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
entry:
  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 256
  %tmp1 = load i32, ptr addrspace(4) %tmp
  store i32 %tmp1, ptr addrspace(1) %out
  ret void
}

; SMRD load with a 64-bit offset
; GCN-LABEL: {{^}}smrd3:
; FIXME: There are too many copies here because we don't fold immediates
;        through REG_SEQUENCE
; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0x13 ; encoding: [0x13
; TODO: Add VI checks
; GCN: s_endpgm
define amdgpu_kernel void @smrd3(ptr addrspace(1) %out, [8 x i32], ptr addrspace(4) %ptr) #0 {
entry:
  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 4294967296
  %tmp1 = load i32, ptr addrspace(4) %tmp
  store i32 %tmp1, ptr addrspace(1) %out
  ret void
}

; SMRD load with the largest possible immediate offset on VI
; GCN-LABEL: {{^}}smrd4:
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
; GFX9_10: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
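; 262143 dwords is 0xffffc bytes, the largest value fitting VI's 20-bit SMEM
; byte offset. SI and GFX9+ fall back to an SGPR offset here, and CI takes the
; equivalent dword offset 0x3ffff as a 32-bit literal.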
define amdgpu_kernel void @smrd4(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
entry:
  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 262143
  %tmp1 = load i32, ptr addrspace(4) %tmp
  store i32 %tmp1, ptr addrspace(1) %out
  ret void
}

; SMRD load with an offset greater than the largest possible immediate on VI
; GCN-LABEL: {{^}}smrd5:
; SIVIGFX9_10: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
; SIVIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; GCN: s_endpgm
define amdgpu_kernel void @smrd5(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
entry:
  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 262144
  %tmp1 = load i32, ptr addrspace(4) %tmp
  store i32 %tmp1, ptr addrspace(1) %out
  ret void
}

; GFX9+ can use a signed immediate byte offset, but not without an SGPR offset
; GCN-LABEL: {{^}}smrd6:
; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
; SICIVI: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
; GFX9_10: s_add_u32 s2, s2, -4
; GFX9_10: s_addc_u32 s3, s3, -1
; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
entry:
  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 -1
  %tmp1 = load i32, ptr addrspace(4) %tmp
  store i32 %tmp1, ptr addrspace(1) %out
  ret void
}

; Don't use a negative SGPR offset
; GCN-LABEL: {{^}}smrd7:
; GCN: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, 0xffe00000
; SICIVI: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
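; The SGPR soffset is interpreted as unsigned, so the large negative offset is
; folded into the base pointer with s_add instead.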
define amdgpu_kernel void @smrd7(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
entry:
  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 -524288
  %tmp1 = load i32, ptr addrspace(4) %tmp
  store i32 %tmp1, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}smrd_hazard:
; GCN-DAG: s_mov_b32 s3, 3
; GCN-DAG: s_mov_b32 s2, 2
; GCN-DAG: s_mov_b32 s1, 1
; GCN-DAG: s_mov_b32 s0, 0
; SI-NEXT: nop 3
; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
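; On SI, wait states are required between the writes to s[0:3] and the SMRD
; that reads them; the hazard recognizer inserts them as the s_nop 3 above.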
define amdgpu_ps float @smrd_hazard(<4 x i32> inreg %desc) #0 {
main_body:
  %d0 = insertelement <4 x i32> undef, i32 0, i32 0
  %d1 = insertelement <4 x i32> %d0, i32 1, i32 1
  %d2 = insertelement <4 x i32> %d1, i32 2, i32 2
  %d3 = insertelement <4 x i32> %d2, i32 3, i32 3
  %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %d3, i32 0, i32 0)
  ret float %r
}

; SMRD load using the load.const.v4i32 intrinsic with an immediate offset
; GCN-LABEL: {{^}}smrd_load_const0:
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
define amdgpu_ps void @smrd_load_const0(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
main_body:
  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
  %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 16, i32 0)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
  ret void
}

; SMRD load using the load.const.v4i32 intrinsic with the largest possible immediate
; offset.
; GCN-LABEL: {{^}}smrd_load_const1:
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff glc ; encoding: [0xff
; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x3fc ;
; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x3fc glc ;
define amdgpu_ps void @smrd_load_const1(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
main_body:
  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
  %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1020, i32 0)
  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1020, i32 1)
  %s.buffer.float = bitcast i32 %s.buffer to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
  ret void
}

; SMRD load using the load.const.v4i32 intrinsic with an offset greater than
; the largest possible immediate offset.
; GCN-LABEL: {{^}}smrd_load_const2:
; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x400
; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x400
define amdgpu_ps void @smrd_load_const2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
main_body:
  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
  %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1024, i32 0)
  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1024, i32 0)
  %s.buffer.float = bitcast i32 %s.buffer to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
  ret void
}

; SMRD load with the largest possible immediate offset on VI
; GCN-LABEL: {{^}}smrd_load_const3:
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0xffffc
; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0xffffc
define amdgpu_ps void @smrd_load_const3(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
main_body:
  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
  %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1048572, i32 0)
  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048572, i32 0)
  %s.buffer.float = bitcast i32 %s.buffer to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
  ret void
}

; SMRD load with an offset greater than the largest possible immediate on VI
; GCN-LABEL: {{^}}smrd_load_const4:
; SIVIGFX9_10: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]:[0-9]+}}], [[OFFSET]]
; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]:[0-9]+}}], [[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; GCN: s_endpgm
define amdgpu_ps void @smrd_load_const4(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
main_body:
  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
  %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1048576, i32 0)
  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048576, i32 0)
  %s.buffer.float = bitcast i32 %s.buffer to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
  ret void
}

; dwordx2 s.buffer.load
; GCN-LABEL: {{^}}s_buffer_load_dwordx2:
; VIGFX9_10: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
; SICI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
define amdgpu_ps void @s_buffer_load_dwordx2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
main_body:
  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
  %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %tmp22, i32 128, i32 0)
  %s.buffer.0 = extractelement <2 x i32> %s.buffer, i32 0
  %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
  %s.buffer.1 = extractelement <2 x i32> %s.buffer, i32 1
  %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.0.float, float %s.buffer.1.float, i1 true, i1 true) #0
  ret void
}

; dwordx4 s.buffer.load
; GCN-LABEL: {{^}}s_buffer_load_dwordx4:
; VIGFX9_10: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
; SICI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
define amdgpu_ps void @s_buffer_load_dwordx4(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
main_body:
  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
  %s.buffer = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %tmp22, i32 128, i32 0)
  %s.buffer.0 = extractelement <4 x i32> %s.buffer, i32 0
  %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
  %s.buffer.1 = extractelement <4 x i32> %s.buffer, i32 1
  %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
  %s.buffer.2 = extractelement <4 x i32> %s.buffer, i32 2
  %s.buffer.2.float = bitcast i32 %s.buffer.2 to float
  %s.buffer.3 = extractelement <4 x i32> %s.buffer, i32 3
  %s.buffer.3.float = bitcast i32 %s.buffer.3 to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0
  ret void
}

; dwordx8 s.buffer.load
; GCN-LABEL: {{^}}s_buffer_load_dwordx8:
; VIGFX9_10: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
; SICI: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
define amdgpu_ps void @s_buffer_load_dwordx8(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
main_body:
  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
  %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 128, i32 0)
  %s.buffer.0 = extractelement <8 x i32> %s.buffer, i32 0
  %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
  %s.buffer.1 = extractelement <8 x i32> %s.buffer, i32 2
  %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
  %s.buffer.2 = extractelement <8 x i32> %s.buffer, i32 5
  %s.buffer.2.float = bitcast i32 %s.buffer.2 to float
  %s.buffer.3 = extractelement <8 x i32> %s.buffer, i32 7
  %s.buffer.3.float = bitcast i32 %s.buffer.3 to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0
  ret void
}

; dwordx8 s.buffer.load
; GCN-LABEL: {{^}}s_buffer_load_dwordx8_v8f32:
; VIGFX9_10: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
; SICI: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
define amdgpu_ps void @s_buffer_load_dwordx8_v8f32(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
main_body:
  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
  %s.buffer = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %tmp22, i32 128, i32 0)
  %s.buffer.0 = extractelement <8 x float> %s.buffer, i32 0
  %s.buffer.1 = extractelement <8 x float> %s.buffer, i32 2
  %s.buffer.2 = extractelement <8 x float> %s.buffer, i32 5
  %s.buffer.3 = extractelement <8 x float> %s.buffer, i32 7
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0, float %s.buffer.1, float %s.buffer.2, float %s.buffer.3, i1 true, i1 true) #0
  ret void
}

; dwordx16 s.buffer.load
; GCN-LABEL: {{^}}s_buffer_load_dwordx16:
; VIGFX9_10: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
; SICI: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
define amdgpu_ps void @s_buffer_load_dwordx16(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
main_body:
  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
  %s.buffer = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %tmp22, i32 128, i32 0)
  %s.buffer.0 = extractelement <16 x i32> %s.buffer, i32 0
  %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
  %s.buffer.1 = extractelement <16 x i32> %s.buffer, i32 3
  %s.buffer.1.float = bitcast i32 %s.buffer.1 to float
  %s.buffer.2 = extractelement <16 x i32> %s.buffer, i32 12
  %s.buffer.2.float = bitcast i32 %s.buffer.2 to float
  %s.buffer.3 = extractelement <16 x i32> %s.buffer, i32 15
  %s.buffer.3.float = bitcast i32 %s.buffer.3 to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0
  ret void
}

; GCN-LABEL: {{^}}s_buffer_load_dwordx16_v16f32:
; VIGFX9_10: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
; SICI: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
define amdgpu_ps void @s_buffer_load_dwordx16_v16f32(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
main_body:
  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
  %s.buffer = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %tmp22, i32 128, i32 0)
  %s.buffer.0 = extractelement <16 x float> %s.buffer, i32 0
  %s.buffer.1 = extractelement <16 x float> %s.buffer, i32 3
  %s.buffer.2 = extractelement <16 x float> %s.buffer, i32 12
  %s.buffer.3 = extractelement <16 x float> %s.buffer, i32 15
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0, float %s.buffer.1, float %s.buffer.2, float %s.buffer.3, i1 true, i1 true) #0
  ret void
}

; GCN-LABEL: {{^}}smrd_sgpr_offset:
; GCN: s_buffer_load_dword s{{[0-9]}}, s[0:3], s4
define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 {
main_body:
  %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
  ret float %r
}

; GCN-LABEL: {{^}}smrd_vgpr_offset:
; GCN: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
define amdgpu_ps float @smrd_vgpr_offset(<4 x i32> inreg %desc, i32 %offset) #0 {
main_body:
  %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
  ret float %r
}

; GCN-LABEL: {{^}}smrd_vgpr_offset_imm:
; GCN-NEXT: %bb.
; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4092 ;
define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 {
main_body:
  %off = add i32 %offset, 4092
  %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %off, i32 0)
  ret float %r
}

; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large:
; GCN-NEXT: %bb.
; SICI-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
; SICI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
; VIGFX9_10-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 4 offen offset:4092 ;
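; The MUBUF immediate offset field is 12 bits (at most 4095), so adding 4096
; needs a VGPR add on SICI, while VIGFX9_10 split it into soffset 4 plus
; offset:4092.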
define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 {
main_body:
  %off = add i32 %offset, 4096
  %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %off, i32 0)
  ret float %r
}

; GCN-LABEL: {{^}}smrd_imm_merged:
; GCN-NEXT: %bb.
; SICI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1
; SICI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x7
; GFX10-NEXT: s_clause
; VIGFX9_10-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x4
; VIGFX9_10-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1c
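; The six adjacent dword loads (offsets 4..32) get merged into one dwordx4 at
; offset 4 and one dwordx2 at offset 28.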
define amdgpu_ps void @smrd_imm_merged(<4 x i32> inreg %desc) #0 {
main_body:
  %r1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 4, i32 0)
  %r2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 8, i32 0)
  %r3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 12, i32 0)
  %r4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 16, i32 0)
  %r5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 28, i32 0)
  %r6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 32, i32 0)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0
  ret void
}

; GCN-LABEL: {{^}}smrd_imm_merge_m0:
;
; GCN: s_buffer_load_dwordx2
; SICIVI: s_mov_b32 m0
; SICIVI-DAG: v_interp_p1_f32
; SICIVI-DAG: v_interp_p1_f32
; SICIVI-DAG: v_interp_p1_f32
; SICIVI-DAG: v_interp_p2_f32
; SICIVI-DAG: v_interp_p2_f32
; SICIVI-DAG: v_interp_p2_f32
;
; extractelement does not result in movrels anymore for vectors fitting in 8 dwords
; SICIVI-NOT: s_mov_b32 m0
; SICIVI-NOT: v_movrels_b32_e32
; v_cndmask_b32_e32
; v_cndmask_b32_e32
;
; Merging is still thwarted on GFX9 due to s_set_gpr_idx
;
define amdgpu_ps float @smrd_imm_merge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
main_body:
  %idx1.f = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 0, i32 0)
  %idx1 = bitcast float %idx1.f to i32

  %v0.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 0, i32 %prim)
  %v0.x = call nsz float @llvm.amdgcn.interp.p2(float %v0.x1, float %v, i32 0, i32 0, i32 %prim)
  %v0.y1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 1, i32 %prim)
  %v0.y = call nsz float @llvm.amdgcn.interp.p2(float %v0.y1, float %v, i32 0, i32 1, i32 %prim)
  %v0.z1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 2, i32 %prim)
  %v0.z = call nsz float @llvm.amdgcn.interp.p2(float %v0.z1, float %v, i32 0, i32 2, i32 %prim)
  %v0.tmp0 = insertelement <3 x float> undef, float %v0.x, i32 0
  %v0.tmp1 = insertelement <3 x float> %v0.tmp0, float %v0.y, i32 1
  %v0 = insertelement <3 x float> %v0.tmp1, float %v0.z, i32 2
  %a = extractelement <3 x float> %v0, i32 %idx1

  %v1.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 0, i32 %prim)
  %v1.x = call nsz float @llvm.amdgcn.interp.p2(float %v1.x1, float %v, i32 1, i32 0, i32 %prim)
  %v1.y1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 1, i32 %prim)
  %v1.y = call nsz float @llvm.amdgcn.interp.p2(float %v1.y1, float %v, i32 1, i32 1, i32 %prim)
  %v1.z1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 2, i32 %prim)
  %v1.z = call nsz float @llvm.amdgcn.interp.p2(float %v1.z1, float %v, i32 1, i32 2, i32 %prim)
  %v1.tmp0 = insertelement <3 x float> undef, float %v0.x, i32 0
  %v1.tmp1 = insertelement <3 x float> %v0.tmp0, float %v0.y, i32 1
  %v1 = insertelement <3 x float> %v0.tmp1, float %v0.z, i32 2

  %b = extractelement <3 x float> %v1, i32 %idx1
  %c = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 4, i32 0)

  %res.tmp = fadd float %a, %b
  %res = fadd float %res.tmp, %c
  ret float %res
}

; GCN-LABEL: {{^}}smrd_vgpr_merged:
; GCN-NEXT: %bb.
; GFX10-NEXT: s_clause
; GCN-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
; GCN-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
define amdgpu_ps void @smrd_vgpr_merged(<4 x i32> inreg %desc, i32 %a) #0 {
main_body:
  %a1 = add i32 %a, 4
  %a2 = add i32 %a, 8
  %a3 = add i32 %a, 12
  %a4 = add i32 %a, 16
  %a5 = add i32 %a, 28
  %a6 = add i32 %a, 32
  %r1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a1, i32 0)
  %r2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a2, i32 0)
  %r3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a3, i32 0)
  %r4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a4, i32 0)
  %r5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a5, i32 0)
  %r6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a6, i32 0)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0
  ret void
}

; GCN-LABEL: {{^}}smrd_sgpr_descriptor_promoted
; GCN: v_readfirstlane
define amdgpu_cs void @smrd_sgpr_descriptor_promoted(ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615), i32) #0 {
main_body:
  br label %.outer_loop_header

ret_block:                                       ; preds = %.outer, %.label22, %main_body
  ret void

.outer_loop_header:
  br label %.inner_loop_header

.inner_loop_header:                                     ; preds = %.inner_loop_body, %.outer_loop_header
  %loopctr.1 = phi i32 [ 0, %.outer_loop_header ], [ %loopctr.2, %.inner_loop_body ]
  %loopctr.2 = add i32 %loopctr.1, 1
  %inner_br1 = icmp slt i32 %loopctr.2, 10
  br i1 %inner_br1, label %.inner_loop_body, label %ret_block

.inner_loop_body:
  %descriptor = load <4 x i32>, ptr addrspace(4) %0, align 16, !invariant.load !0
  %load1result = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %descriptor, i32 0, i32 0)
  store float %load1result, ptr addrspace(1) undef
  %inner_br2 = icmp uge i32 %1, 10
  br i1 %inner_br2, label %.inner_loop_header, label %.outer_loop_body

.outer_loop_body:
  %offset = shl i32 %loopctr.2, 6
  %load2result = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %descriptor, i32 %offset, i32 0)
  %outer_br = fcmp ueq float %load2result, 0x0
  br i1 %outer_br, label %.outer_loop_header, label %ret_block
}

; SMRD load with a non-const offset
; GCN-LABEL: {{^}}smrd_load_nonconst0:
; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
; SIVIGFX9_10: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_ps void @smrd_load_nonconst0(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in, i32 inreg %ncoff) #0 {
main_body:
  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
  %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0)
  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
  %s.buffer.float = bitcast i32 %s.buffer to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
  ret void
}

; SMRD load with a non-const non-uniform offset
; GCN-LABEL: {{^}}smrd_load_nonconst1:
; SIVIGFX9_10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
; SIVIGFX9_10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
; GCN: s_endpgm
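; A divergent offset cannot use the scalar unit, so these are selected as
; VMEM buffer loads with a VGPR offset (offen).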
define amdgpu_ps void @smrd_load_nonconst1(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in, i32 %ncoff) #0 {
main_body:
  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
  %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0)
  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
  %s.buffer.float = bitcast i32 %s.buffer to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
  ret void
}

; SMRD load with a non-const non-uniform offset of > 4 dwords (requires splitting)
; GCN-LABEL: {{^}}smrd_load_nonconst2:
; SIVIGFX9_10-DAG: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
; SIVIGFX9_10-DAG: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
; CI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
; GCN: s_endpgm
define amdgpu_ps void @smrd_load_nonconst2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in, i32 %ncoff) #0 {
main_body:
  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
  %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0)
  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
  %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
  %s.buffer.elt = extractelement <8 x i32> %s.buffer, i32 1
  %s.buffer.float = bitcast i32 %s.buffer.elt to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
  ret void
}

; SMRD load with a non-const non-uniform offset of > 4 dwords (requires splitting)
; GCN-LABEL: {{^}}smrd_load_nonconst3:
; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
; GCN: ; return to shader part epilog
define amdgpu_ps <16 x float> @smrd_load_nonconst3(<4 x i32> inreg %rsrc, i32 %off) #0 {
main_body:
  %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off, i32 0)
  %bc = bitcast <16 x i32> %ld to <16 x float>
  ret <16 x float> %bc
}

; GCN-LABEL: {{^}}smrd_load_nonconst4:
; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
; VIGFX9_10-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 56 offen offset:4032 ;
; VIGFX9_10-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 56 offen offset:4048 ;
; VIGFX9_10-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 56 offen offset:4064 ;
; VIGFX9_10-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 56 offen offset:4080 ;
; GCN: ; return to shader part epilog
define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 {
main_body:
  %off.2 = add i32 %off, 4088
  %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off.2, i32 0)
  %bc = bitcast <16 x i32> %ld to <16 x float>
  ret <16 x float> %bc
}

; GCN-LABEL: {{^}}smrd_load_nonconst5:
; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x1004, v0
; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
; VIGFX9_10: s_movk_i32 s4, 0xfc0
; VIGFX9_10-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], s4 offen offset:68 ;
; VIGFX9_10-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], s4 offen offset:84 ;
; VIGFX9_10-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], s4 offen offset:100 ;
; VIGFX9_10-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], s4 offen offset:116 ;
; GCN: ; return to shader part epilog
define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 {
main_body:
  %off.2 = add i32 %off, 4100
  %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off.2, i32 0)
  %bc = bitcast <16 x i32> %ld to <16 x float>
  ret <16 x float> %bc
}

; SMRD load dwordx2
; GCN-LABEL: {{^}}smrd_load_dwordx2:
; SIVIGFX9_10: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
; CI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_ps void @smrd_load_dwordx2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in, i32 inreg %ncoff) #0 {
main_body:
  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
  %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
  %s.buffer.float = bitcast <2 x i32> %s.buffer to <2 x float>
  %r.1 = extractelement <2 x float> %s.buffer.float, i32 0
  %r.2 = extractelement <2 x float> %s.buffer.float, i32 1
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r.1, float %r.1, float %r.1, float %r.2, i1 true, i1 true) #0
  ret void
}

; GCN-LABEL: {{^}}smrd_uniform_loop:
;
; TODO: we should keep the loop counter in an SGPR
;
; GCN: s_buffer_load_dword
define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
main_body:
  br label %loop

loop:
  %counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop ]
  %sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop ]
  %offset = shl i32 %counter, 2
  %v = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
  %sum.next = fadd float %sum, %v
  %counter.next = add i32 %counter, 1
  %cc = icmp uge i32 %counter.next, %bound
  br i1 %cc, label %exit, label %loop

exit:
  ret float %sum.next
}


; GCN-LABEL: {{^}}smrd_uniform_loop2:
; (this test differs from smrd_uniform_loop in the more complex structure of its phis)
;
; TODO: we should keep the loop counter in an SGPR and use an S_BUFFER_LOAD
;
; GCN: buffer_load_dword
define amdgpu_ps float @smrd_uniform_loop2(<4 x i32> inreg %desc, i32 %bound, i32 %bound.a) #0 {
main_body:
  br label %loop

loop:
  %counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop.a ], [ %counter.next, %loop.b ]
  %sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop.a ], [ %sum.next.b, %loop.b ]
  %offset = shl i32 %counter, 2
  %v = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
  %sum.next = fadd float %sum, %v
  %counter.next = add i32 %counter, 1
  %cc = icmp uge i32 %counter.next, %bound
  br i1 %cc, label %exit, label %loop.a

loop.a:
  %cc.a = icmp uge i32 %counter.next, %bound.a
  br i1 %cc, label %loop, label %loop.b

loop.b:
  %sum.next.b = fadd float %sum.next, 1.0
  br label %loop

exit:
  ret float %sum.next
}

; This test checks that the load after some control flow with an offset based
; on a divergent shader input is correctly recognized as divergent. This was
; reduced from an actual regression. Yes, the %unused argument matters, as
; well as the fact that %arg4 is a vector.
;
; GCN-LABEL: {{^}}arg_divergence:
; GCN: buffer_load_dword v0, v0,
; GCN-NEXT: s_waitcnt
; GCN-NEXT: ; return to shader part epilog
define amdgpu_cs float @arg_divergence(i32 inreg %unused, <3 x i32> %arg4) #0 {
main_body:
  br i1 undef, label %if1, label %endif1

if1:                                              ; preds = %main_body
  store i32 0, ptr addrspace(3) undef, align 4
  br label %endif1

endif1:                                           ; preds = %if1, %main_body
  %tmp13 = extractelement <3 x i32> %arg4, i32 0
  %tmp97 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 %tmp13, i32 0)
  ret float %tmp97
}

; GCN-LABEL: {{^}}s_buffer_load_f32:
; GCN: s_buffer_load_dword s0, s[0:3], s4
define amdgpu_ps void @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
  %sgpr = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  call void asm sideeffect "; use $0", "s"(float %sgpr)
  ret void
}

; GCN-LABEL: {{^}}s_buffer_load_v2f32:
; GCN: s_buffer_load_dwordx2 s[0:1], s[0:3], s4
define amdgpu_ps void @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
  %sgpr = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  call void asm sideeffect "; use $0", "s"(<2 x float> %sgpr)
  ret void
}

; GCN-LABEL: {{^}}s_buffer_load_v4f32:
; GCN: s_buffer_load_dwordx4 s[0:3], s[0:3], s4
define amdgpu_ps void @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
  %sgpr = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  call void asm sideeffect "; use $0", "s"(<4 x float> %sgpr)
  ret void
}

; GCN-LABEL: {{^}}s_buffer_load_v8f32:
; GCN: s_buffer_load_dwordx8 s[0:7], s[0:3], s4
define amdgpu_ps void @s_buffer_load_v8f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
  %sgpr = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  call void asm sideeffect "; use $0", "s"(<8 x float> %sgpr)
  ret void
}

; GCN-LABEL: {{^}}s_buffer_load_v16f32:
; GCN: s_buffer_load_dwordx16 s[0:15], s[0:3], s4
define amdgpu_ps void @s_buffer_load_v16f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
  %sgpr = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  call void asm sideeffect "; use $0", "s"(<16 x float> %sgpr)
  ret void
}

declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2

declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) #1
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)
declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32)
declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32)

declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32)
declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32)
declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32)
declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32)
declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32)

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind readnone speculatable }

!0 = !{}