xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll (revision ade0750e3529ee251cbfb60ce66904a8553381e4)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s
3; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s
4; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s
5; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GFX12 %s
6
; Three full-width <4 x float> stores with a zero offset. The last intrinsic
; operand is 0, 1 and 2 respectively; the GFX68 and GFX11 checks expect a plain
; store, a store with glc, and a store with slc, in that order.
7define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
8; GFX68-LABEL: buffer_store:
9; GFX68:       ; %bb.0: ; %main_body
10; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
11; GFX68-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
12; GFX68-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
13; GFX68-NEXT:    s_endpgm
14;
15; GFX11-LABEL: buffer_store:
16; GFX11:       ; %bb.0: ; %main_body
17; GFX11-NEXT:    s_clause 0x2
18; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
19; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 glc
20; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 slc
21; GFX11-NEXT:    s_endpgm
22main_body:
23  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
24  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1)
25  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 2)
26  ret void
27}
28
; A constant third operand (42) is expected to be folded into the instruction's
; immediate offset field (offset:42) with no address VGPR ("off").
29define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
30; GFX68-LABEL: buffer_store_immoffs:
31; GFX68:       ; %bb.0: ; %main_body
32; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
33; GFX68-NEXT:    s_endpgm
34;
35; GFX11-LABEL: buffer_store_immoffs:
36; GFX11:       ; %bb.0: ; %main_body
37; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:42
38; GFX11-NEXT:    s_endpgm
39main_body:
40  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0)
41  ret void
42}
43
; A non-constant third operand is expected to be passed in a VGPR (v4) using
; the "offen" addressing form.
44define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
45; GFX68-LABEL: buffer_store_ofs:
46; GFX68:       ; %bb.0: ; %main_body
47; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
48; GFX68-NEXT:    s_endpgm
49;
50; GFX11-LABEL: buffer_store_ofs:
51; GFX11:       ; %bb.0: ; %main_body
52; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 offen
53; GFX11-NEXT:    s_endpgm
54main_body:
55  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
56  ret void
57}
58
; Store, then load, then store the loaded value. In the expected output the
; load writes v[0:3], the same registers the first store reads from. The VERDE
; checks expect an s_waitcnt expcnt(0) between that store and the load; the
; GFX8 and GFX11 checks have no such wait. All three expect s_waitcnt vmcnt(0)
; before the second store consumes the loaded data.
59; Ideally, the register allocator would avoid the wait here
60define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
61; VERDE-LABEL: buffer_store_wait:
62; VERDE:       ; %bb.0: ; %main_body
63; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
64; VERDE-NEXT:    s_waitcnt expcnt(0)
65; VERDE-NEXT:    buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen
66; VERDE-NEXT:    s_waitcnt vmcnt(0)
67; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen
68; VERDE-NEXT:    s_endpgm
69;
70; GFX8-LABEL: buffer_store_wait:
71; GFX8:       ; %bb.0: ; %main_body
72; GFX8-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
73; GFX8-NEXT:    buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen
74; GFX8-NEXT:    s_waitcnt vmcnt(0)
75; GFX8-NEXT:    buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen
76; GFX8-NEXT:    s_endpgm
77;
78; GFX11-LABEL: buffer_store_wait:
79; GFX11:       ; %bb.0: ; %main_body
80; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 offen
81; GFX11-NEXT:    buffer_load_b128 v[0:3], v5, s[0:3], 0 offen
82; GFX11-NEXT:    s_waitcnt vmcnt(0)
83; GFX11-NEXT:    buffer_store_b128 v[0:3], v6, s[0:3], 0 offen
84; GFX11-NEXT:    s_endpgm
85main_body:
86  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
87  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0)
88  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0)
89  ret void
90}
91
; Single float store with a variable offset; expected to select the one-dword
; store (buffer_store_dword / buffer_store_b32) with "offen".
92define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) {
93; GFX68-LABEL: buffer_store_x1:
94; GFX68:       ; %bb.0: ; %main_body
95; GFX68-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
96; GFX68-NEXT:    s_endpgm
97;
98; GFX11-LABEL: buffer_store_x1:
99; GFX11:       ; %bb.0: ; %main_body
100; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
101; GFX11-NEXT:    s_endpgm
102main_body:
103  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
104  ret void
105}
106
; <2 x float> store with a variable offset; expected to select the two-dword
; store (buffer_store_dwordx2 / buffer_store_b64) with "offen".
107define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %offset) #0 {
108; GFX68-LABEL: buffer_store_x2:
109; GFX68:       ; %bb.0: ; %main_body
110; GFX68-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
111; GFX68-NEXT:    s_endpgm
112;
113; GFX11-LABEL: buffer_store_x2:
114; GFX11:       ; %bb.0: ; %main_body
115; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
116; GFX11-NEXT:    s_endpgm
117main_body:
118  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
119  ret void
120}
121
; Six float stores at %a + {4, 8, 12, 16, 28, 32}. The checks expect them to be
; merged into one four-dword store at offset:4 and one two-dword store at
; offset:28, both addressed through the unmodified base in v0.
; Despite the "_and" suffix in the name, the offsets here are formed with add.
122define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
123; GFX68-LABEL: buffer_store_x1_offen_merged_and:
124; GFX68:       ; %bb.0:
125; GFX68-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
126; GFX68-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28
127; GFX68-NEXT:    s_endpgm
128;
129; GFX11-LABEL: buffer_store_x1_offen_merged_and:
130; GFX11:       ; %bb.0:
131; GFX11-NEXT:    s_clause 0x1
132; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
133; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28
134; GFX11-NEXT:    s_endpgm
135  %a1 = add i32 %a, 4
136  %a2 = add i32 %a, 8
137  %a3 = add i32 %a, 12
138  %a4 = add i32 %a, 16
139  %a5 = add i32 %a, 28
140  %a6 = add i32 %a, 32
141  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
142  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
143  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 0)
144  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 0)
145  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 0)
146  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 0)
147  ret void
148}
149
; Same six-store pattern as the add-based test, but the base is %inp << 6. The
; checks expect the shift to be emitted once, followed by the same two merged
; stores (four dwords at offset:4, two dwords at offset:28).
; Despite the "_or" suffix in the name, the offsets here are formed with add.
150define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
151; GFX68-LABEL: buffer_store_x1_offen_merged_or:
152; GFX68:       ; %bb.0:
153; GFX68-NEXT:    v_lshlrev_b32_e32 v0, 6, v0
154; GFX68-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
155; GFX68-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28
156; GFX68-NEXT:    s_endpgm
157;
158; GFX11-LABEL: buffer_store_x1_offen_merged_or:
159; GFX11:       ; %bb.0:
160; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 6, v0
161; GFX11-NEXT:    s_clause 0x1
162; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
163; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28
164; GFX11-NEXT:    s_endpgm
165  %a = shl i32 %inp, 6
166  %a1 = add i32 %a, 4
167  %a2 = add i32 %a, 8
168  %a3 = add i32 %a, 12
169  %a4 = add i32 %a, 16
170  %a5 = add i32 %a, 28
171  %a6 = add i32 %a, 32
172  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
173  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
174  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 0)
175  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 0)
176  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 0)
177  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 0)
178  ret void
179}
180
; Same offsets as the add-based merge test, but the last operand differs per
; pair of stores: 0, 0, 1, 1, 3, 3. The checks expect merging only between
; stores that share that value, so offsets 4..16 are split into two two-dword
; stores (plain at offset:4, glc at offset:12) and the last pair becomes a
; two-dword store at offset:28 with both glc and slc.
181define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
182; GFX68-LABEL: buffer_store_x1_offen_merged_glc_slc:
183; GFX68:       ; %bb.0:
184; GFX68-NEXT:    buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4
185; GFX68-NEXT:    buffer_store_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc
186; GFX68-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
187; GFX68-NEXT:    s_endpgm
188;
189; GFX11-LABEL: buffer_store_x1_offen_merged_glc_slc:
190; GFX11:       ; %bb.0:
191; GFX11-NEXT:    s_clause 0x2
192; GFX11-NEXT:    buffer_store_b64 v[1:2], v0, s[0:3], 0 offen offset:4
193; GFX11-NEXT:    buffer_store_b64 v[3:4], v0, s[0:3], 0 offen offset:12 glc
194; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
195; GFX11-NEXT:    s_endpgm
196  %a1 = add i32 %a, 4
197  %a2 = add i32 %a, 8
198  %a3 = add i32 %a, 12
199  %a4 = add i32 %a, 16
200  %a5 = add i32 %a, 28
201  %a6 = add i32 %a, 32
202  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
203  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
204  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 1)
205  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 1)
206  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 3)
207  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 3)
208  ret void
209}
210
; Two <2 x float> stores at %a + 4 and %a + 12; expected to merge into a single
; four-dword store at offset:4 addressed through v0.
; Despite the "_and" suffix in the name, the offsets here are formed with add.
211define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) {
212; GFX68-LABEL: buffer_store_x2_offen_merged_and:
213; GFX68:       ; %bb.0:
214; GFX68-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
215; GFX68-NEXT:    s_endpgm
216;
217; GFX11-LABEL: buffer_store_x2_offen_merged_and:
218; GFX11:       ; %bb.0:
219; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
220; GFX11-NEXT:    s_endpgm
221  %a1 = add i32 %a, 4
222  %a2 = add i32 %a, 12
223  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
224  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
225  ret void
226}
227
; Two <2 x float> stores at (%inp << 4) + 4 and (%inp << 4) + 12; expected to
; emit the shift once and merge the stores into one four-dword store at
; offset:4.
; Despite the "_or" suffix in the name, the offsets here are formed with add.
228define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) {
229; GFX68-LABEL: buffer_store_x2_offen_merged_or:
230; GFX68:       ; %bb.0:
231; GFX68-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
232; GFX68-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
233; GFX68-NEXT:    s_endpgm
234;
235; GFX11-LABEL: buffer_store_x2_offen_merged_or:
236; GFX11:       ; %bb.0:
237; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
238; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
239; GFX11-NEXT:    s_endpgm
240  %a = shl i32 %inp, 4
241  %a1 = add i32 %a, 4
242  %a2 = add i32 %a, 12
243  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
244  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
245  ret void
246}
247
; Six float stores at constant offsets 4, 8, 12, 16, 28 and 32 with no variable
; address. Expected to merge into a four-dword store at offset:4 and a
; two-dword store at offset:28, both using "off" (no address VGPR).
248define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
249; GFX68-LABEL: buffer_store_x1_offset_merged:
250; GFX68:       ; %bb.0:
251; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
252; GFX68-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28
253; GFX68-NEXT:    s_endpgm
254;
255; GFX11-LABEL: buffer_store_x1_offset_merged:
256; GFX11:       ; %bb.0:
257; GFX11-NEXT:    s_clause 0x1
258; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
259; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28
260; GFX11-NEXT:    s_endpgm
261  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
262  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0)
263  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
264  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 0)
265  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 0)
266  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 0)
267  ret void
268}
269
; Two <2 x float> stores at constant offsets 4 and 12; expected to merge into a
; single four-dword store at offset:4 with no address VGPR.
270define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) {
271; GFX68-LABEL: buffer_store_x2_offset_merged:
272; GFX68:       ; %bb.0:
273; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
274; GFX68-NEXT:    s_endpgm
275;
276; GFX11-LABEL: buffer_store_x2_offset_merged:
277; GFX11:       ; %bb.0:
278; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
279; GFX11-NEXT:    s_endpgm
280  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
281  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
282  ret void
283}
284
; Integer-typed variants of the intrinsic (<4 x i32>, <2 x i32>, i32), each at
; offset 0 with the last operand set to 0, 1 and 2. Expected output is a
; four-dword plain store, a two-dword glc store and a one-dword slc store.
285define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) {
286; GFX68-LABEL: buffer_store_int:
287; GFX68:       ; %bb.0: ; %main_body
288; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
289; GFX68-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc
290; GFX68-NEXT:    buffer_store_dword v6, off, s[0:3], 0 slc
291; GFX68-NEXT:    s_endpgm
292;
293; GFX11-LABEL: buffer_store_int:
294; GFX11:       ; %bb.0: ; %main_body
295; GFX11-NEXT:    s_clause 0x2
296; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
297; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 glc
298; GFX11-NEXT:    buffer_store_b32 v6, off, s[0:3], 0 slc
299; GFX11-NEXT:    s_endpgm
300main_body:
301  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
302  call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 1)
303  call void @llvm.amdgcn.raw.buffer.store.i32(i32 %3, <4 x i32> %0, i32 0, i32 0, i32 2)
304  ret void
305}
306
; i8 store of a float converted with fptoui and truncated. Expected output is
; the conversion followed directly by a byte store; no separate instruction is
; expected for the trunc.
307define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
308; GFX68-LABEL: raw_buffer_store_byte:
309; GFX68:       ; %bb.0: ; %main_body
310; GFX68-NEXT:    v_cvt_u32_f32_e32 v0, v0
311; GFX68-NEXT:    buffer_store_byte v0, off, s[0:3], 0
312; GFX68-NEXT:    s_endpgm
313;
314; GFX11-LABEL: raw_buffer_store_byte:
315; GFX11:       ; %bb.0: ; %main_body
316; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
317; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
318; GFX11-NEXT:    s_endpgm
319main_body:
320  %v2 = fptoui float %v1 to i32
321  %v3 = trunc i32 %v2 to i8
322  call void @llvm.amdgcn.raw.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
323  ret void
324}
325
; i16 store of a float converted with fptoui and truncated. Expected output is
; the conversion followed directly by a 16-bit store; no separate instruction
; is expected for the trunc.
326define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) {
327; GFX68-LABEL: raw_buffer_store_short:
328; GFX68:       ; %bb.0: ; %main_body
329; GFX68-NEXT:    v_cvt_u32_f32_e32 v0, v0
330; GFX68-NEXT:    buffer_store_short v0, off, s[0:3], 0
331; GFX68-NEXT:    s_endpgm
332;
333; GFX11-LABEL: raw_buffer_store_short:
334; GFX11:       ; %bb.0: ; %main_body
335; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
336; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
337; GFX11-NEXT:    s_endpgm
338main_body:
339  %v2 = fptoui float %v1 to i32
340  %v3 = trunc i32 %v2 to i16
341  call void @llvm.amdgcn.raw.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
342  ret void
343}
344
; half store whose value is an i32 argument truncated to i16 and bitcast to
; half. Expected output is a single 16-bit store straight from v0 with no
; conversion instructions.
345define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) {
346; GFX68-LABEL: raw_buffer_store_f16:
347; GFX68:       ; %bb.0: ; %main_body
348; GFX68-NEXT:    buffer_store_short v0, off, s[0:3], 0
349; GFX68-NEXT:    s_endpgm
350;
351; GFX11-LABEL: raw_buffer_store_f16:
352; GFX11:       ; %bb.0: ; %main_body
353; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
354; GFX11-NEXT:    s_endpgm
355main_body:
356  %trunc = trunc i32 %v1 to i16
357  %cast = bitcast i16 %trunc to half
358  call void @llvm.amdgcn.raw.buffer.store.f16(half %cast, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
359  ret void
360}
361
; <2 x half> store with a variable offset. The VERDE checks expect two
; v_cvt_f16_f32 conversions whose results are packed with a shift and an or
; before a one-dword store (offset in v2). The GFX8 and GFX11 checks expect the
; data to be stored directly from v0 (offset in v1).
362define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %offset) {
363; VERDE-LABEL: buffer_store_v2f16:
364; VERDE:       ; %bb.0: ; %main_body
365; VERDE-NEXT:    v_cvt_f16_f32_e32 v1, v1
366; VERDE-NEXT:    v_cvt_f16_f32_e32 v0, v0
367; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
368; VERDE-NEXT:    v_or_b32_e32 v0, v0, v1
369; VERDE-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
370; VERDE-NEXT:    s_endpgm
371;
372; GFX8-LABEL: buffer_store_v2f16:
373; GFX8:       ; %bb.0: ; %main_body
374; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
375; GFX8-NEXT:    s_endpgm
376;
377; GFX11-LABEL: buffer_store_v2f16:
378; GFX11:       ; %bb.0: ; %main_body
379; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
380; GFX11-NEXT:    s_endpgm
381main_body:
382  call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
383  ret void
384}
385
; <4 x half> store with a variable offset. The VERDE checks expect four
; v_cvt_f16_f32 conversions packed pairwise (shift + or) into v[0:1] before a
; two-dword store (offset in v4). The GFX8 and GFX11 checks expect a direct
; two-dword store from v[0:1] (offset in v2).
386define amdgpu_ps void @buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %data, i32 %offset) #0 {
387; VERDE-LABEL: buffer_store_v4f16:
388; VERDE:       ; %bb.0: ; %main_body
389; VERDE-NEXT:    v_cvt_f16_f32_e32 v3, v3
390; VERDE-NEXT:    v_cvt_f16_f32_e32 v2, v2
391; VERDE-NEXT:    v_cvt_f16_f32_e32 v5, v1
392; VERDE-NEXT:    v_cvt_f16_f32_e32 v0, v0
393; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
394; VERDE-NEXT:    v_or_b32_e32 v1, v2, v1
395; VERDE-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
396; VERDE-NEXT:    v_or_b32_e32 v0, v0, v2
397; VERDE-NEXT:    buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 offen
398; VERDE-NEXT:    s_endpgm
399;
400; GFX8-LABEL: buffer_store_v4f16:
401; GFX8:       ; %bb.0: ; %main_body
402; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
403; GFX8-NEXT:    s_endpgm
404;
405; GFX11-LABEL: buffer_store_v4f16:
406; GFX11:       ; %bb.0: ; %main_body
407; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
408; GFX11-NEXT:    s_endpgm
409main_body:
410  call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
411  ret void
412}
413
; i16 store of an i32 argument truncated to i16. Expected output is a single
; 16-bit store straight from v0; no instruction is expected for the trunc.
414define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) {
415; GFX68-LABEL: raw_buffer_store_i16:
416; GFX68:       ; %bb.0: ; %main_body
417; GFX68-NEXT:    buffer_store_short v0, off, s[0:3], 0
418; GFX68-NEXT:    s_endpgm
419;
420; GFX11-LABEL: raw_buffer_store_i16:
421; GFX11:       ; %bb.0: ; %main_body
422; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
423; GFX11-NEXT:    s_endpgm
424main_body:
425  %trunc = trunc i32 %v1 to i16
426  call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
427  ret void
428}
429
; <2 x i16> store with a variable offset. The VERDE checks expect the two
; elements (v0, v1) to be packed with a shift, a 0xffff mask and an or before a
; one-dword store (offset in v2). The GFX8 and GFX11 checks expect a direct
; store from v0 (offset in v1).
430define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data, i32 %offset) {
431; VERDE-LABEL: buffer_store_v2i16:
432; VERDE:       ; %bb.0: ; %main_body
433; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
434; VERDE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
435; VERDE-NEXT:    v_or_b32_e32 v0, v0, v1
436; VERDE-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
437; VERDE-NEXT:    s_endpgm
438;
439; GFX8-LABEL: buffer_store_v2i16:
440; GFX8:       ; %bb.0: ; %main_body
441; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
442; GFX8-NEXT:    s_endpgm
443;
444; GFX11-LABEL: buffer_store_v2i16:
445; GFX11:       ; %bb.0: ; %main_body
446; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
447; GFX11-NEXT:    s_endpgm
448main_body:
449  call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
450  ret void
451}
452
; <4 x i16> store with a variable offset. The VERDE checks expect the four
; elements to be packed pairwise (shift, 0xffff mask, or) into v[1:2] before a
; two-dword store (offset in v4). The GFX8 and GFX11 checks expect a direct
; two-dword store from v[0:1] (offset in v2).
453define amdgpu_ps void @buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %data, i32 %offset) #0 {
454; VERDE-LABEL: buffer_store_v4i16:
455; VERDE:       ; %bb.0: ; %main_body
456; VERDE-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
457; VERDE-NEXT:    v_and_b32_e32 v2, 0xffff, v2
458; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
459; VERDE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
460; VERDE-NEXT:    v_or_b32_e32 v2, v2, v3
461; VERDE-NEXT:    v_or_b32_e32 v1, v0, v1
462; VERDE-NEXT:    buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen
463; VERDE-NEXT:    s_endpgm
464;
465; GFX8-LABEL: buffer_store_v4i16:
466; GFX8:       ; %bb.0: ; %main_body
467; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
468; GFX8-NEXT:    s_endpgm
469;
470; GFX11-LABEL: buffer_store_v4i16:
471; GFX11:       ; %bb.0: ; %main_body
472; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
473; GFX11-NEXT:    s_endpgm
474main_body:
475  call void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
476  ret void
477}
478
; Six float stores at constant offsets 4, 8, 12, 16, 28 and 32, all with the
; last operand 0. Expected to merge into a four-dword store at offset:4 and a
; two-dword store at offset:28.
; The IR and expectations match @buffer_store_x1_offset_merged in this file.
479define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
480; GFX68-LABEL: raw_buffer_store_x1_offset_merged:
481; GFX68:       ; %bb.0:
482; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
483; GFX68-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28
484; GFX68-NEXT:    s_endpgm
485;
486; GFX11-LABEL: raw_buffer_store_x1_offset_merged:
487; GFX11:       ; %bb.0:
488; GFX11-NEXT:    s_clause 0x1
489; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
490; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28
491; GFX11-NEXT:    s_endpgm
492  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
493  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0)
494  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
495  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 0)
496  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 0)
497  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 0)
498  ret void
499}
500
; Same constant offsets as the merged test, but every store passes 8 as the
; last operand. The GFX68 and GFX11 checks expect six separate one-dword stores,
; i.e. no merging.
; Presumably 8 is the swizzle bit on targets before gfx12 (going by the
; function name) -- TODO confirm against the intrinsic definition.
; This function carries no expectations for the gfx1200 run line.
501define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
502; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12:
503; GFX68:       ; %bb.0:
504; GFX68-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
505; GFX68-NEXT:    buffer_store_dword v1, off, s[0:3], 0 offset:8
506; GFX68-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:12
507; GFX68-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:16
508; GFX68-NEXT:    buffer_store_dword v4, off, s[0:3], 0 offset:28
509; GFX68-NEXT:    buffer_store_dword v5, off, s[0:3], 0 offset:32
510; GFX68-NEXT:    s_endpgm
511;
512; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12:
513; GFX11:       ; %bb.0:
514; GFX11-NEXT:    s_clause 0x5
515; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0 offset:4
516; GFX11-NEXT:    buffer_store_b32 v1, off, s[0:3], 0 offset:8
517; GFX11-NEXT:    buffer_store_b32 v2, off, s[0:3], 0 offset:12
518; GFX11-NEXT:    buffer_store_b32 v3, off, s[0:3], 0 offset:16
519; GFX11-NEXT:    buffer_store_b32 v4, off, s[0:3], 0 offset:28
520; GFX11-NEXT:    buffer_store_b32 v5, off, s[0:3], 0 offset:32
521; GFX11-NEXT:    s_endpgm
522  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8)
523  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8)
524  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 8)
525  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 8)
526  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 8)
527  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 8)
528  ret void
529}
530
; Same constant offsets as the merged test, but every store passes 64 as the
; last operand. Only the gfx1200 run has expectations here: six separate
; one-dword stores (no merging), each using "null" in the scalar-offset slot.
; Presumably 64 is the swizzle bit on gfx12 (going by the function name) --
; TODO confirm against the intrinsic definition.
; NOTE(review): there are no expectations for the verde, tonga or gfx1100 runs
; in this function; confirm whether that is intentional.
531define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
532; GFX12-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
533; GFX12:       ; %bb.0:
534; GFX12-NEXT:    s_clause 0x5
535; GFX12-NEXT:    buffer_store_b32 v0, off, s[0:3], null offset:4
536; GFX12-NEXT:    buffer_store_b32 v1, off, s[0:3], null offset:8
537; GFX12-NEXT:    buffer_store_b32 v2, off, s[0:3], null offset:12
538; GFX12-NEXT:    buffer_store_b32 v3, off, s[0:3], null offset:16
539; GFX12-NEXT:    buffer_store_b32 v4, off, s[0:3], null offset:28
540; GFX12-NEXT:    buffer_store_b32 v5, off, s[0:3], null offset:32
541; GFX12-NEXT:    s_endpgm
542  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 64)
543  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 64)
544  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 64)
545  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 64)
546  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 64)
547  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 64)
548  ret void
549}
550
; Declarations of the buffer intrinsics exercised above. Every store takes
; (data, <4 x i32> resource, i32, i32, i32). As used in this file, the third
; operand is a byte offset, the fourth is always 0, and the fifth selects
; per-access flags (1 gives glc and 2 gives slc in the verde, tonga and gfx1100
; expectations).
551declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0
552declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0
553declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0
554declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) #0
555declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32) #0
556declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32) #0
557declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1
558declare void @llvm.amdgcn.raw.buffer.store.i8(i8, <4 x i32>, i32, i32, i32) #0
559declare void @llvm.amdgcn.raw.buffer.store.f16(half, <4 x i32>, i32, i32, i32) #0
560declare void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) #0
561declare void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32) #0
562declare void @llvm.amdgcn.raw.buffer.store.i16(i16, <4 x i32>, i32, i32, i32) #0
563declare void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16>, <4 x i32>, i32, i32, i32) #0
564declare void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16>, <4 x i32>, i32, i32, i32) #0
565
; Attribute group #0 is attached to the store declarations (and to some test
; functions); #1 is attached to the load declaration.
566attributes #0 = { nounwind }
567attributes #1 = { nounwind readonly }
568