; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s
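; Test scratch (private, addrspace(5)) i8 and i16 loads and stores using flat
; scratch instructions: zero- and sign-extending loads, D16 lo/hi load and
; store variants, with vgpr, sgpr, and sgpr+vgpr offset addressing.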

; vgpr offset

define amdgpu_ps void @test_scratch_load_i8_zext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_v:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s2
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_v:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v0
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_v:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v0
; GFX12-NEXT:    s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = zext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_v:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s2
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_sbyte v0, v0, off offset:1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_v:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    scratch_load_i8 v0, v0, off offset:1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v0
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_v:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    scratch_load_i8 v0, v0, off offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v0
; GFX12-NEXT:    s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = sext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_zext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_zext_v:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s2
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_ushort v0, v0, off offset:2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_v:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    scratch_load_u16 v0, v0, off offset:2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v0
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_zext_v:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    scratch_load_u16 v0, v0, off offset:2
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v0
; GFX12-NEXT:    s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = zext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_sext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_sext_v:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s2
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_sshort v0, v0, off offset:2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_v:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    scratch_load_i16 v0, v0, off offset:2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v0
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_sext_v:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    scratch_load_i16 v0, v0, off offset:2
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v0
; GFX12-NEXT:    s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = sext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s2
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT:    scratch_load_ubyte_d16 v3, v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v3
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT:    scratch_load_d16_u8 v3, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v3
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT:    scratch_load_d16_u8 v3, v0, off offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v3
; GFX12-NEXT:    s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s2
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT:    scratch_load_sbyte_d16 v3, v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v3
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT:    scratch_load_d16_i8 v3, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v3
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT:    scratch_load_d16_i8 v3, v0, off offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v3
; GFX12-NEXT:    s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s2
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT:    scratch_load_short_d16 v3, v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v3
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_v:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 2, v0
; GFX11-NEXT:    scratch_load_d16_b16 v3, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v3
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_v:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT:    scratch_load_d16_b16 v3, v0, off offset:2
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v3
; GFX12-NEXT:    s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}


define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s2
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, -1
; GFX10-NEXT:    scratch_load_ubyte_d16_hi v3, v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v3
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT:    scratch_load_d16_hi_u8 v3, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v3
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v3, -1
; GFX12-NEXT:    scratch_load_d16_hi_u8 v3, v0, off offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v3
; GFX12-NEXT:    s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s2
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, -1
; GFX10-NEXT:    scratch_load_sbyte_d16_hi v3, v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v3
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT:    scratch_load_d16_hi_i8 v3, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v3
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v3, -1
; GFX12-NEXT:    scratch_load_d16_hi_i8 v3, v0, off offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v3
; GFX12-NEXT:    s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s2
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, -1
; GFX10-NEXT:    scratch_load_short_d16_hi v3, v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v3
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_v:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 2, v0
; GFX11-NEXT:    scratch_load_d16_hi_b16 v3, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v3
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_v:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v3, -1
; GFX12-NEXT:    scratch_load_d16_hi_b16 v3, v0, off offset:2
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v3
; GFX12-NEXT:    s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_v(ptr %in, ptr addrspace(5) %out) {
; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s2
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    flat_load_dword v0, v[0:1]
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v2
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    scratch_store_byte_d16_hi v1, v0, off
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_v:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
; GFX11-NEXT:    v_add_nc_u32_e32 v1, 4, v2
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    scratch_store_d16_hi_b8 v1, v0, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_v:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    scratch_store_d16_hi_b8 v2, v0, off offset:4
; GFX12-NEXT:    s_endpgm
bb:
  %load = load <4 x i8>, ptr %in
  %element = extractelement <4 x i8> %load, i32 2
  %gep = getelementptr <4 x i8>, ptr addrspace(5) %out, i64 1
  store i8 %element, ptr addrspace(5) %gep, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_v(ptr %in, ptr addrspace(5) %out) {
; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_v:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s2
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    flat_load_dword v0, v[0:1]
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 2, v2
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    scratch_store_short_d16_hi v1, v0, off
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_v:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
; GFX11-NEXT:    v_add_nc_u32_e32 v1, 2, v2
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    scratch_store_d16_hi_b16 v1, v0, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_v:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    scratch_store_d16_hi_b16 v2, v0, off offset:2
; GFX12-NEXT:    s_endpgm
bb:
  %load = load <2 x i16>, ptr %in
  %element = extractelement <2 x i16> %load, i32 1
  %gep = getelementptr <2 x i8>, ptr addrspace(5) %out, i64 1
  store i16 %element, ptr addrspace(5) %gep, align 4
  ret void
}




; sgpr offset

define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_s:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_ubyte v2, off, s2 offset:1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[0:1], v2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_s:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    scratch_load_u8 v2, off, s0 offset:1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_s:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    scratch_load_u8 v2, off, s0 offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-NEXT:    s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = zext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_s:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_sbyte v2, off, s2 offset:1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[0:1], v2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_s:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    scratch_load_i8 v2, off, s0 offset:1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_s:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    scratch_load_i8 v2, off, s0 offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-NEXT:    s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = sext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_zext_s:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_ushort v2, off, s2 offset:2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[0:1], v2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_s:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    scratch_load_u16 v2, off, s0 offset:2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_zext_s:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    scratch_load_u16 v2, off, s0 offset:2
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-NEXT:    s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = zext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_sext_s:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_sshort v2, off, s2 offset:2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[0:1], v2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_s:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    scratch_load_i16 v2, off, s0 offset:2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_sext_s:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    scratch_load_i16 v2, off, s0 offset:2
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-NEXT:    s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = sext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff0000
; GFX10-NEXT:    s_add_i32 s2, s2, 1
; GFX10-NEXT:    scratch_load_ubyte_d16 v2, off, s2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[0:1], v2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_mov_b32_e32 v2, 0xffff0000
; GFX11-NEXT:    s_add_i32 s0, s0, 1
; GFX11-NEXT:    scratch_load_d16_u8 v2, off, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v2, 0xffff0000
; GFX12-NEXT:    scratch_load_d16_u8 v2, off, s0 offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-NEXT:    s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff0000
; GFX10-NEXT:    s_add_i32 s2, s2, 1
; GFX10-NEXT:    scratch_load_sbyte_d16 v2, off, s2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[0:1], v2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_mov_b32_e32 v2, 0xffff0000
; GFX11-NEXT:    s_add_i32 s0, s0, 1
; GFX11-NEXT:    scratch_load_d16_i8 v2, off, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v2, 0xffff0000
; GFX12-NEXT:    scratch_load_d16_i8 v2, off, s0 offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-NEXT:    s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff0000
; GFX10-NEXT:    s_add_i32 s2, s2, 2
; GFX10-NEXT:    scratch_load_short_d16 v2, off, s2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[0:1], v2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_s:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_mov_b32_e32 v2, 0xffff0000
; GFX11-NEXT:    s_add_i32 s0, s0, 2
; GFX11-NEXT:    scratch_load_d16_b16 v2, off, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_s:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v2, 0xffff0000
; GFX12-NEXT:    scratch_load_d16_b16 v2, off, s0 offset:2
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-NEXT:    s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}


define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v2, -1
; GFX10-NEXT:    s_add_i32 s2, s2, 1
; GFX10-NEXT:    scratch_load_ubyte_d16_hi v2, off, s2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[0:1], v2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_mov_b32_e32 v2, -1
; GFX11-NEXT:    s_add_i32 s0, s0, 1
; GFX11-NEXT:    scratch_load_d16_hi_u8 v2, off, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v2, -1
; GFX12-NEXT:    scratch_load_d16_hi_u8 v2, off, s0 offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-NEXT:    s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v2, -1
; GFX10-NEXT:    s_add_i32 s2, s2, 1
; GFX10-NEXT:    scratch_load_sbyte_d16_hi v2, off, s2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[0:1], v2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_mov_b32_e32 v2, -1
; GFX11-NEXT:    s_add_i32 s0, s0, 1
; GFX11-NEXT:    scratch_load_d16_hi_i8 v2, off, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v2, -1
; GFX12-NEXT:    scratch_load_d16_hi_i8 v2, off, s0 offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-NEXT:    s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v2, -1
; GFX10-NEXT:    s_add_i32 s2, s2, 2
; GFX10-NEXT:    scratch_load_short_d16_hi v2, off, s2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[0:1], v2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_s:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_mov_b32_e32 v2, -1
; GFX11-NEXT:    s_add_i32 s0, s0, 2
; GFX11-NEXT:    scratch_load_d16_hi_b16 v2, off, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_s:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_mov_b32_e32 v2, -1
; GFX12-NEXT:    scratch_load_d16_hi_b16 v2, off, s0 offset:2
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[0:1], v2
; GFX12-NEXT:    s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_s(ptr %in, ptr addrspace(5) inreg %out) {
; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    flat_load_dword v0, v[0:1]
; GFX10-NEXT:    s_add_i32 s2, s2, 4
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    scratch_store_byte_d16_hi off, v0, s2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_s:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
; GFX11-NEXT:    s_add_i32 s0, s0, 4
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    scratch_store_d16_hi_b8 off, v0, s0
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_s:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    scratch_store_d16_hi_b8 off, v0, s0 offset:4
; GFX12-NEXT:    s_endpgm
bb:
  %load = load <4 x i8>, ptr %in
  %element = extractelement <4 x i8> %load, i32 2
  %gep = getelementptr <4 x i8>, ptr addrspace(5) %out, i64 1
  store i8 %element, ptr addrspace(5) %gep, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_s(ptr %in, ptr addrspace(5) inreg %out) {
; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_s:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    flat_load_dword v0, v[0:1]
; GFX10-NEXT:    s_add_i32 s2, s2, 2
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    scratch_store_short_d16_hi off, v0, s2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_s:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
; GFX11-NEXT:    s_add_i32 s0, s0, 2
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v0, s0
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_s:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    scratch_store_d16_hi_b16 off, v0, s0 offset:2
; GFX12-NEXT:    s_endpgm
bb:
  %load = load <2 x i16>, ptr %in
  %element = extractelement <2 x i16> %load, i32 1
  %gep = getelementptr <2 x i8>, ptr addrspace(5) %out, i64 1
  store i16 %element, ptr addrspace(5) %gep, align 4
  ret void
}




; sgpr + vgpr offset

define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_svs:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_svs:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v0
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_svs:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT:    scratch_load_u8 v0, v0, s0 offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v0
; GFX12-NEXT:    s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i8, ptr addrspace(5) %gep0, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = zext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_svs:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT:    scratch_load_sbyte v0, v0, off offset:1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_svs:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT:    scratch_load_i8 v0, v0, off offset:1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v0
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_svs:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT:    scratch_load_i8 v0, v0, s0 offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v0
; GFX12-NEXT:    s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i8, ptr addrspace(5) %gep0, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = sext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_zext_svs:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT:    scratch_load_ushort v0, v0, off offset:2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_svs:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT:    scratch_load_u16 v0, v0, off offset:2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v0
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_zext_svs:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT:    scratch_load_u16 v0, v0, s0 offset:2
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v0
; GFX12-NEXT:    s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i16, ptr addrspace(5) %gep0, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = zext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_sext_svs:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT:    scratch_load_sshort v0, v0, off offset:2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_svs:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT:    scratch_load_i16 v0, v0, off offset:2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v0
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_sext_svs:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT:    scratch_load_i16 v0, v0, s0 offset:2
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v0
; GFX12-NEXT:    s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i16, ptr addrspace(5) %gep0, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = sext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT:    scratch_load_ubyte_d16 v3, v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v3
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT:    scratch_load_d16_u8 v3, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v3
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT:    scratch_load_d16_u8 v3, v0, s0 offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v3
; GFX12-NEXT:    s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT:    scratch_load_sbyte_d16 v3, v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v3
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT:    scratch_load_d16_i8 v3, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v3
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT:    scratch_load_d16_i8 v3, v0, s0 offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v3
; GFX12-NEXT:    s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_svs:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 2
; GFX10-NEXT:    scratch_load_short_d16 v3, v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v3
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_svs:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 2
; GFX11-NEXT:    scratch_load_d16_b16 v3, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v3
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_svs:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT:    scratch_load_d16_b16 v3, v0, s0 offset:2
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v3
; GFX12-NEXT:    s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i16, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}


define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, -1
; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT:    scratch_load_ubyte_d16_hi v3, v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v3
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT:    scratch_load_d16_hi_u8 v3, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v3
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT:    scratch_load_d16_hi_u8 v3, v0, s0 offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v3
; GFX12-NEXT:    s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, -1
; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT:    scratch_load_sbyte_d16_hi v3, v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v3
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT:    scratch_load_d16_hi_i8 v3, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v3
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT:    scratch_load_d16_hi_i8 v3, v0, s0 offset:1
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v3
; GFX12-NEXT:    s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_svs:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, -1
; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 2
; GFX10-NEXT:    scratch_load_short_d16_hi v3, v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_store_dword v[1:2], v3
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_svs:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 2
; GFX11-NEXT:    scratch_load_d16_hi_b16 v3, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b32 v[1:2], v3
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_svs:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT:    scratch_load_d16_hi_b16 v3, v0, s0 offset:2
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    flat_store_b32 v[1:2], v3
; GFX12-NEXT:    s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i16, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_svs(ptr %in, ptr addrspace(5) inreg %out, i32 %voffset) {
; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_svs:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    flat_load_dword v0, v[0:1]
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
; GFX10-NEXT:    v_add3_u32 v1, s2, v1, 4
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    scratch_store_byte_d16_hi v1, v0, off
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_svs:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v1, s0, v1, 4
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    scratch_store_d16_hi_b8 v1, v0, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_svs:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    scratch_store_d16_hi_b8 v1, v0, s0 offset:4
; GFX12-NEXT:    s_endpgm
bb:
  %load = load <4 x i8>, ptr %in
  %element = extractelement <4 x i8> %load, i32 2
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %out, i32 %voffset4
  %gep = getelementptr <4 x i8>, ptr addrspace(5) %gep0, i64 1
  store i8 %element, ptr addrspace(5) %gep, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_svs(ptr %in, ptr addrspace(5) inreg %out, i32 %voffset) {
; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_svs:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    flat_load_dword v0, v[0:1]
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
; GFX10-NEXT:    v_add3_u32 v1, s2, v1, 2
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    scratch_store_short_d16_hi v1, v0, off
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_svs:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add3_u32 v1, s0, v1, 2
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    scratch_store_d16_hi_b16 v1, v0, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_svs:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    scratch_store_d16_hi_b16 v1, v0, s0 offset:2
; GFX12-NEXT:    s_endpgm
bb:
  %load = load <2 x i16>, ptr %in
  %element = extractelement <2 x i16> %load, i32 1
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %out, i32 %voffset4
  %gep = getelementptr <2 x i8>, ptr addrspace(5) %gep0, i64 1
  store i16 %element, ptr addrspace(5) %gep, align 4
  ret void
}