xref: /llvm-project/llvm/test/CodeGen/AMDGPU/memory_clause.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx902 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SCRATCH %s
4
; vector_clause: each work-item reads llvm.amdgcn.workitem.id.x, loads four
; consecutive <4 x i32> values from %arg at element indices id, id+1, id+2 and
; id+3, and stores them to the same element indices of %arg1. All four loads
; precede all four stores in the IR.
; Expected output: both runs emit four adjacent global_load_dwordx4
; instructions; only the gfx1030 run expects an "s_clause 0x3" in front of them.
5define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) {
6; GCN-LABEL: vector_clause:
7; GCN:       ; %bb.0: ; %bb
8; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
9; GCN-NEXT:    v_lshlrev_b32_e32 v16, 4, v0
10; GCN-NEXT:    s_waitcnt lgkmcnt(0)
11; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
12; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
13; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
14; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
15; GCN-NEXT:    s_waitcnt vmcnt(3)
16; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
17; GCN-NEXT:    s_waitcnt vmcnt(3)
18; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
19; GCN-NEXT:    s_waitcnt vmcnt(3)
20; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
21; GCN-NEXT:    s_waitcnt vmcnt(3)
22; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
23; GCN-NEXT:    s_endpgm
24;
25; GCN-SCRATCH-LABEL: vector_clause:
26; GCN-SCRATCH:       ; %bb.0: ; %bb
27; GCN-SCRATCH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
28; GCN-SCRATCH-NEXT:    v_lshlrev_b32_e32 v16, 4, v0
29; GCN-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
30; GCN-SCRATCH-NEXT:    s_clause 0x3
31; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
32; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
33; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
34; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
35; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(3)
36; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
37; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(2)
38; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
39; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
40; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
41; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
42; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
43; GCN-SCRATCH-NEXT:    s_endpgm
44bb:
45  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
46  %tmp2 = zext i32 %tmp to i64
47  %tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
48  %tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
49  %tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
50  %tmp6 = add nuw nsw i64 %tmp2, 1
51  %tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
52  %tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
53  %tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
54  %tmp10 = add nuw nsw i64 %tmp2, 2
55  %tmp11 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp10
56  %tmp12 = load <4 x i32>, ptr addrspace(1) %tmp11, align 16
57  %tmp13 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp10
58  %tmp14 = add nuw nsw i64 %tmp2, 3
59  %tmp15 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp14
60  %tmp16 = load <4 x i32>, ptr addrspace(1) %tmp15, align 16
61  %tmp17 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp14
62  store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
63  store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
64  store <4 x i32> %tmp12, ptr addrspace(1) %tmp13, align 16
65  store <4 x i32> %tmp16, ptr addrspace(1) %tmp17, align 16
66  ret void
67}
68
; scalar_clause: loads four consecutive <4 x i32> values from %arg at the
; constant element indices 0..3 (no work-item id is involved) and stores them
; to the same element indices of %arg1.
; Expected output: both runs fetch the data with a single s_load_dwordx16,
; copy it into v0..v15 and issue four global_store_dwordx4; neither run
; expects an s_clause.
69define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) {
70; GCN-LABEL: scalar_clause:
71; GCN:       ; %bb.0: ; %bb
72; GCN-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x24
73; GCN-NEXT:    v_mov_b32_e32 v16, 0
74; GCN-NEXT:    s_waitcnt lgkmcnt(0)
75; GCN-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
76; GCN-NEXT:    s_waitcnt lgkmcnt(0)
77; GCN-NEXT:    v_mov_b32_e32 v0, s0
78; GCN-NEXT:    v_mov_b32_e32 v1, s1
79; GCN-NEXT:    v_mov_b32_e32 v2, s2
80; GCN-NEXT:    v_mov_b32_e32 v3, s3
81; GCN-NEXT:    v_mov_b32_e32 v4, s4
82; GCN-NEXT:    v_mov_b32_e32 v5, s5
83; GCN-NEXT:    v_mov_b32_e32 v6, s6
84; GCN-NEXT:    v_mov_b32_e32 v7, s7
85; GCN-NEXT:    v_mov_b32_e32 v8, s8
86; GCN-NEXT:    v_mov_b32_e32 v9, s9
87; GCN-NEXT:    v_mov_b32_e32 v10, s10
88; GCN-NEXT:    v_mov_b32_e32 v11, s11
89; GCN-NEXT:    v_mov_b32_e32 v12, s12
90; GCN-NEXT:    v_mov_b32_e32 v13, s13
91; GCN-NEXT:    v_mov_b32_e32 v14, s14
92; GCN-NEXT:    v_mov_b32_e32 v15, s15
93; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[18:19]
94; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[18:19] offset:16
95; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[18:19] offset:32
96; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[18:19] offset:48
97; GCN-NEXT:    s_endpgm
98;
99; GCN-SCRATCH-LABEL: scalar_clause:
100; GCN-SCRATCH:       ; %bb.0: ; %bb
101; GCN-SCRATCH-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x24
102; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v16, 0
103; GCN-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
104; GCN-SCRATCH-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
105; GCN-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
106; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v0, s0
107; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v1, s1
108; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v2, s2
109; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v3, s3
110; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v4, s4
111; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v5, s5
112; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v6, s6
113; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v7, s7
114; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v8, s8
115; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v9, s9
116; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v10, s10
117; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v11, s11
118; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v12, s12
119; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v13, s13
120; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v14, s14
121; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v15, s15
122; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[0:3], s[18:19]
123; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[4:7], s[18:19] offset:16
124; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[8:11], s[18:19] offset:32
125; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[12:15], s[18:19] offset:48
126; GCN-SCRATCH-NEXT:    s_endpgm
127bb:
128  %tmp = load <4 x i32>, ptr addrspace(1) %arg, align 16
129  %tmp2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 1
130  %tmp3 = load <4 x i32>, ptr addrspace(1) %tmp2, align 16
131  %tmp4 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 1
132  %tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 2
133  %tmp6 = load <4 x i32>, ptr addrspace(1) %tmp5, align 16
134  %tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 2
135  %tmp8 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 3
136  %tmp9 = load <4 x i32>, ptr addrspace(1) %tmp8, align 16
137  %tmp10 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 3
138  store <4 x i32> %tmp, ptr addrspace(1) %arg1, align 16
139  store <4 x i32> %tmp3, ptr addrspace(1) %tmp4, align 16
140  store <4 x i32> %tmp6, ptr addrspace(1) %tmp7, align 16
141  store <4 x i32> %tmp9, ptr addrspace(1) %tmp10, align 16
142  ret void
143}
144
; mubuf_clause: non-kernel function operating on addrspace(5) pointers. It
; loads four consecutive <4 x i32> values from %arg at element indices
; id..id+3 (id = llvm.amdgcn.workitem.id.x) and stores them to the same
; element indices of %arg1.
; Expected output, gfx902 run: sixteen buffer_load_dword instructions, with an
; "s_nop 0" in front of the last one, whose destination v0 is also its address
; operand. NOTE(review): the s_nop is presumably there to split the load group
; because of that overwrite -- confirm against the backend.
; Expected output, gfx1030 flat-scratch run: "s_clause 0x3" followed by four
; scratch_load_dwordx4 instructions.
145define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr addrspace(5) noalias nocapture %arg1) {
146; GCN-LABEL: mubuf_clause:
147; GCN:       ; %bb.0: ; %bb
148; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149; GCN-NEXT:    v_lshlrev_b32_e32 v2, 4, v31
150; GCN-NEXT:    v_and_b32_e32 v2, 0x3ff0, v2
151; GCN-NEXT:    v_add_u32_e32 v0, v0, v2
152; GCN-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen offset:12
153; GCN-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
154; GCN-NEXT:    buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
155; GCN-NEXT:    buffer_load_dword v6, v0, s[0:3], 0 offen
156; GCN-NEXT:    buffer_load_dword v7, v0, s[0:3], 0 offen offset:28
157; GCN-NEXT:    buffer_load_dword v8, v0, s[0:3], 0 offen offset:24
158; GCN-NEXT:    buffer_load_dword v9, v0, s[0:3], 0 offen offset:20
159; GCN-NEXT:    buffer_load_dword v10, v0, s[0:3], 0 offen offset:16
160; GCN-NEXT:    buffer_load_dword v11, v0, s[0:3], 0 offen offset:44
161; GCN-NEXT:    buffer_load_dword v12, v0, s[0:3], 0 offen offset:40
162; GCN-NEXT:    buffer_load_dword v13, v0, s[0:3], 0 offen offset:36
163; GCN-NEXT:    buffer_load_dword v14, v0, s[0:3], 0 offen offset:32
164; GCN-NEXT:    buffer_load_dword v15, v0, s[0:3], 0 offen offset:60
165; GCN-NEXT:    buffer_load_dword v16, v0, s[0:3], 0 offen offset:56
166; GCN-NEXT:    buffer_load_dword v17, v0, s[0:3], 0 offen offset:52
167; GCN-NEXT:    s_nop 0
168; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:48
169; GCN-NEXT:    v_add_u32_e32 v1, v1, v2
170; GCN-NEXT:    s_waitcnt vmcnt(15)
171; GCN-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen offset:12
172; GCN-NEXT:    s_waitcnt vmcnt(15)
173; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
174; GCN-NEXT:    s_waitcnt vmcnt(15)
175; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen offset:4
176; GCN-NEXT:    s_waitcnt vmcnt(15)
177; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
178; GCN-NEXT:    s_waitcnt vmcnt(15)
179; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen offset:28
180; GCN-NEXT:    s_waitcnt vmcnt(15)
181; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen offset:24
182; GCN-NEXT:    s_waitcnt vmcnt(15)
183; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen offset:20
184; GCN-NEXT:    s_waitcnt vmcnt(15)
185; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen offset:16
186; GCN-NEXT:    s_waitcnt vmcnt(15)
187; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen offset:44
188; GCN-NEXT:    s_waitcnt vmcnt(15)
189; GCN-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen offset:40
190; GCN-NEXT:    s_waitcnt vmcnt(15)
191; GCN-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen offset:36
192; GCN-NEXT:    s_waitcnt vmcnt(15)
193; GCN-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen offset:32
194; GCN-NEXT:    s_waitcnt vmcnt(15)
195; GCN-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen offset:60
196; GCN-NEXT:    s_waitcnt vmcnt(15)
197; GCN-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen offset:56
198; GCN-NEXT:    s_waitcnt vmcnt(15)
199; GCN-NEXT:    buffer_store_dword v17, v1, s[0:3], 0 offen offset:52
200; GCN-NEXT:    s_waitcnt vmcnt(15)
201; GCN-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:48
202; GCN-NEXT:    s_waitcnt vmcnt(0)
203; GCN-NEXT:    s_setpc_b64 s[30:31]
204;
205; GCN-SCRATCH-LABEL: mubuf_clause:
206; GCN-SCRATCH:       ; %bb.0: ; %bb
207; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
208; GCN-SCRATCH-NEXT:    v_lshlrev_b32_e32 v2, 4, v31
209; GCN-SCRATCH-NEXT:    v_and_b32_e32 v18, 0x3ff0, v2
210; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v0, v0, v18
211; GCN-SCRATCH-NEXT:    s_clause 0x3
212; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[2:5], v0, off
213; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[6:9], v0, off offset:16
214; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[10:13], v0, off offset:32
215; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[14:17], v0, off offset:48
216; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v0, v1, v18
217; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(3)
218; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[2:5], off
219; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(2)
220; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[6:9], off offset:16
221; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
222; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[10:13], off offset:32
223; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
224; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[14:17], off offset:48
225; GCN-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
226bb:
227  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
228  %tmp2 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp
229  %tmp3 = load <4 x i32>, ptr addrspace(5) %tmp2, align 16
230  %tmp4 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp
231  %tmp5 = add nuw nsw i32 %tmp, 1
232  %tmp6 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp5
233  %tmp7 = load <4 x i32>, ptr addrspace(5) %tmp6, align 16
234  %tmp8 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp5
235  %tmp9 = add nuw nsw i32 %tmp, 2
236  %tmp10 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp9
237  %tmp11 = load <4 x i32>, ptr addrspace(5) %tmp10, align 16
238  %tmp12 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp9
239  %tmp13 = add nuw nsw i32 %tmp, 3
240  %tmp14 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp13
241  %tmp15 = load <4 x i32>, ptr addrspace(5) %tmp14, align 16
242  %tmp16 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp13
243  store <4 x i32> %tmp3, ptr addrspace(5) %tmp4, align 16
244  store <4 x i32> %tmp7, ptr addrspace(5) %tmp8, align 16
245  store <4 x i32> %tmp11, ptr addrspace(5) %tmp12, align 16
246  store <4 x i32> %tmp15, ptr addrspace(5) %tmp16, align 16
247  ret void
248}
249
; vector_clause_indirect: each work-item loads a pointer from %arg at element
; index llvm.amdgcn.workitem.id.x, loads two consecutive <4 x i32> values
; through that pointer, and stores them to elements 0 and 1 of %arg2. %arg1 is
; not referenced in the body.
; Expected output: in both runs the pointer load is followed by
; "s_waitcnt vmcnt(0)" before the two dependent global_load_dwordx4
; instructions. Only the gfx1030 run expects "s_clause 0x1", once before the
; pair of s_load_dwordx2 and once before the pair of global_load_dwordx4.
250define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture readnone %arg1, ptr addrspace(1) noalias nocapture %arg2) {
251; GCN-LABEL: vector_clause_indirect:
252; GCN:       ; %bb.0: ; %bb
253; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
254; GCN-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
255; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
256; GCN-NEXT:    s_waitcnt lgkmcnt(0)
257; GCN-NEXT:    global_load_dwordx2 v[8:9], v0, s[0:1]
258; GCN-NEXT:    s_waitcnt vmcnt(0)
259; GCN-NEXT:    global_load_dwordx4 v[0:3], v[8:9], off
260; GCN-NEXT:    global_load_dwordx4 v[4:7], v[8:9], off offset:16
261; GCN-NEXT:    v_mov_b32_e32 v8, 0
262; GCN-NEXT:    s_waitcnt vmcnt(1)
263; GCN-NEXT:    global_store_dwordx4 v8, v[0:3], s[2:3]
264; GCN-NEXT:    s_waitcnt vmcnt(1)
265; GCN-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
266; GCN-NEXT:    s_endpgm
267;
268; GCN-SCRATCH-LABEL: vector_clause_indirect:
269; GCN-SCRATCH:       ; %bb.0: ; %bb
270; GCN-SCRATCH-NEXT:    s_clause 0x1
271; GCN-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
272; GCN-SCRATCH-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
273; GCN-SCRATCH-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
274; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v8, 0
275; GCN-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
276; GCN-SCRATCH-NEXT:    global_load_dwordx2 v[4:5], v0, s[0:1]
277; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
278; GCN-SCRATCH-NEXT:    s_clause 0x1
279; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[4:5], off
280; GCN-SCRATCH-NEXT:    global_load_dwordx4 v[4:7], v[4:5], off offset:16
281; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
282; GCN-SCRATCH-NEXT:    global_store_dwordx4 v8, v[0:3], s[2:3]
283; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
284; GCN-SCRATCH-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
285; GCN-SCRATCH-NEXT:    s_endpgm
286bb:
287  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
288  %tmp3 = zext i32 %tmp to i64
289  %tmp4 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp3
290  %tmp6 = load ptr addrspace(1), ptr addrspace(1) %tmp4, align 8
291  %tmp7 = load <4 x i32>, ptr addrspace(1) %tmp6, align 16
292  %tmp8 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %tmp6, i64 1
293  %tmp9 = load <4 x i32>, ptr addrspace(1) %tmp8, align 16
294  store <4 x i32> %tmp7, ptr addrspace(1) %arg2, align 16
295  %tmp10 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg2, i64 1
296  store <4 x i32> %tmp9, ptr addrspace(1) %tmp10, align 16
297  ret void
298}
299
; load_global_d16_hi: loads two i16 values, one from %in and one 32 i16
; elements past %in. Each is inserted as element 1 of a <2 x i16> whose
; element 0 is %reg; the two vectors are stored to %out and to 32 <2 x i16>
; elements past %out.
; Expected output: both runs use two global_load_short_d16_hi instructions.
; The gfx902 run expects an "s_nop 0" between them; the gfx1030 run expects
; "s_clause 0x1" in front of them instead.
300define void @load_global_d16_hi(ptr addrspace(1) %in, i16 %reg, ptr addrspace(1) %out) {
301; GCN-LABEL: load_global_d16_hi:
302; GCN:       ; %bb.0: ; %entry
303; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304; GCN-NEXT:    v_mov_b32_e32 v5, v2
305; GCN-NEXT:    global_load_short_d16_hi v5, v[0:1], off
306; GCN-NEXT:    s_nop 0
307; GCN-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:64
308; GCN-NEXT:    s_waitcnt vmcnt(1)
309; GCN-NEXT:    global_store_dword v[3:4], v5, off
310; GCN-NEXT:    s_waitcnt vmcnt(1)
311; GCN-NEXT:    global_store_dword v[3:4], v2, off offset:128
312; GCN-NEXT:    s_waitcnt vmcnt(0)
313; GCN-NEXT:    s_setpc_b64 s[30:31]
314;
315; GCN-SCRATCH-LABEL: load_global_d16_hi:
316; GCN-SCRATCH:       ; %bb.0: ; %entry
317; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v5, v2
319; GCN-SCRATCH-NEXT:    s_clause 0x1
320; GCN-SCRATCH-NEXT:    global_load_short_d16_hi v5, v[0:1], off
321; GCN-SCRATCH-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:64
322; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
323; GCN-SCRATCH-NEXT:    global_store_dword v[3:4], v5, off
324; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
325; GCN-SCRATCH-NEXT:    global_store_dword v[3:4], v2, off offset:128
326; GCN-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
327entry:
328  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 32
329  %load1 = load i16, ptr addrspace(1) %in
330  %load2 = load i16, ptr addrspace(1) %gep
331  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
332  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
333  store <2 x i16> %build1, ptr addrspace(1) %out
334  %build2 = insertelement <2 x i16> undef, i16 %reg, i32 0
335  %build3 = insertelement <2 x i16> %build2, i16 %load2, i32 1
336  %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 32
337  store <2 x i16> %build3, ptr addrspace(1) %gep2
338  ret void
339}
340
; load_global_d16_lo: loads two i16 values, one from %in and one 32 i16
; elements past %in. Each replaces element 0 of %reg bitcast to <2 x i16>;
; the two vectors are stored to %out and to 32 <2 x i16> elements past %out.
; Expected output: both runs use two global_load_short_d16 instructions.
; The gfx902 run expects an "s_nop 0" between them; the gfx1030 run expects
; "s_clause 0x1" in front of them instead.
341define void @load_global_d16_lo(ptr addrspace(1) %in, i32 %reg, ptr addrspace(1) %out) {
342; GCN-LABEL: load_global_d16_lo:
343; GCN:       ; %bb.0: ; %entry
344; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
345; GCN-NEXT:    v_mov_b32_e32 v5, v2
346; GCN-NEXT:    global_load_short_d16 v5, v[0:1], off
347; GCN-NEXT:    s_nop 0
348; GCN-NEXT:    global_load_short_d16 v2, v[0:1], off offset:64
349; GCN-NEXT:    s_waitcnt vmcnt(1)
350; GCN-NEXT:    global_store_dword v[3:4], v5, off
351; GCN-NEXT:    s_waitcnt vmcnt(1)
352; GCN-NEXT:    global_store_dword v[3:4], v2, off offset:128
353; GCN-NEXT:    s_waitcnt vmcnt(0)
354; GCN-NEXT:    s_setpc_b64 s[30:31]
355;
356; GCN-SCRATCH-LABEL: load_global_d16_lo:
357; GCN-SCRATCH:       ; %bb.0: ; %entry
358; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
359; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v5, v2
360; GCN-SCRATCH-NEXT:    s_clause 0x1
361; GCN-SCRATCH-NEXT:    global_load_short_d16 v5, v[0:1], off
362; GCN-SCRATCH-NEXT:    global_load_short_d16 v2, v[0:1], off offset:64
363; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
364; GCN-SCRATCH-NEXT:    global_store_dword v[3:4], v5, off
365; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
366; GCN-SCRATCH-NEXT:    global_store_dword v[3:4], v2, off offset:128
367; GCN-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
368entry:
369  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 32
370  %reg.bc1 = bitcast i32 %reg to <2 x i16>
371  %reg.bc2 = bitcast i32 %reg to <2 x i16>
372  %load1 = load i16, ptr addrspace(1) %in
373  %load2 = load i16, ptr addrspace(1) %gep
374  %build1 = insertelement <2 x i16> %reg.bc1, i16 %load1, i32 0
375  %build2 = insertelement <2 x i16> %reg.bc2, i16 %load2, i32 0
376  %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 32
377  store <2 x i16> %build1, ptr addrspace(1) %out
378  store <2 x i16> %build2, ptr addrspace(1) %gep2
379  ret void
380}
381
; flat_scratch_load: volatile-stores 5.5 to an addrspace(5) alloca, executes an
; empty side-effecting inline asm, reloads the slot, samples an image with
; coordinates %a/%b and descriptor %desc, adds element 0 of the sample to the
; reloaded value and exports the sum.
; Expected output, gfx902 run: buffer_load_dword, then "s_nop 0", then
; image_sample. Expected output, gfx1030 flat-scratch run: scratch_load_dword
; with no register address operands, followed later by image_sample, with no
; s_clause covering the two.
382define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc) {
383; GCN-LABEL: flat_scratch_load:
384; GCN:       ; %bb.0: ; %.entry
385; GCN-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
386; GCN-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
387; GCN-NEXT:    s_mov_b32 s18, -1
388; GCN-NEXT:    s_mov_b32 s19, 0xe00000
389; GCN-NEXT:    s_add_u32 s16, s16, s11
390; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
391; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
392; GCN-NEXT:    s_addc_u32 s17, s17, 0
393; GCN-NEXT:    v_mov_b32_e32 v0, 0x40b00000
394; GCN-NEXT:    buffer_store_dword v0, off, s[16:19], 0
395; GCN-NEXT:    s_waitcnt vmcnt(0)
396; GCN-NEXT:    s_brev_b32 s0, 1
397; GCN-NEXT:    s_waitcnt lgkmcnt(0)
398; GCN-NEXT:    v_mov_b32_e32 v0, s6
399; GCN-NEXT:    s_mov_b32 s3, 0
400; GCN-NEXT:    s_mov_b32 s1, s0
401; GCN-NEXT:    s_mov_b32 s2, s0
402; GCN-NEXT:    v_mov_b32_e32 v1, s7
403; GCN-NEXT:    ;;#ASMSTART
404; GCN-NEXT:    ;;#ASMEND
405; GCN-NEXT:    buffer_load_dword v2, off, s[16:19], 0
406; GCN-NEXT:    s_nop 0
407; GCN-NEXT:    image_sample v0, v[0:1], s[8:15], s[0:3] dmask:0x1
408; GCN-NEXT:    s_waitcnt vmcnt(0)
409; GCN-NEXT:    v_add_f32_e32 v0, v2, v0
410; GCN-NEXT:    exp mrt0 v0, off, off, off done vm
411; GCN-NEXT:    s_endpgm
412;
413; GCN-SCRATCH-LABEL: flat_scratch_load:
414; GCN-SCRATCH:       ; %bb.0: ; %.entry
415; GCN-SCRATCH-NEXT:    s_add_u32 s8, s8, s13
416; GCN-SCRATCH-NEXT:    s_addc_u32 s9, s9, 0
417; GCN-SCRATCH-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
418; GCN-SCRATCH-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
419; GCN-SCRATCH-NEXT:    s_clause 0x1
420; GCN-SCRATCH-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x24
421; GCN-SCRATCH-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x44
422; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x40b00000
423; GCN-SCRATCH-NEXT:    s_brev_b32 s8, 1
424; GCN-SCRATCH-NEXT:    s_mov_b32 s9, s8
425; GCN-SCRATCH-NEXT:    scratch_store_dword off, v0, off
426; GCN-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
427; GCN-SCRATCH-NEXT:    ;;#ASMSTART
428; GCN-SCRATCH-NEXT:    ;;#ASMEND
429; GCN-SCRATCH-NEXT:    scratch_load_dword v2, off, off
430; GCN-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
431; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v0, s10
432; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v1, s11
433; GCN-SCRATCH-NEXT:    s_mov_b32 s11, 0
434; GCN-SCRATCH-NEXT:    s_mov_b32 s10, s8
435; GCN-SCRATCH-NEXT:    image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
436; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
437; GCN-SCRATCH-NEXT:    v_add_f32_e32 v0, v2, v0
438; GCN-SCRATCH-NEXT:    exp mrt0 v0, off, off, off done vm
439; GCN-SCRATCH-NEXT:    s_endpgm
440.entry:
441  %alloca = alloca float, align 4, addrspace(5)
442  store volatile float 5.5, ptr addrspace(5) %alloca
443  call void asm sideeffect "", ""()
444  ; There was a bug with flat scratch instructions that do not use any address registers (ST mode).
445  ; To trigger, the scratch_load has to be immediately before the image_sample in MIR.
446  %load = load float, ptr addrspace(5) %alloca
447  %val = call <2 x float> @llvm.amdgcn.image.sample.2d.v2f32.f32(i32 9, float %a, float %b, <8 x i32> %desc, <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 0>, i1 false, i32 0, i32 0)
448  %val0 = extractelement <2 x float> %val, i32 0
449  %valadd = fadd float %load, %val0
450  call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %valadd, float undef, float undef, float undef, i1 true, i1 true)
451  ret void
452}
453
; flat_scratch_load_clause: volatile-stores 5.5 and 6.5 to two addrspace(5)
; allocas, executes an empty side-effecting inline asm, reloads both slots,
; adds them and exports the sum. The kernel arguments are not referenced in
; the body.
; Expected output, gfx902 run: two adjacent buffer_load_dword instructions.
; Expected output, gfx1030 flat-scratch run: "s_clause 0x1" followed by two
; scratch_load_dword instructions.
454define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32> %desc) {
455; GCN-LABEL: flat_scratch_load_clause:
456; GCN:       ; %bb.0: ; %.entry
457; GCN-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
458; GCN-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
459; GCN-NEXT:    s_mov_b32 s14, -1
460; GCN-NEXT:    s_mov_b32 s15, 0xe00000
461; GCN-NEXT:    s_add_u32 s12, s12, s11
462; GCN-NEXT:    s_addc_u32 s13, s13, 0
463; GCN-NEXT:    v_mov_b32_e32 v0, 0x40b00000
464; GCN-NEXT:    buffer_store_dword v0, off, s[12:15], 0
465; GCN-NEXT:    s_waitcnt vmcnt(0)
466; GCN-NEXT:    v_mov_b32_e32 v0, 0x40d00000
467; GCN-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4
468; GCN-NEXT:    s_waitcnt vmcnt(0)
469; GCN-NEXT:    ;;#ASMSTART
470; GCN-NEXT:    ;;#ASMEND
471; GCN-NEXT:    buffer_load_dword v0, off, s[12:15], 0
472; GCN-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4
473; GCN-NEXT:    s_waitcnt vmcnt(0)
474; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
475; GCN-NEXT:    exp mrt0 v0, off, off, off done vm
476; GCN-NEXT:    s_endpgm
477;
478; GCN-SCRATCH-LABEL: flat_scratch_load_clause:
479; GCN-SCRATCH:       ; %bb.0: ; %.entry
480; GCN-SCRATCH-NEXT:    s_add_u32 s8, s8, s13
481; GCN-SCRATCH-NEXT:    s_addc_u32 s9, s9, 0
482; GCN-SCRATCH-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
483; GCN-SCRATCH-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
484; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x40b00000
485; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x40d00000
486; GCN-SCRATCH-NEXT:    scratch_store_dword off, v0, off
487; GCN-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
488; GCN-SCRATCH-NEXT:    scratch_store_dword off, v1, off offset:4
489; GCN-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
490; GCN-SCRATCH-NEXT:    ;;#ASMSTART
491; GCN-SCRATCH-NEXT:    ;;#ASMEND
492; GCN-SCRATCH-NEXT:    s_clause 0x1
493; GCN-SCRATCH-NEXT:    scratch_load_dword v0, off, off
494; GCN-SCRATCH-NEXT:    scratch_load_dword v1, off, off offset:4
495; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
496; GCN-SCRATCH-NEXT:    v_add_f32_e32 v0, v0, v1
497; GCN-SCRATCH-NEXT:    exp mrt0 v0, off, off, off done vm
498; GCN-SCRATCH-NEXT:    s_endpgm
499.entry:
500  %alloca = alloca float, align 4, addrspace(5)
501  %alloca2 = alloca float, align 4, addrspace(5)
502  store volatile float 5.5, ptr addrspace(5) %alloca
503  store volatile float 6.5, ptr addrspace(5) %alloca2
504  call void asm sideeffect "", ""()
505  %load0 = load float, ptr addrspace(5) %alloca
506  %load1 = load float, ptr addrspace(5) %alloca2
507  %valadd = fadd float %load0, %load1
508  call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %valadd, float undef, float undef, float undef, i1 true, i1 true)
509  ret void
510}
511
; Declarations of the AMDGPU intrinsics called by the test functions in this
; file: work-item id query, export, and 2D image sample.
512declare i32 @llvm.amdgcn.workitem.id.x()
513declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg)
514declare <2 x float> @llvm.amdgcn.image.sample.2d.v2f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg)
515