xref: /llvm-project/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 %s -o - | FileCheck %s
3
; Aggregate wrapping a 32-element i32 array, i.e. 128 bytes, which is the same
; length as the 128-byte copies that use @shared in this file.
4%struct.S = type { [32 x i32] }
5
; addrspace(3) global with 4-byte alignment; it is the memcpy destination in
; the memcpy_p3_p4_* kernels and the memcpy source in the memcpy_p0_p3_* kernels.
6@shared = addrspace(3) global %struct.S undef, align 4
7
; Kernel performing a constant 47-byte llvm.memcpy from %src to %dest, both
; addrspace(0) pointers, under attribute group #0 (minsize).
; The assertions expect a straight-line expansion ending in s_endpgm: flat
; loads/stores of 16 + 16 + 12 + 2 + 1 bytes at offsets 0, 16, 32, 44 and 46.
8define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 {
9; CHECK-LABEL: memcpy_p0_p0_minsize:
10; CHECK:       ; %bb.0: ; %entry
11; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
12; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
13; CHECK-NEXT:    v_mov_b32_e32 v12, s3
14; CHECK-NEXT:    v_mov_b32_e32 v11, s2
15; CHECK-NEXT:    flat_load_ubyte v13, v[11:12] offset:46
16; CHECK-NEXT:    flat_load_ushort v14, v[11:12] offset:44
17; CHECK-NEXT:    flat_load_dwordx3 v[8:10], v[11:12] offset:32
18; CHECK-NEXT:    flat_load_dwordx4 v[0:3], v[11:12] offset:16
19; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[11:12]
20; CHECK-NEXT:    v_mov_b32_e32 v12, s1
21; CHECK-NEXT:    v_mov_b32_e32 v11, s0
22; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
23; CHECK-NEXT:    flat_store_byte v[11:12], v13 offset:46
24; CHECK-NEXT:    flat_store_short v[11:12], v14 offset:44
25; CHECK-NEXT:    flat_store_dwordx3 v[11:12], v[8:10] offset:32
26; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[0:3] offset:16
27; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
28; CHECK-NEXT:    s_endpgm
29entry:
30  tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
31  ret void
32}
33
; Kernel performing a constant 47-byte llvm.memcpy between two addrspace(1)
; pointers under attribute group #0 (minsize).
; The assertions expect global loads/stores only: two 16-byte accesses at
; offsets 0 and 16, plus two overlapping 8-byte accesses at offsets 32 and 39
; that together cover the remaining 15 bytes.
34define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #0 {
35; CHECK-LABEL: memcpy_p1_p1_minsize:
36; CHECK:       ; %bb.0: ; %entry
37; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
38; CHECK-NEXT:    v_mov_b32_e32 v12, 0
39; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
40; CHECK-NEXT:    global_load_dwordx2 v[8:9], v12, s[2:3] offset:32
41; CHECK-NEXT:    global_load_dwordx2 v[10:11], v12, s[2:3] offset:39
42; CHECK-NEXT:    global_load_dwordx4 v[0:3], v12, s[2:3]
43; CHECK-NEXT:    global_load_dwordx4 v[4:7], v12, s[2:3] offset:16
44; CHECK-NEXT:    s_waitcnt vmcnt(3)
45; CHECK-NEXT:    global_store_dwordx2 v12, v[8:9], s[0:1] offset:32
46; CHECK-NEXT:    s_waitcnt vmcnt(3)
47; CHECK-NEXT:    global_store_dwordx2 v12, v[10:11], s[0:1] offset:39
48; CHECK-NEXT:    s_waitcnt vmcnt(3)
49; CHECK-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
50; CHECK-NEXT:    s_waitcnt vmcnt(3)
51; CHECK-NEXT:    global_store_dwordx4 v12, v[4:7], s[0:1] offset:16
52; CHECK-NEXT:    s_endpgm
53entry:
54  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
55  ret void
56}
57
; Kernel performing a constant 128-byte llvm.memcpy from an addrspace(4)
; pointer to an addrspace(1) pointer under attribute group #0 (minsize).
; The assertions expect eight global_load_dwordx4 instructions followed by
; eight global_store_dwordx4 instructions, in ascending offset order.
58define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #0 {
59; CHECK-LABEL: memcpy_p1_p4_minsize:
60; CHECK:       ; %bb.0: ; %entry
61; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
62; CHECK-NEXT:    v_mov_b32_e32 v32, 0
63; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
64; CHECK-NEXT:    global_load_dwordx4 v[0:3], v32, s[2:3]
65; CHECK-NEXT:    global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
66; CHECK-NEXT:    global_load_dwordx4 v[8:11], v32, s[2:3] offset:32
67; CHECK-NEXT:    global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
68; CHECK-NEXT:    global_load_dwordx4 v[16:19], v32, s[2:3] offset:64
69; CHECK-NEXT:    global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
70; CHECK-NEXT:    global_load_dwordx4 v[24:27], v32, s[2:3] offset:96
71; CHECK-NEXT:    global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
72; CHECK-NEXT:    s_waitcnt vmcnt(7)
73; CHECK-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
74; CHECK-NEXT:    s_waitcnt vmcnt(7)
75; CHECK-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
76; CHECK-NEXT:    s_waitcnt vmcnt(7)
77; CHECK-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
78; CHECK-NEXT:    s_waitcnt vmcnt(7)
79; CHECK-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
80; CHECK-NEXT:    s_waitcnt vmcnt(7)
81; CHECK-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
82; CHECK-NEXT:    s_waitcnt vmcnt(7)
83; CHECK-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
84; CHECK-NEXT:    s_waitcnt vmcnt(7)
85; CHECK-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
86; CHECK-NEXT:    s_waitcnt vmcnt(7)
87; CHECK-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
88; CHECK-NEXT:    s_endpgm
89entry:
90  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
91  ret void
92}
93
; Kernel performing a constant 128-byte llvm.memcpy from an addrspace(4)
; pointer to an addrspace(5) pointer under attribute group #0 (minsize).
; The assertions expect the source to be read with eight global_load_dwordx4
; instructions and the destination to be written one dword at a time with 32
; buffer_store_dword instructions, working from offset 124 down to offset 0.
94define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 {
95; CHECK-LABEL: memcpy_p5_p4_minsize:
96; CHECK:       ; %bb.0: ; %entry
97; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
98; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
99; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
100; CHECK-NEXT:    s_load_dword s2, s[8:9], 0x0
101; CHECK-NEXT:    v_mov_b32_e32 v24, 0
102; CHECK-NEXT:    s_add_u32 s16, s16, s15
103; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
104; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
105; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
106; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
107; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
108; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
109; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
110; CHECK-NEXT:    s_addc_u32 s17, s17, 0
111; CHECK-NEXT:    v_mov_b32_e32 v25, s2
112; CHECK-NEXT:    s_waitcnt vmcnt(5)
113; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
114; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
115; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
116; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
117; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
118; CHECK-NEXT:    s_waitcnt vmcnt(9)
119; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
120; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
121; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
122; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
123; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1]
124; CHECK-NEXT:    s_waitcnt vmcnt(13)
125; CHECK-NEXT:    buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
126; CHECK-NEXT:    buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
127; CHECK-NEXT:    buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
128; CHECK-NEXT:    buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
129; CHECK-NEXT:    s_waitcnt vmcnt(16)
130; CHECK-NEXT:    buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
131; CHECK-NEXT:    buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
132; CHECK-NEXT:    buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
133; CHECK-NEXT:    buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
134; CHECK-NEXT:    s_waitcnt vmcnt(19)
135; CHECK-NEXT:    buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
136; CHECK-NEXT:    buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
137; CHECK-NEXT:    buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
138; CHECK-NEXT:    buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
139; CHECK-NEXT:    s_waitcnt vmcnt(22)
140; CHECK-NEXT:    buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
141; CHECK-NEXT:    buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
142; CHECK-NEXT:    buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
143; CHECK-NEXT:    buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
144; CHECK-NEXT:    s_waitcnt vmcnt(21)
145; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
146; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
147; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
148; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
149; CHECK-NEXT:    s_waitcnt vmcnt(20)
150; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
151; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
152; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
153; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen
154; CHECK-NEXT:    s_endpgm
155entry:
156  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
157  ret void
158}
159
; Kernel performing a constant 128-byte llvm.memcpy from an addrspace(5)
; pointer to an addrspace(0) pointer under attribute group #0 (minsize).
; The assertions expect the source to be read one dword at a time with 32
; buffer_load_dword instructions and the destination to be written with eight
; flat_store_dwordx4 instructions.
160define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 {
161; CHECK-LABEL: memcpy_p0_p5_minsize:
162; CHECK:       ; %bb.0: ; %entry
163; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
164; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
165; CHECK-NEXT:    s_load_dword s0, s[8:9], 0x8
166; CHECK-NEXT:    s_add_u32 s16, s16, s15
167; CHECK-NEXT:    s_addc_u32 s17, s17, 0
168; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
169; CHECK-NEXT:    v_mov_b32_e32 v26, s0
170; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
171; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
172; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
173; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
174; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
175; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
176; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
177; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
178; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
179; CHECK-NEXT:    buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
180; CHECK-NEXT:    buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
181; CHECK-NEXT:    buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
182; CHECK-NEXT:    buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
183; CHECK-NEXT:    buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
184; CHECK-NEXT:    buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
185; CHECK-NEXT:    buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
186; CHECK-NEXT:    buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
187; CHECK-NEXT:    buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
188; CHECK-NEXT:    buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
189; CHECK-NEXT:    buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
190; CHECK-NEXT:    buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
191; CHECK-NEXT:    buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
192; CHECK-NEXT:    buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
193; CHECK-NEXT:    buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
194; CHECK-NEXT:    buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
195; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
196; CHECK-NEXT:    v_mov_b32_e32 v25, s1
197; CHECK-NEXT:    v_mov_b32_e32 v24, s0
198; CHECK-NEXT:    s_waitcnt vmcnt(20)
199; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:112
200; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
201; CHECK-NEXT:    s_nop 0
202; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
203; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
204; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
205; CHECK-NEXT:    s_waitcnt vmcnt(0)
206; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7] offset:96
207; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen
208; CHECK-NEXT:    s_nop 0
209; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
210; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
211; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
212; CHECK-NEXT:    s_nop 0
213; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[20:23] offset:80
214; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:64
215; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[16:19] offset:48
216; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[12:15] offset:32
217; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[8:11] offset:16
218; CHECK-NEXT:    s_waitcnt vmcnt(0)
219; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
220; CHECK-NEXT:    s_endpgm
221entry:
222  tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
223  ret void
224}
225
; Kernel performing a constant 128-byte llvm.memcpy from an addrspace(4)
; pointer into the addrspace(3) global @shared, under attribute group #0
; (minsize).
; The assertions expect eight global_load_dwordx4 instructions for the source
; and eight ds_write2_b64 instructions for the destination.
226define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 {
227; CHECK-LABEL: memcpy_p3_p4_minsize:
228; CHECK:       ; %bb.0: ; %entry
229; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
230; CHECK-NEXT:    v_mov_b32_e32 v24, 0
231; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
232; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1]
233; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:16
234; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:32
235; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:48
236; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:64
237; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:80
238; CHECK-NEXT:    s_waitcnt vmcnt(5)
239; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset1:1
240; CHECK-NEXT:    s_waitcnt vmcnt(4)
241; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset0:2 offset1:3
242; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:96
243; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:112
244; CHECK-NEXT:    s_waitcnt vmcnt(5)
245; CHECK-NEXT:    ds_write2_b64 v24, v[8:9], v[10:11] offset0:4 offset1:5
246; CHECK-NEXT:    s_waitcnt vmcnt(4)
247; CHECK-NEXT:    ds_write2_b64 v24, v[12:13], v[14:15] offset0:6 offset1:7
248; CHECK-NEXT:    s_waitcnt vmcnt(3)
249; CHECK-NEXT:    ds_write2_b64 v24, v[16:17], v[18:19] offset0:8 offset1:9
250; CHECK-NEXT:    s_waitcnt vmcnt(2)
251; CHECK-NEXT:    ds_write2_b64 v24, v[20:21], v[22:23] offset0:10 offset1:11
252; CHECK-NEXT:    s_waitcnt vmcnt(1)
253; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset0:12 offset1:13
254; CHECK-NEXT:    s_waitcnt vmcnt(0)
255; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset0:14 offset1:15
256; CHECK-NEXT:    s_endpgm
257entry:
258  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
259  ret void
260}
261
; Kernel performing a constant 128-byte llvm.memcpy from the addrspace(3)
; global @shared to an addrspace(0) pointer, under attribute group #0
; (minsize).
; The assertions expect eight ds_read2_b64 instructions for the source and
; eight flat_store_dwordx4 instructions for the destination.
262define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
263; CHECK-LABEL: memcpy_p0_p3_minsize:
264; CHECK:       ; %bb.0: ; %entry
265; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
266; CHECK-NEXT:    v_mov_b32_e32 v16, 0
267; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset1:1
268; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
269; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
270; CHECK-NEXT:    ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
271; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
272; CHECK-NEXT:    v_mov_b32_e32 v21, s1
273; CHECK-NEXT:    v_mov_b32_e32 v20, s0
274; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
275; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:16
276; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:32
277; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
278; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
279; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
280; CHECK-NEXT:    ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
281; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[12:15] offset:48
282; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
283; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3] offset:64
284; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:80
285; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:96
286; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[16:19] offset:112
287; CHECK-NEXT:    s_endpgm
288entry:
289  tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
290  ret void
291}
292
; Kernel performing a constant 47-byte llvm.memcpy from %src to %dest, both
; addrspace(0) pointers, under attribute group #1 (optsize).
; The assertions expect a straight-line expansion ending in s_endpgm: flat
; loads/stores of 16 + 16 + 12 + 2 + 1 bytes at offsets 0, 16, 32, 44 and 46.
293define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
294; CHECK-LABEL: memcpy_p0_p0_optsize:
295; CHECK:       ; %bb.0: ; %entry
296; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
297; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
298; CHECK-NEXT:    v_mov_b32_e32 v12, s3
299; CHECK-NEXT:    v_mov_b32_e32 v11, s2
300; CHECK-NEXT:    flat_load_ubyte v13, v[11:12] offset:46
301; CHECK-NEXT:    flat_load_ushort v14, v[11:12] offset:44
302; CHECK-NEXT:    flat_load_dwordx3 v[8:10], v[11:12] offset:32
303; CHECK-NEXT:    flat_load_dwordx4 v[0:3], v[11:12] offset:16
304; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[11:12]
305; CHECK-NEXT:    v_mov_b32_e32 v12, s1
306; CHECK-NEXT:    v_mov_b32_e32 v11, s0
307; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
308; CHECK-NEXT:    flat_store_byte v[11:12], v13 offset:46
309; CHECK-NEXT:    flat_store_short v[11:12], v14 offset:44
310; CHECK-NEXT:    flat_store_dwordx3 v[11:12], v[8:10] offset:32
311; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[0:3] offset:16
312; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
313; CHECK-NEXT:    s_endpgm
314entry:
315  tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
316  ret void
317}
318
; Kernel performing a constant 47-byte llvm.memcpy between two addrspace(1)
; pointers under attribute group #1 (optsize).
; The assertions expect global loads/stores only: two 16-byte accesses at
; offsets 0 and 16, plus two overlapping 8-byte accesses at offsets 32 and 39
; that together cover the remaining 15 bytes.
319define amdgpu_kernel void @memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #1 {
320; CHECK-LABEL: memcpy_p1_p1_optsize:
321; CHECK:       ; %bb.0: ; %entry
322; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
323; CHECK-NEXT:    v_mov_b32_e32 v12, 0
324; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
325; CHECK-NEXT:    global_load_dwordx2 v[8:9], v12, s[2:3] offset:32
326; CHECK-NEXT:    global_load_dwordx2 v[10:11], v12, s[2:3] offset:39
327; CHECK-NEXT:    global_load_dwordx4 v[0:3], v12, s[2:3]
328; CHECK-NEXT:    global_load_dwordx4 v[4:7], v12, s[2:3] offset:16
329; CHECK-NEXT:    s_waitcnt vmcnt(3)
330; CHECK-NEXT:    global_store_dwordx2 v12, v[8:9], s[0:1] offset:32
331; CHECK-NEXT:    s_waitcnt vmcnt(3)
332; CHECK-NEXT:    global_store_dwordx2 v12, v[10:11], s[0:1] offset:39
333; CHECK-NEXT:    s_waitcnt vmcnt(3)
334; CHECK-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
335; CHECK-NEXT:    s_waitcnt vmcnt(3)
336; CHECK-NEXT:    global_store_dwordx4 v12, v[4:7], s[0:1] offset:16
337; CHECK-NEXT:    s_endpgm
338entry:
339  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
340  ret void
341}
342
; Kernel performing a constant 128-byte llvm.memcpy from an addrspace(4)
; pointer to an addrspace(1) pointer under attribute group #1 (optsize).
; The assertions expect eight global_load_dwordx4 instructions followed by
; eight global_store_dwordx4 instructions, in ascending offset order.
343define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #1 {
344; CHECK-LABEL: memcpy_p1_p4_optsize:
345; CHECK:       ; %bb.0: ; %entry
346; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
347; CHECK-NEXT:    v_mov_b32_e32 v32, 0
348; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
349; CHECK-NEXT:    global_load_dwordx4 v[0:3], v32, s[2:3]
350; CHECK-NEXT:    global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
351; CHECK-NEXT:    global_load_dwordx4 v[8:11], v32, s[2:3] offset:32
352; CHECK-NEXT:    global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
353; CHECK-NEXT:    global_load_dwordx4 v[16:19], v32, s[2:3] offset:64
354; CHECK-NEXT:    global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
355; CHECK-NEXT:    global_load_dwordx4 v[24:27], v32, s[2:3] offset:96
356; CHECK-NEXT:    global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
357; CHECK-NEXT:    s_waitcnt vmcnt(7)
358; CHECK-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
359; CHECK-NEXT:    s_waitcnt vmcnt(7)
360; CHECK-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
361; CHECK-NEXT:    s_waitcnt vmcnt(7)
362; CHECK-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
363; CHECK-NEXT:    s_waitcnt vmcnt(7)
364; CHECK-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
365; CHECK-NEXT:    s_waitcnt vmcnt(7)
366; CHECK-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
367; CHECK-NEXT:    s_waitcnt vmcnt(7)
368; CHECK-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
369; CHECK-NEXT:    s_waitcnt vmcnt(7)
370; CHECK-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
371; CHECK-NEXT:    s_waitcnt vmcnt(7)
372; CHECK-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
373; CHECK-NEXT:    s_endpgm
374entry:
375  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
376  ret void
377}
378
; Kernel performing a constant 128-byte llvm.memcpy from an addrspace(4)
; pointer to an addrspace(5) pointer under attribute group #1 (optsize).
; The assertions expect the source to be read with eight global_load_dwordx4
; instructions and the destination to be written one dword at a time with 32
; buffer_store_dword instructions, working from offset 124 down to offset 0.
379define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 {
380; CHECK-LABEL: memcpy_p5_p4_optsize:
381; CHECK:       ; %bb.0: ; %entry
382; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
383; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
384; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
385; CHECK-NEXT:    s_load_dword s2, s[8:9], 0x0
386; CHECK-NEXT:    v_mov_b32_e32 v24, 0
387; CHECK-NEXT:    s_add_u32 s16, s16, s15
388; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
389; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
390; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
391; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
392; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
393; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
394; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
395; CHECK-NEXT:    s_addc_u32 s17, s17, 0
396; CHECK-NEXT:    v_mov_b32_e32 v25, s2
397; CHECK-NEXT:    s_waitcnt vmcnt(5)
398; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
399; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
400; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
401; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
402; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
403; CHECK-NEXT:    s_waitcnt vmcnt(9)
404; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
405; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
406; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
407; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
408; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1]
409; CHECK-NEXT:    s_waitcnt vmcnt(13)
410; CHECK-NEXT:    buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
411; CHECK-NEXT:    buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
412; CHECK-NEXT:    buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
413; CHECK-NEXT:    buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
414; CHECK-NEXT:    s_waitcnt vmcnt(16)
415; CHECK-NEXT:    buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
416; CHECK-NEXT:    buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
417; CHECK-NEXT:    buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
418; CHECK-NEXT:    buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
419; CHECK-NEXT:    s_waitcnt vmcnt(19)
420; CHECK-NEXT:    buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
421; CHECK-NEXT:    buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
422; CHECK-NEXT:    buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
423; CHECK-NEXT:    buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
424; CHECK-NEXT:    s_waitcnt vmcnt(22)
425; CHECK-NEXT:    buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
426; CHECK-NEXT:    buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
427; CHECK-NEXT:    buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
428; CHECK-NEXT:    buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
429; CHECK-NEXT:    s_waitcnt vmcnt(21)
430; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
431; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
432; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
433; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
434; CHECK-NEXT:    s_waitcnt vmcnt(20)
435; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
436; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
437; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
438; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen
439; CHECK-NEXT:    s_endpgm
440entry:
441  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
442  ret void
443}
444
; Kernel performing a constant 128-byte llvm.memcpy from an addrspace(5)
; pointer to an addrspace(0) pointer under attribute group #1 (optsize).
; The assertions expect the source to be read one dword at a time with 32
; buffer_load_dword instructions and the destination to be written with eight
; flat_store_dwordx4 instructions.
445define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 {
446; CHECK-LABEL: memcpy_p0_p5_optsize:
447; CHECK:       ; %bb.0: ; %entry
448; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
449; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
450; CHECK-NEXT:    s_load_dword s0, s[8:9], 0x8
451; CHECK-NEXT:    s_add_u32 s16, s16, s15
452; CHECK-NEXT:    s_addc_u32 s17, s17, 0
453; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
454; CHECK-NEXT:    v_mov_b32_e32 v26, s0
455; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
456; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
457; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
458; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
459; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
460; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
461; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
462; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
463; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
464; CHECK-NEXT:    buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
465; CHECK-NEXT:    buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
466; CHECK-NEXT:    buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
467; CHECK-NEXT:    buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
468; CHECK-NEXT:    buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
469; CHECK-NEXT:    buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
470; CHECK-NEXT:    buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
471; CHECK-NEXT:    buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
472; CHECK-NEXT:    buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
473; CHECK-NEXT:    buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
474; CHECK-NEXT:    buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
475; CHECK-NEXT:    buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
476; CHECK-NEXT:    buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
477; CHECK-NEXT:    buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
478; CHECK-NEXT:    buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
479; CHECK-NEXT:    buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
480; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
481; CHECK-NEXT:    v_mov_b32_e32 v25, s1
482; CHECK-NEXT:    v_mov_b32_e32 v24, s0
483; CHECK-NEXT:    s_waitcnt vmcnt(20)
484; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:112
485; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
486; CHECK-NEXT:    s_nop 0
487; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
488; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
489; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
490; CHECK-NEXT:    s_waitcnt vmcnt(0)
491; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7] offset:96
492; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen
493; CHECK-NEXT:    s_nop 0
494; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
495; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
496; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
497; CHECK-NEXT:    s_nop 0
498; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[20:23] offset:80
499; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:64
500; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[16:19] offset:48
501; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[12:15] offset:32
502; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[8:11] offset:16
503; CHECK-NEXT:    s_waitcnt vmcnt(0)
504; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
505; CHECK-NEXT:    s_endpgm
506entry:
507  tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
508  ret void
509}
510
; Kernel performing a constant 128-byte llvm.memcpy from an addrspace(4)
; pointer into the addrspace(3) global @shared, under attribute group #1
; (optsize).
; The assertions expect eight global_load_dwordx4 instructions for the source
; and eight ds_write2_b64 instructions for the destination.
511define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 {
512; CHECK-LABEL: memcpy_p3_p4_optsize:
513; CHECK:       ; %bb.0: ; %entry
514; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
515; CHECK-NEXT:    v_mov_b32_e32 v24, 0
516; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
517; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1]
518; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:16
519; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:32
520; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:48
521; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:64
522; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:80
523; CHECK-NEXT:    s_waitcnt vmcnt(5)
524; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset1:1
525; CHECK-NEXT:    s_waitcnt vmcnt(4)
526; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset0:2 offset1:3
527; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:96
528; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:112
529; CHECK-NEXT:    s_waitcnt vmcnt(5)
530; CHECK-NEXT:    ds_write2_b64 v24, v[8:9], v[10:11] offset0:4 offset1:5
531; CHECK-NEXT:    s_waitcnt vmcnt(4)
532; CHECK-NEXT:    ds_write2_b64 v24, v[12:13], v[14:15] offset0:6 offset1:7
533; CHECK-NEXT:    s_waitcnt vmcnt(3)
534; CHECK-NEXT:    ds_write2_b64 v24, v[16:17], v[18:19] offset0:8 offset1:9
535; CHECK-NEXT:    s_waitcnt vmcnt(2)
536; CHECK-NEXT:    ds_write2_b64 v24, v[20:21], v[22:23] offset0:10 offset1:11
537; CHECK-NEXT:    s_waitcnt vmcnt(1)
538; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset0:12 offset1:13
539; CHECK-NEXT:    s_waitcnt vmcnt(0)
540; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset0:14 offset1:15
541; CHECK-NEXT:    s_endpgm
542entry:
543  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
544  ret void
545}
546
; Kernel performing a constant 128-byte llvm.memcpy from the addrspace(3)
; global @shared to an addrspace(0) pointer, under attribute group #1
; (optsize).
; The assertions expect eight ds_read2_b64 instructions for the source and
; eight flat_store_dwordx4 instructions for the destination.
547define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
548; CHECK-LABEL: memcpy_p0_p3_optsize:
549; CHECK:       ; %bb.0: ; %entry
550; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
551; CHECK-NEXT:    v_mov_b32_e32 v16, 0
552; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset1:1
553; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
554; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
555; CHECK-NEXT:    ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
556; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
557; CHECK-NEXT:    v_mov_b32_e32 v21, s1
558; CHECK-NEXT:    v_mov_b32_e32 v20, s0
559; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
560; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:16
561; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:32
562; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
563; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
564; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
565; CHECK-NEXT:    ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
566; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[12:15] offset:48
567; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
568; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3] offset:64
569; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:80
570; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:96
571; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[16:19] offset:112
572; CHECK-NEXT:    s_endpgm
573entry:
574  tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
575  ret void
576}
577
; Declarations of the llvm.memcpy overloads called by the kernels in this file,
; one per (destination addrspace, source addrspace) pair. Each takes a
; destination pointer, a source pointer, an i64 length and an immarg i1 flag
; (always passed as false by the callers here), and carries attribute group #2.
578declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2
579
580declare void @llvm.memcpy.p0.p5.i64(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2
581
582declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
583
584declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
585
586declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
587
588declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
589
590declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
591
; Attribute groups: #0 is used by the *_minsize kernels and #1 by the
; *_optsize kernels; they differ only in minsize vs. optsize and both set
; "amdgpu-flat-work-group-size" to "1024,1024". #2 is the group attached to
; the intrinsic declarations above.
592attributes #0 = { minsize "amdgpu-flat-work-group-size"="1024,1024" }
593attributes #1 = { optsize "amdgpu-flat-work-group-size"="1024,1024" }
594attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
594attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
595