; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 %s -o - | FileCheck %s

; Each kernel below performs a single constant-length llvm.memcpy between one
; pair of address spaces. The first seven kernels use attribute set #0
; (minsize); the last seven repeat the same memcpy calls with attribute set #1
; (optsize). In every kernel the expected output is one straight-line run of
; loads and stores terminated by s_endpgm; no branch or call instruction
; appears in the assertions.
;
; The assertion lines are generated by the script named on the first line and
; should be regenerated with it rather than edited by hand.

%struct.S = type { [32 x i32] }

; 128-byte (32 x i32) addrspace(3) global. It is the fixed destination of the
; p3_p4 kernels and the fixed source of the p0_p3 kernels.
@shared = addrspace(3) global %struct.S undef, align 4

; 47-byte copy between two addrspace(0) pointers. The expected output covers
; the length with two dwordx4, one dwordx3, one ushort and one ubyte flat load,
; followed by the matching flat stores.
; NOTE(review): %src is marked readonly here but not in memcpy_p0_p0_optsize;
; confirm whether that difference is intentional.
define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 {
; CHECK-LABEL: memcpy_p0_p0_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v12, s3
; CHECK-NEXT:    v_mov_b32_e32 v11, s2
; CHECK-NEXT:    flat_load_ubyte v13, v[11:12] offset:46
; CHECK-NEXT:    flat_load_ushort v14, v[11:12] offset:44
; CHECK-NEXT:    flat_load_dwordx3 v[8:10], v[11:12] offset:32
; CHECK-NEXT:    flat_load_dwordx4 v[0:3], v[11:12] offset:16
; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[11:12]
; CHECK-NEXT:    v_mov_b32_e32 v12, s1
; CHECK-NEXT:    v_mov_b32_e32 v11, s0
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[11:12], v13 offset:46
; CHECK-NEXT:    flat_store_short v[11:12], v14 offset:44
; CHECK-NEXT:    flat_store_dwordx3 v[11:12], v[8:10] offset:32
; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[0:3] offset:16
; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
  ret void
}

; 47-byte copy between two addrspace(1) pointers. The expected output uses two
; dwordx4 accesses plus two dwordx2 accesses at offsets 32 and 39; the dwordx2
; pair overlaps by one byte so that the odd length is covered without
; narrower accesses.
define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #0 {
; CHECK-LABEL: memcpy_p1_p1_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v12, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx2 v[8:9], v12, s[2:3] offset:32
; CHECK-NEXT:    global_load_dwordx2 v[10:11], v12, s[2:3] offset:39
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v12, s[2:3]
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v12, s[2:3] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx2 v12, v[8:9], s[0:1] offset:32
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx2 v12, v[10:11], s[0:1] offset:39
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx4 v12, v[4:7], s[0:1] offset:16
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
  ret void
}

; 128-byte copy from addrspace(4) to addrspace(1): eight global dwordx4 loads
; followed by eight global dwordx4 stores.
define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #0 {
; CHECK-LABEL: memcpy_p1_p4_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v32, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v32, s[2:3]
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v32, s[2:3] offset:32
; CHECK-NEXT:    global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
; CHECK-NEXT:    global_load_dwordx4 v[16:19], v32, s[2:3] offset:64
; CHECK-NEXT:    global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
; CHECK-NEXT:    global_load_dwordx4 v[24:27], v32, s[2:3] offset:96
; CHECK-NEXT:    global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
  ret void
}

; 128-byte copy from addrspace(4) to addrspace(5): eight global dwordx4 loads,
; each written back as four buffer_store_dword (offen) instructions.
define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 {
; CHECK-LABEL: memcpy_p5_p4_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
; CHECK-NEXT:    s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v24, 0
; CHECK-NEXT:    s_add_u32 s16, s16, s15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
; CHECK-NEXT:    s_addc_u32 s17, s17, 0
; CHECK-NEXT:    v_mov_b32_e32 v25, s2
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1]
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
; CHECK-NEXT:    buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
; CHECK-NEXT:    buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
; CHECK-NEXT:    buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
; CHECK-NEXT:    buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
; CHECK-NEXT:    buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
; CHECK-NEXT:    buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
; CHECK-NEXT:    buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
; CHECK-NEXT:    buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
; CHECK-NEXT:    buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
; CHECK-NEXT:    s_waitcnt vmcnt(22)
; CHECK-NEXT:    buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
; CHECK-NEXT:    buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
; CHECK-NEXT:    buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
; CHECK-NEXT:    buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
; CHECK-NEXT:    s_waitcnt vmcnt(21)
; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
  ret void
}

; 128-byte copy from addrspace(5) to addrspace(0): 32 buffer_load_dword (offen)
; instructions feeding eight flat dwordx4 stores.
define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 {
; CHECK-LABEL: memcpy_p0_p5_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
; CHECK-NEXT:    s_load_dword s0, s[8:9], 0x8
; CHECK-NEXT:    s_add_u32 s16, s16, s15
; CHECK-NEXT:    s_addc_u32 s17, s17, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v26, s0
; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT:    buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
; CHECK-NEXT:    buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
; CHECK-NEXT:    buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
; CHECK-NEXT:    buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
; CHECK-NEXT:    buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
; CHECK-NEXT:    buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
; CHECK-NEXT:    buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
; CHECK-NEXT:    buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
; CHECK-NEXT:    buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
; CHECK-NEXT:    buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
; CHECK-NEXT:    buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
; CHECK-NEXT:    buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
; CHECK-NEXT:    buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
; CHECK-NEXT:    buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
; CHECK-NEXT:    buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
; CHECK-NEXT:    buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v25, s1
; CHECK-NEXT:    v_mov_b32_e32 v24, s0
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:112
; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7] offset:96
; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[20:23] offset:80
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:64
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[16:19] offset:48
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[12:15] offset:32
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[8:11] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
  ret void
}

; 128-byte copy from addrspace(4) into the addrspace(3) global @shared: eight
; global dwordx4 loads, each stored with one ds_write2_b64.
define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 {
; CHECK-LABEL: memcpy_p3_p4_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v24, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1]
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:16
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:32
; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:48
; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:64
; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:80
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset0:2 offset1:3
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:96
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:112
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    ds_write2_b64 v24, v[8:9], v[10:11] offset0:4 offset1:5
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    ds_write2_b64 v24, v[12:13], v[14:15] offset0:6 offset1:7
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    ds_write2_b64 v24, v[16:17], v[18:19] offset0:8 offset1:9
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    ds_write2_b64 v24, v[20:21], v[22:23] offset0:10 offset1:11
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset0:12 offset1:13
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset0:14 offset1:15
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
  ret void
}

; 128-byte copy from the addrspace(3) global @shared to an addrspace(0)
; pointer: eight ds_read2_b64 loads feeding eight flat dwordx4 stores.
define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
; CHECK-LABEL: memcpy_p0_p3_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v16, 0
; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset1:1
; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
; CHECK-NEXT:    ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v21, s1
; CHECK-NEXT:    v_mov_b32_e32 v20, s0
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:16
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:32
; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
; CHECK-NEXT:    ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[12:15] offset:48
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3] offset:64
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:80
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:96
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[16:19] offset:112
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
  ret void
}

; Same memcpy call as memcpy_p0_p0_minsize (47 bytes, addrspace(0) to
; addrspace(0)), compiled with attribute set #1 (optsize).
define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
; CHECK-LABEL: memcpy_p0_p0_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v12, s3
; CHECK-NEXT:    v_mov_b32_e32 v11, s2
; CHECK-NEXT:    flat_load_ubyte v13, v[11:12] offset:46
; CHECK-NEXT:    flat_load_ushort v14, v[11:12] offset:44
; CHECK-NEXT:    flat_load_dwordx3 v[8:10], v[11:12] offset:32
; CHECK-NEXT:    flat_load_dwordx4 v[0:3], v[11:12] offset:16
; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[11:12]
; CHECK-NEXT:    v_mov_b32_e32 v12, s1
; CHECK-NEXT:    v_mov_b32_e32 v11, s0
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[11:12], v13 offset:46
; CHECK-NEXT:    flat_store_short v[11:12], v14 offset:44
; CHECK-NEXT:    flat_store_dwordx3 v[11:12], v[8:10] offset:32
; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[0:3] offset:16
; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
  ret void
}

; Same memcpy call as memcpy_p1_p1_minsize (47 bytes, addrspace(1) to
; addrspace(1)), compiled with attribute set #1 (optsize).
define amdgpu_kernel void @memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #1 {
; CHECK-LABEL: memcpy_p1_p1_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v12, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx2 v[8:9], v12, s[2:3] offset:32
; CHECK-NEXT:    global_load_dwordx2 v[10:11], v12, s[2:3] offset:39
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v12, s[2:3]
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v12, s[2:3] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx2 v12, v[8:9], s[0:1] offset:32
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx2 v12, v[10:11], s[0:1] offset:39
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    global_store_dwordx4 v12, v[4:7], s[0:1] offset:16
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
  ret void
}

; Same memcpy call as memcpy_p1_p4_minsize (128 bytes, addrspace(4) to
; addrspace(1)), compiled with attribute set #1 (optsize).
define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #1 {
; CHECK-LABEL: memcpy_p1_p4_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v32, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v32, s[2:3]
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v32, s[2:3] offset:32
; CHECK-NEXT:    global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
; CHECK-NEXT:    global_load_dwordx4 v[16:19], v32, s[2:3] offset:64
; CHECK-NEXT:    global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
; CHECK-NEXT:    global_load_dwordx4 v[24:27], v32, s[2:3] offset:96
; CHECK-NEXT:    global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; CHECK-NEXT:    s_waitcnt vmcnt(7)
; CHECK-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
  ret void
}

; Same memcpy call as memcpy_p5_p4_minsize (128 bytes, addrspace(4) to
; addrspace(5)), compiled with attribute set #1 (optsize).
define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 {
; CHECK-LABEL: memcpy_p5_p4_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
; CHECK-NEXT:    s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v24, 0
; CHECK-NEXT:    s_add_u32 s16, s16, s15
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
; CHECK-NEXT:    s_addc_u32 s17, s17, 0
; CHECK-NEXT:    v_mov_b32_e32 v25, s2
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(9)
; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1]
; CHECK-NEXT:    s_waitcnt vmcnt(13)
; CHECK-NEXT:    buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
; CHECK-NEXT:    buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
; CHECK-NEXT:    buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
; CHECK-NEXT:    buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
; CHECK-NEXT:    s_waitcnt vmcnt(16)
; CHECK-NEXT:    buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
; CHECK-NEXT:    buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
; CHECK-NEXT:    buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
; CHECK-NEXT:    buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
; CHECK-NEXT:    s_waitcnt vmcnt(19)
; CHECK-NEXT:    buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
; CHECK-NEXT:    buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
; CHECK-NEXT:    buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
; CHECK-NEXT:    buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
; CHECK-NEXT:    s_waitcnt vmcnt(22)
; CHECK-NEXT:    buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
; CHECK-NEXT:    buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
; CHECK-NEXT:    buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
; CHECK-NEXT:    buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
; CHECK-NEXT:    s_waitcnt vmcnt(21)
; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
  ret void
}

; Same memcpy call as memcpy_p0_p5_minsize (128 bytes, addrspace(5) to
; addrspace(0)), compiled with attribute set #1 (optsize).
define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 {
; CHECK-LABEL: memcpy_p0_p5_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
; CHECK-NEXT:    s_load_dword s0, s[8:9], 0x8
; CHECK-NEXT:    s_add_u32 s16, s16, s15
; CHECK-NEXT:    s_addc_u32 s17, s17, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v26, s0
; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT:    buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
; CHECK-NEXT:    buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
; CHECK-NEXT:    buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
; CHECK-NEXT:    buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
; CHECK-NEXT:    buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
; CHECK-NEXT:    buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
; CHECK-NEXT:    buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
; CHECK-NEXT:    buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
; CHECK-NEXT:    buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
; CHECK-NEXT:    buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
; CHECK-NEXT:    buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
; CHECK-NEXT:    buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
; CHECK-NEXT:    buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
; CHECK-NEXT:    buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
; CHECK-NEXT:    buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
; CHECK-NEXT:    buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v25, s1
; CHECK-NEXT:    v_mov_b32_e32 v24, s0
; CHECK-NEXT:    s_waitcnt vmcnt(20)
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:112
; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7] offset:96
; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
; CHECK-NEXT:    s_nop 0
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[20:23] offset:80
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:64
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[16:19] offset:48
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[12:15] offset:32
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[8:11] offset:16
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
  ret void
}

; Same memcpy call as memcpy_p3_p4_minsize (128 bytes, addrspace(4) to
; @shared in addrspace(3)), compiled with attribute set #1 (optsize).
define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 {
; CHECK-LABEL: memcpy_p3_p4_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v24, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1]
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:16
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:32
; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:48
; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:64
; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:80
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset1:1
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset0:2 offset1:3
; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:96
; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:112
; CHECK-NEXT:    s_waitcnt vmcnt(5)
; CHECK-NEXT:    ds_write2_b64 v24, v[8:9], v[10:11] offset0:4 offset1:5
; CHECK-NEXT:    s_waitcnt vmcnt(4)
; CHECK-NEXT:    ds_write2_b64 v24, v[12:13], v[14:15] offset0:6 offset1:7
; CHECK-NEXT:    s_waitcnt vmcnt(3)
; CHECK-NEXT:    ds_write2_b64 v24, v[16:17], v[18:19] offset0:8 offset1:9
; CHECK-NEXT:    s_waitcnt vmcnt(2)
; CHECK-NEXT:    ds_write2_b64 v24, v[20:21], v[22:23] offset0:10 offset1:11
; CHECK-NEXT:    s_waitcnt vmcnt(1)
; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset0:12 offset1:13
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset0:14 offset1:15
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
  ret void
}

; Same memcpy call as memcpy_p0_p3_minsize (128 bytes, @shared in
; addrspace(3) to addrspace(0)), compiled with attribute set #1 (optsize).
define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
; CHECK-LABEL: memcpy_p0_p3_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v16, 0
; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset1:1
; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
; CHECK-NEXT:    ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v21, s1
; CHECK-NEXT:    v_mov_b32_e32 v20, s0
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:16
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:32
; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
; CHECK-NEXT:    ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[12:15] offset:48
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3] offset:64
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:80
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:96
; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[16:19] offset:112
; CHECK-NEXT:    s_endpgm
entry:
  tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
  ret void
}

; Declarations of the llvm.memcpy overloads called above, one per
; destination/source address-space pair; all share attribute set #2.
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2

declare void @llvm.memcpy.p0.p5.i64(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2

declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2

declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2

declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2

declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2

declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2

; #0 and #1 differ only in minsize vs. optsize; both set
; "amdgpu-flat-work-group-size" to "1024,1024". #2 is used only by the
; intrinsic declarations.
attributes #0 = { minsize "amdgpu-flat-work-group-size"="1024,1024" }
attributes #1 = { optsize "amdgpu-flat-work-group-size"="1024,1024" }
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }