; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s

; Checks how fixed-size (32 byte) llvm.memcpy calls are expanded into
; individual loads and stores for several source/destination address spaces
; and pointer alignments. Each kernel below performs exactly one memcpy and
; the check lines above it list the memory instructions expected in the
; generated code.

; Intrinsic declarations. The name suffixes encode the destination address
; space, the source address space and the type of the length operand.
declare void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) nocapture, ptr addrspace(3) nocapture, i32, i1) nounwind
declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture, i64, i1) nounwind
declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) nocapture, ptr addrspace(4) nocapture, i64, i1) nounwind


; addrspace(3) to addrspace(3), no alignment attribute on either pointer:
; 32 single-byte reads and 32 single-byte writes are expected.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8

; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8

; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8

; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8
; SI-DAG: ds_read_u8

; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8

; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8

; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8

; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8
; SI-DAG: ds_write_b8

; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align1(ptr addrspace(3) noalias %out, ptr addrspace(3) noalias %in) nounwind {
  call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) %out, ptr addrspace(3) %in, i32 32, i1 false) nounwind
  ret void
}

; addrspace(3) to addrspace(3), both pointers align 2:
; 16 two-byte reads and 16 two-byte writes are expected.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2:
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16

; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16
; SI-DAG: ds_read_u16

; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16

; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16
; SI-DAG: ds_write_b16

; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align2(ptr addrspace(3) noalias %out, ptr addrspace(3) noalias %in) nounwind {
  call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 2 %out, ptr addrspace(3) align 2 %in, i32 32, i1 false) nounwind
  ret void
}

; addrspace(3) to addrspace(3), both pointers align 4:
; four ds_read2_b32 followed by four ds_write2_b32 are expected.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4:
; SI: ds_read2_b32
; SI: ds_read2_b32
; SI: ds_read2_b32
; SI: ds_read2_b32

; SI: ds_write2_b32
; SI: ds_write2_b32
; SI: ds_write2_b32
; SI: ds_write2_b32

; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align4(ptr addrspace(3) noalias %out, ptr addrspace(3) noalias %in) nounwind {
  call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 4 %out, ptr addrspace(3) align 4 %in, i32 32, i1 false) nounwind
  ret void
}

; addrspace(3) to addrspace(3), both pointers align 8:
; the copy is expected to use the 64-bit paired forms, two ds_read2_b64
; followed by two ds_write2_b64.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:

; SI: ds_read2_b64
; SI: ds_read2_b64

; SI: ds_write2_b64
; SI: ds_write2_b64

; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align8(ptr addrspace(3) noalias %out, ptr addrspace(3) noalias %in) nounwind {
  call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 8 %out, ptr addrspace(3) align 8 %in, i32 32, i1 false) nounwind
  ret void
}

; addrspace(1) to addrspace(1), no alignment attribute on either pointer:
; 32 byte loads and 32 byte stores are expected, in any interleaving.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1:
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_load_ubyte
; SI-DAG: buffer_store_byte

; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 32, i1 false) nounwind
  ret void
}

; addrspace(1) to addrspace(1), both pointers align 2:
; 16 two-byte loads and 16 two-byte stores are expected.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2:
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort
; SI-DAG: buffer_load_ushort

; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short
; SI-DAG: buffer_store_short

; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align2(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 2 %out, ptr addrspace(1) align 2 %in, i64 32, i1 false) nounwind
  ret void
}

; addrspace(1) to addrspace(1), both pointers align 4:
; two dwordx4 loads followed by two dwordx4 stores are expected.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4:
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %out, ptr addrspace(1) align 4 %in, i64 32, i1 false) nounwind
  ret void
}

; addrspace(1) to addrspace(1), both pointers align 8:
; two dwordx4 loads followed by two dwordx4 stores are expected.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8:
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 8 %out, ptr addrspace(1) align 8 %in, i64 32, i1 false) nounwind
  ret void
}

; addrspace(1) to addrspace(1), both pointers align 16:
; two dwordx4 loads followed by two dwordx4 stores are expected.
; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16:
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 16 %out, ptr addrspace(1) align 16 %in, i64 32, i1 false) nounwind
  ret void
}

; Test shouldConvertConstantLoadToIntImm
; Two copies of the same 16-byte string constant in addrspace(4), differing
; only in their declared alignment.
@hello.align4 = private unnamed_addr addrspace(4) constant [16 x i8] c"constant string\00", align 4
@hello.align1 = private unnamed_addr addrspace(4) constant [16 x i8] c"constant string\00", align 1

; Source is the align 4 string constant: its address is formed pc-relative,
; the data is fetched with scalar loads and written with two dwordx4 stores.
; FUNC-LABEL: {{^}}test_memcpy_const_string_align4:
; SI: s_getpc_b64
; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, hello.align4@rel32@lo+4
; SI: s_addc_u32
; SI-DAG: s_load_dwordx8
; SI-DAG: s_load_dwordx2
; SI-DAG: buffer_store_dwordx4
; SI-DAG: buffer_store_dwordx4
define amdgpu_kernel void @test_memcpy_const_string_align4(ptr addrspace(1) noalias %out) nounwind {
  call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) align 4 %out, ptr addrspace(4) align 4 @hello.align4, i64 32, i1 false)
  ret void
}

; Source is the align 1 string constant: no buffer loads may appear; the
; stored values are expected to be materialized with v_mov_b32 of a hex
; immediate and written out with byte stores.
; FUNC-LABEL: {{^}}test_memcpy_const_string_align1:
; SI-NOT: buffer_load
; SI: v_mov_b32_e32 v{{[0-9]+}}, 0x
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
define amdgpu_kernel void @test_memcpy_const_string_align1(ptr addrspace(1) noalias %out) nounwind {
  call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %out, ptr addrspace(4) @hello.align1, i64 32, i1 false)
  ret void
}