; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-sdwa-peephole=0 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=VI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone

define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  store <4 x i8> %val, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %val, ptr addrspace(1) %out1, align 4
  ret void
}

define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x3:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s14, 0
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x3:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %val, ptr addrspace(1) %out1, align 4
  store <4 x i8> %val, ptr addrspace(1) %out2, align 4
  ret void
}

define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x4:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s12, s6
; SI-NEXT:    s_mov_b32 s13, s7
; SI-NEXT:    s_mov_b32 s16, s8
; SI-NEXT:    s_mov_b32 s17, s9
; SI-NEXT:    s_mov_b32 s20, s10
; SI-NEXT:    s_mov_b32 s21, s11
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[16:19], 0
; SI-NEXT:    buffer_store_dword v0, off, s[20:23], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x4:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s22, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s23, s11
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s20, s6
; VI-NEXT:    s_mov_b32 s21, s7
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[16:19], 0
; VI-NEXT:    buffer_store_dword v0, off, s[20:23], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %val, ptr addrspace(1) %out1, align 4
  store <4 x i8> %val, ptr addrspace(1) %out2, align 4
  store <4 x i8> %val, ptr addrspace(1) %out3, align 4
  ret void
}

define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_extra_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT:    v_and_b32_e32 v2, 0xff00, v0
; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; SI-NEXT:    v_and_b32_e32 v4, 0xff00, v1
; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT:    v_or_b32_e32 v1, v4, v1
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_extra_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_and_b32_e32 v4, 0xffffff00, v1
; VI-NEXT:    v_add_u16_e32 v1, 9, v1
; VI-NEXT:    v_add_u16_e32 v3, 9, v0
; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; VI-NEXT:    v_and_b32_e32 v2, 0xffffff00, v0
; VI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; VI-NEXT:    v_or_b32_e32 v1, v4, v1
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    v_add_u16_e32 v1, 0x900, v1
; VI-NEXT:    v_add_u16_e32 v2, 0x900, v2
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_e32 v1, v2, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %add, ptr addrspace(1) %out1, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x2_extra_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s14, 0
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT:    v_and_b32_e32 v2, 0xff00, v0
; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; SI-NEXT:    v_and_b32_e32 v4, 0xff00, v1
; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT:    v_or_b32_e32 v1, v4, v1
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2_extra_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_and_b32_e32 v4, 0xffffff00, v1
; VI-NEXT:    v_add_u16_e32 v1, 9, v1
; VI-NEXT:    v_add_u16_e32 v3, 9, v0
; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; VI-NEXT:    v_and_b32_e32 v2, 0xffffff00, v0
; VI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; VI-NEXT:    v_or_b32_e32 v1, v4, v1
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    v_add_u16_e32 v1, 0x900, v1
; VI-NEXT:    v_add_u16_e32 v2, 0x900, v2
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_e32 v1, v2, v1
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %in.ptr, align 4
  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %add, ptr addrspace(1) %out1, align 4
  store <4 x i8> %val, ptr addrspace(1) %out2, align 4
  ret void
}

define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v3i8_align4:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align4:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <3 x i8>, ptr addrspace(1) %gep, align 4
  store <3 x i8> %val, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v3i8_align2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:2
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    buffer_store_short v1, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:2
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load <3 x i8>, ptr addrspace(1) %in, align 2
  store <3 x i8> %val, ptr addrspace(1) %out, align 2
  ret void
}

define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v3i8_align1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:1
; VI-NEXT:    s_endpgm
  %val = load <3 x i8>, ptr addrspace(1) %in, align 1
  store <3 x i8> %val, ptr addrspace(1) %out, align 1
  ret void
}

define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_volatile_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_volatile_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load volatile <4 x i8>, ptr addrspace(1) %in, align 4
  store <4 x i8> %val, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_volatile_store:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:3
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_volatile_store:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:3
; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:1
; VI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %val = load <4 x i8>, ptr addrspace(1) %in, align 4
  store volatile <4 x i8> %val, ptr addrspace(1) %out, align 4
  ret void
}