1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG 3; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL 4; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-SDAG 5; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-GISEL 6 7define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { 8; GCN-LABEL: ds1align1: 9; GCN: ; %bb.0: 10; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 11; GCN-NEXT: s_waitcnt lgkmcnt(0) 12; GCN-NEXT: v_mov_b32_e32 v0, s0 13; GCN-NEXT: ds_read_u8 v0, v0 14; GCN-NEXT: v_mov_b32_e32 v1, s1 15; GCN-NEXT: s_waitcnt lgkmcnt(0) 16; GCN-NEXT: ds_write_b8 v1, v0 17; GCN-NEXT: s_endpgm 18 %val = load i8, ptr addrspace(3) %in, align 1 19 store i8 %val, ptr addrspace(3) %out, align 1 20 ret void 21} 22 23define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { 24; ALIGNED-SDAG-LABEL: ds2align1: 25; ALIGNED-SDAG: ; %bb.0: 26; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 27; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 28; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 29; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 30; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:1 31; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1 32; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 33; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v1 34; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 35; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v0 offset:1 36; ALIGNED-SDAG-NEXT: s_endpgm 37; 38; ALIGNED-GISEL-LABEL: ds2align1: 39; ALIGNED-GISEL: ; %bb.0: 40; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 41; ALIGNED-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) 42; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 43; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 44; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:1 45; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1 46; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 47; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 8, v1 48; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 49; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v0 50; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v1 offset:1 51; ALIGNED-GISEL-NEXT: s_endpgm 52; 53; UNALIGNED-LABEL: ds2align1: 54; UNALIGNED: ; %bb.0: 55; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 56; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 57; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 58; UNALIGNED-NEXT: ds_read_u16 v0, v0 59; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 60; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 61; UNALIGNED-NEXT: ds_write_b16 v1, v0 62; UNALIGNED-NEXT: s_endpgm 63 %val = load i16, ptr addrspace(3) %in, align 1 64 store i16 %val, ptr addrspace(3) %out, align 1 65 ret void 66} 67 68define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { 69; GCN-LABEL: ds2align2: 70; GCN: ; %bb.0: 71; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 72; GCN-NEXT: s_waitcnt lgkmcnt(0) 73; GCN-NEXT: v_mov_b32_e32 v0, s0 74; GCN-NEXT: ds_read_u16 v0, v0 75; GCN-NEXT: v_mov_b32_e32 v1, s1 76; GCN-NEXT: s_waitcnt lgkmcnt(0) 77; GCN-NEXT: ds_write_b16 v1, v0 78; GCN-NEXT: s_endpgm 79 %val = load i16, ptr addrspace(3) %in, align 2 80 store i16 %val, ptr addrspace(3) %out, align 2 81 ret void 82} 83 84define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { 85; ALIGNED-SDAG-LABEL: ds4align1: 86; ALIGNED-SDAG: ; %bb.0: 87; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 88; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 89; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 90; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 91; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 92; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 93; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 
offset:3 94; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 95; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 96; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v1 97; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 98; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v2 offset:1 99; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 100; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v3 offset:2 101; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 102; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v0 offset:3 103; ALIGNED-SDAG-NEXT: s_endpgm 104; 105; ALIGNED-GISEL-LABEL: ds4align1: 106; ALIGNED-GISEL: ; %bb.0: 107; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 108; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8 109; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 110; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 111; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 112; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 113; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:3 114; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:2 115; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1 116; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 117; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 118; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 119; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3 120; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 121; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 122; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v2, v0, v1 123; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 124; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 125; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:1 126; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 127; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v0 offset:2 128; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:3 129; ALIGNED-GISEL-NEXT: s_endpgm 130; 131; UNALIGNED-LABEL: ds4align1: 132; UNALIGNED: ; %bb.0: 133; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 134; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 135; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 136; UNALIGNED-NEXT: ds_read_b32 v0, v0 137; 
UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 138; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 139; UNALIGNED-NEXT: ds_write_b32 v1, v0 140; UNALIGNED-NEXT: s_endpgm 141 %val = load i32, ptr addrspace(3) %in, align 1 142 store i32 %val, ptr addrspace(3) %out, align 1 143 ret void 144} 145 146define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { 147; ALIGNED-SDAG-LABEL: ds4align2: 148; ALIGNED-SDAG: ; %bb.0: 149; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 150; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 151; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 152; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 153; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:2 154; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1 155; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 156; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v1 157; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 158; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v0 offset:2 159; ALIGNED-SDAG-NEXT: s_endpgm 160; 161; ALIGNED-GISEL-LABEL: ds4align2: 162; ALIGNED-GISEL: ; %bb.0: 163; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 164; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 165; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 166; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 167; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:2 168; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1 169; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 170; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 171; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0 172; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v2, v0 offset:2 173; ALIGNED-GISEL-NEXT: s_endpgm 174; 175; UNALIGNED-LABEL: ds4align2: 176; UNALIGNED: ; %bb.0: 177; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 178; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 179; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 180; UNALIGNED-NEXT: ds_read_b32 v0, v0 181; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 182; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 183; UNALIGNED-NEXT: ds_write_b32 v1, v0 184; UNALIGNED-NEXT: s_endpgm 185 %val = load i32, ptr addrspace(3) %in, align 2 186 
store i32 %val, ptr addrspace(3) %out, align 2 187 ret void 188} 189 190define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { 191; GCN-LABEL: ds4align4: 192; GCN: ; %bb.0: 193; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 194; GCN-NEXT: s_waitcnt lgkmcnt(0) 195; GCN-NEXT: v_mov_b32_e32 v0, s0 196; GCN-NEXT: ds_read_b32 v0, v0 197; GCN-NEXT: v_mov_b32_e32 v1, s1 198; GCN-NEXT: s_waitcnt lgkmcnt(0) 199; GCN-NEXT: ds_write_b32 v1, v0 200; GCN-NEXT: s_endpgm 201 %val = load i32, ptr addrspace(3) %in, align 4 202 store i32 %val, ptr addrspace(3) %out, align 4 203 ret void 204} 205 206define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { 207; ALIGNED-SDAG-LABEL: ds8align1: 208; ALIGNED-SDAG: ; %bb.0: 209; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 210; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 211; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 212; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 213; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 214; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 215; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 216; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 217; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 218; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:6 219; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7 220; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v7, s1 221; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 222; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v5 offset:4 223; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 224; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v6 offset:5 225; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v1 226; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v2 offset:1 227; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v3 offset:2 228; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v4 offset:3 229; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 230; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v8 offset:6 231; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 232; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v0 offset:7 233; ALIGNED-SDAG-NEXT: s_endpgm 234; 235; 
ALIGNED-GISEL-LABEL: ds8align1: 236; ALIGNED-GISEL: ; %bb.0: 237; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 238; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 239; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 240; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 241; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 242; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 243; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 244; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 245; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 246; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 247; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:7 248; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 249; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 250; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 251; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 252; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 253; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 254; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 255; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 256; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 257; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 258; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7 259; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v3, v2 260; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v2, 8, v1 261; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 262; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 263; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:1 264; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, 8 265; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 266; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v1 offset:2 267; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v4 offset:3 268; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 269; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v0 offset:4 270; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:5 271; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 272; 
ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v0 offset:6 273; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:7 274; ALIGNED-GISEL-NEXT: s_endpgm 275; 276; UNALIGNED-LABEL: ds8align1: 277; UNALIGNED: ; %bb.0: 278; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 279; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 280; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 281; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 282; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1 283; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 284; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] 285; UNALIGNED-NEXT: s_endpgm 286 %val = load <2 x i32>, ptr addrspace(3) %in, align 1 287 store <2 x i32> %val, ptr addrspace(3) %out, align 1 288 ret void 289} 290 291define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { 292; ALIGNED-SDAG-LABEL: ds8align2: 293; ALIGNED-SDAG: ; %bb.0: 294; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 295; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 296; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 297; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:4 298; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 299; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 300; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6 301; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 302; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 303; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:4 304; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 305; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v2 306; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 307; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:2 308; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 309; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:6 310; ALIGNED-SDAG-NEXT: s_endpgm 311; 312; ALIGNED-GISEL-LABEL: ds8align2: 313; ALIGNED-GISEL: ; %bb.0: 314; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 315; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 316; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 317; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 318; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2 319; ALIGNED-GISEL-NEXT: 
ds_read_u16 v3, v0 offset:4 320; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:6 321; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 322; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 323; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 324; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 325; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 326; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1 327; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v1 offset:2 328; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:4 329; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v0 offset:6 330; ALIGNED-GISEL-NEXT: s_endpgm 331; 332; UNALIGNED-LABEL: ds8align2: 333; UNALIGNED: ; %bb.0: 334; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 335; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 336; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 337; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 338; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1 339; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 340; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] 341; UNALIGNED-NEXT: s_endpgm 342 %val = load <2 x i32>, ptr addrspace(3) %in, align 2 343 store <2 x i32> %val, ptr addrspace(3) %out, align 2 344 ret void 345} 346 347define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { 348; GCN-LABEL: ds8align4: 349; GCN: ; %bb.0: 350; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 351; GCN-NEXT: s_waitcnt lgkmcnt(0) 352; GCN-NEXT: v_mov_b32_e32 v0, s0 353; GCN-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 354; GCN-NEXT: v_mov_b32_e32 v2, s1 355; GCN-NEXT: s_waitcnt lgkmcnt(0) 356; GCN-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 357; GCN-NEXT: s_endpgm 358 %val = load <2 x i32>, ptr addrspace(3) %in, align 4 359 store <2 x i32> %val, ptr addrspace(3) %out, align 4 360 ret void 361} 362 363define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { 364; GCN-LABEL: ds8align8: 365; GCN: ; %bb.0: 366; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 367; GCN-NEXT: s_waitcnt lgkmcnt(0) 368; GCN-NEXT: v_mov_b32_e32 v0, s0 369; GCN-NEXT: ds_read_b64 
v[0:1], v0 370; GCN-NEXT: v_mov_b32_e32 v2, s1 371; GCN-NEXT: s_waitcnt lgkmcnt(0) 372; GCN-NEXT: ds_write_b64 v2, v[0:1] 373; GCN-NEXT: s_endpgm 374 %val = load <2 x i32>, ptr addrspace(3) %in, align 8 375 store <2 x i32> %val, ptr addrspace(3) %out, align 8 376 ret void 377} 378 379define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { 380; ALIGNED-SDAG-LABEL: ds12align1: 381; ALIGNED-SDAG: ; %bb.0: 382; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 383; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 384; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 385; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 386; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 387; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 388; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 389; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 390; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 391; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6 392; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7 393; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8 394; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9 395; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 396; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:11 397; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s1 398; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 399; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8 400; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 401; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v10 offset:9 402; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 403; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 404; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v1 405; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v2 offset:1 406; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v3 offset:2 407; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v4 offset:3 408; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v7 offset:6 409; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v8 offset:7 410; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) 411; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 412; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) 
413; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 414; ALIGNED-SDAG-NEXT: s_endpgm 415; 416; ALIGNED-GISEL-LABEL: ds12align1: 417; ALIGNED-GISEL: ; %bb.0: 418; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 419; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 420; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 421; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 422; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 423; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 424; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 425; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 426; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 427; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 428; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7 429; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 430; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 431; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 432; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 433; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 434; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 435; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 436; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 437; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8 438; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9 439; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10 440; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:11 441; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 442; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v8 443; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 444; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 445; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3 446; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 447; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5 448; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 449; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 450; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v4, v3 451; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v3, 8, v1 452; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 453; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2 454; 
ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 455; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:1 456; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, 8 457; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 458; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v1 offset:2 459; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v5 offset:3 460; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 461; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v2 offset:4 462; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:5 463; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 464; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v2 offset:6 465; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:7 466; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 467; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 offset:8 468; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:9 469; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 470; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:10 471; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:11 472; ALIGNED-GISEL-NEXT: s_endpgm 473; 474; UNALIGNED-LABEL: ds12align1: 475; UNALIGNED: ; %bb.0: 476; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 477; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 478; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 479; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 480; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 481; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 482; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 483; UNALIGNED-NEXT: s_endpgm 484 %val = load <3 x i32>, ptr addrspace(3) %in, align 1 485 store <3 x i32> %val, ptr addrspace(3) %out, align 1 486 ret void 487} 488 489define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { 490; ALIGNED-SDAG-LABEL: ds12align2: 491; ALIGNED-SDAG: ; %bb.0: 492; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 493; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 
494; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 495; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8 496; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 497; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 498; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 499; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 500; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1 501; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:10 502; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 503; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 offset:8 504; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 505; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:4 506; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 507; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:2 508; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 509; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:6 510; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 511; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:10 512; ALIGNED-SDAG-NEXT: s_endpgm 513; 514; ALIGNED-GISEL-LABEL: ds12align2: 515; ALIGNED-GISEL: ; %bb.0: 516; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 517; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 518; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 519; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 520; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2 521; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4 522; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6 523; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8 524; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:10 525; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v6, s1 526; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 527; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 528; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 529; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 530; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 531; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v5 532; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v1 533; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v1 offset:2 534; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v2 offset:4 535; ALIGNED-GISEL-NEXT: 
ds_write_b16_d16_hi v6, v2 offset:6 536; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v0 offset:8 537; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v0 offset:10 538; ALIGNED-GISEL-NEXT: s_endpgm 539; 540; UNALIGNED-LABEL: ds12align2: 541; UNALIGNED: ; %bb.0: 542; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 543; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 544; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 545; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 546; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 547; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 548; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 549; UNALIGNED-NEXT: s_endpgm 550 %val = load <3 x i32>, ptr addrspace(3) %in, align 2 551 store <3 x i32> %val, ptr addrspace(3) %out, align 2 552 ret void 553} 554 555define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { 556; ALIGNED-LABEL: ds12align4: 557; ALIGNED: ; %bb.0: 558; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 559; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 560; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 561; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 562; ALIGNED-NEXT: ds_read_b32 v2, v2 offset:8 563; ALIGNED-NEXT: v_mov_b32_e32 v3, s1 564; ALIGNED-NEXT: s_waitcnt lgkmcnt(1) 565; ALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 566; ALIGNED-NEXT: s_waitcnt lgkmcnt(1) 567; ALIGNED-NEXT: ds_write_b32 v3, v2 offset:8 568; ALIGNED-NEXT: s_endpgm 569; 570; UNALIGNED-SDAG-LABEL: ds12align4: 571; UNALIGNED-SDAG: ; %bb.0: 572; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 573; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 574; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 575; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 576; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8 577; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1 578; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 579; UNALIGNED-SDAG-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 580; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 581; UNALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8 582; UNALIGNED-SDAG-NEXT: s_endpgm 583; 
584; UNALIGNED-GISEL-LABEL: ds12align4: 585; UNALIGNED-GISEL: ; %bb.0: 586; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 587; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 588; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 589; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0 590; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 591; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 592; UNALIGNED-GISEL-NEXT: ds_write_b96 v3, v[0:2] 593; UNALIGNED-GISEL-NEXT: s_endpgm 594 %val = load <3 x i32>, ptr addrspace(3) %in, align 4 595 store <3 x i32> %val, ptr addrspace(3) %out, align 4 596 ret void 597} 598 599define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { 600; ALIGNED-SDAG-LABEL: ds12align8: 601; ALIGNED-SDAG: ; %bb.0: 602; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 603; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 604; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 605; ALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v2 606; ALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8 607; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1 608; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 609; ALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1] 610; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 611; ALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8 612; ALIGNED-SDAG-NEXT: s_endpgm 613; 614; ALIGNED-GISEL-LABEL: ds12align8: 615; ALIGNED-GISEL: ; %bb.0: 616; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 617; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 618; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0 619; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 620; ALIGNED-GISEL-NEXT: ds_read_b32 v2, v2 offset:8 621; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 622; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 623; ALIGNED-GISEL-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 624; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 625; ALIGNED-GISEL-NEXT: ds_write_b32 v3, v2 offset:8 626; ALIGNED-GISEL-NEXT: s_endpgm 627; 628; UNALIGNED-SDAG-LABEL: ds12align8: 629; UNALIGNED-SDAG: ; %bb.0: 630; UNALIGNED-SDAG-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 631; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 632; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 633; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v0 offset:8 634; UNALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v0 635; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1 636; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 637; UNALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8 638; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 639; UNALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1] 640; UNALIGNED-SDAG-NEXT: s_endpgm 641; 642; UNALIGNED-GISEL-LABEL: ds12align8: 643; UNALIGNED-GISEL: ; %bb.0: 644; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 645; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 646; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 647; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0 648; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 649; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 650; UNALIGNED-GISEL-NEXT: ds_write_b96 v3, v[0:2] 651; UNALIGNED-GISEL-NEXT: s_endpgm 652 %val = load <3 x i32>, ptr addrspace(3) %in, align 8 653 store <3 x i32> %val, ptr addrspace(3) %out, align 8 654 ret void 655} 656 657define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %out) { 658; GCN-LABEL: ds12align16: 659; GCN: ; %bb.0: 660; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 661; GCN-NEXT: s_waitcnt lgkmcnt(0) 662; GCN-NEXT: v_mov_b32_e32 v0, s0 663; GCN-NEXT: ds_read_b96 v[0:2], v0 664; GCN-NEXT: v_mov_b32_e32 v3, s1 665; GCN-NEXT: s_waitcnt lgkmcnt(0) 666; GCN-NEXT: ds_write_b96 v3, v[0:2] 667; GCN-NEXT: s_endpgm 668 %val = load <3 x i32>, ptr addrspace(3) %in, align 16 669 store <3 x i32> %val, ptr addrspace(3) %out, align 16 670 ret void 671} 672 673define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { 674; ALIGNED-SDAG-LABEL: ds16align1: 675; ALIGNED-SDAG: ; %bb.0: 676; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 677; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 678; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 679; 
ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 680; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 681; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 682; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 683; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 684; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 685; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6 686; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7 687; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8 688; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9 689; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 690; ALIGNED-SDAG-NEXT: ds_read_u8 v12, v0 offset:11 691; ALIGNED-SDAG-NEXT: ds_read_u8 v13, v0 offset:12 692; ALIGNED-SDAG-NEXT: ds_read_u8 v14, v0 offset:13 693; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14 694; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15 695; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1 696; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 697; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12 698; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 699; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 700; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v1 701; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v2 offset:1 702; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v5 offset:4 703; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v6 offset:5 704; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v9 offset:8 705; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v10 offset:9 706; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2 707; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3 708; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6 709; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7 710; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10 711; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11 712; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) 713; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v15 offset:14 714; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v0 offset:15 715; ALIGNED-SDAG-NEXT: s_endpgm 716; 717; ALIGNED-GISEL-LABEL: ds16align1: 718; ALIGNED-GISEL: ; %bb.0: 719; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x24 720; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 721; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 722; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 723; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 724; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 725; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 726; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 727; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 728; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 729; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7 730; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 731; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 732; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 733; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 734; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 735; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 736; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 737; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 738; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 739; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v8 740; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v7 741; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2 742; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8 743; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9 744; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10 745; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:11 746; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:12 747; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:13 748; ALIGNED-GISEL-NEXT: ds_read_u8 v9, v0 offset:14 749; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:15 750; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 751; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3 752; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 753; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v6 754; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 755; ALIGNED-GISEL-NEXT: v_or3_b32 v3, v4, v5, v3 756; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 757; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v4, v8, 8, v7 758; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 759; 
ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 760; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v9 761; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v5, v4 762; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 763; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1 764; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 765; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:1 766; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8 767; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v6, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 768; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v1 offset:2 769; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v6 offset:3 770; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 771; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v2 offset:4 772; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:5 773; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 774; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v2 offset:6 775; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:7 776; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v3 777; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v3 offset:8 778; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:9 779; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 780; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v3 offset:10 781; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:11 782; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 783; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 offset:12 784; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:13 785; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 786; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v0 offset:14 787; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:15 788; ALIGNED-GISEL-NEXT: s_endpgm 789; 790; UNALIGNED-LABEL: ds16align1: 791; UNALIGNED: ; %bb.0: 792; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 793; UNALIGNED-NEXT: 
s_waitcnt lgkmcnt(0) 794; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 795; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0 796; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1 797; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 798; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3] 799; UNALIGNED-NEXT: s_endpgm 800 %val = load <4 x i32>, ptr addrspace(3) %in, align 1 801 store <4 x i32> %val, ptr addrspace(3) %out, align 1 802 ret void 803} 804; Copies a 16-byte <4 x i32> between addrspace(3) pointers at align 2 - checks expect eight 2-byte ds accesses each way without unaligned-access-mode and one b128 access each way with it. 805define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { 806; ALIGNED-SDAG-LABEL: ds16align2: 807; ALIGNED-SDAG: ; %bb.0: 808; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 809; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 810; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 811; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12 812; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 813; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 814; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 815; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 816; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8 817; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10 818; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s1 819; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14 820; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 821; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v1 offset:12 822; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 823; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v2 824; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) 825; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v4 offset:4 826; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 827; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v6 offset:8 828; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v3 offset:2 829; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v5 offset:6 830; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 831; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v7 offset:10 832; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 833; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v0 offset:14 834; ALIGNED-SDAG-NEXT: s_endpgm 835; 836; ALIGNED-GISEL-LABEL: ds16align2: 837; ALIGNED-GISEL: ; %bb.0: 838; ALIGNED-GISEL-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x24 839; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 840; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 841; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 842; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2 843; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4 844; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6 845; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8 846; ALIGNED-GISEL-NEXT: ds_read_u16 v6, v0 offset:10 847; ALIGNED-GISEL-NEXT: ds_read_u16 v7, v0 offset:12 848; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:14 849; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 850; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 851; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 852; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 853; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 854; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 855; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v6, 16, v5 856; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 857; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v7 858; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1 859; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v1 offset:2 860; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v2 offset:4 861; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v2 offset:6 862; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v3 offset:8 863; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v3 offset:10 864; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:12 865; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v0 offset:14 866; ALIGNED-GISEL-NEXT: s_endpgm 867; 868; UNALIGNED-LABEL: ds16align2: 869; UNALIGNED: ; %bb.0: 870; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 871; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 872; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 873; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0 874; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1 875; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 876; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3] 877; UNALIGNED-NEXT: s_endpgm 878 %val = load <4 x i32>, ptr addrspace(3) %in, align 2 879 store <4 x i32> %val, ptr addrspace(3) %out, align 2 880 ret 
void 881} 882; Copies a 16-byte <4 x i32> between addrspace(3) pointers at align 4 - checks expect ds_read2/ds_write2 pairs (b32 or b64) in every configuration, never a single b128 access. 883define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { 884; ALIGNED-LABEL: ds16align4: 885; ALIGNED: ; %bb.0: 886; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 887; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 888; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 889; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 890; ALIGNED-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 891; ALIGNED-NEXT: v_mov_b32_e32 v4, s1 892; ALIGNED-NEXT: s_waitcnt lgkmcnt(1) 893; ALIGNED-NEXT: ds_write2_b32 v4, v0, v1 offset1:1 894; ALIGNED-NEXT: s_waitcnt lgkmcnt(1) 895; ALIGNED-NEXT: ds_write2_b32 v4, v2, v3 offset0:2 offset1:3 896; ALIGNED-NEXT: s_endpgm 897; 898; UNALIGNED-SDAG-LABEL: ds16align4: 899; UNALIGNED-SDAG: ; %bb.0: 900; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 901; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 902; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 903; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:3 904; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 905; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 906; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 907; UNALIGNED-SDAG-NEXT: ds_write2_b32 v4, v0, v1 offset0:2 offset1:3 908; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 909; UNALIGNED-SDAG-NEXT: ds_write2_b32 v4, v2, v3 offset1:1 910; UNALIGNED-SDAG-NEXT: s_endpgm 911; 912; UNALIGNED-GISEL-LABEL: ds16align4: 913; UNALIGNED-GISEL: ; %bb.0: 914; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 915; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 916; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 917; UNALIGNED-GISEL-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 918; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 919; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 920; UNALIGNED-GISEL-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 921; UNALIGNED-GISEL-NEXT: s_endpgm 922 %val = load <4 x i32>, ptr addrspace(3) %in, align 4 923 store <4 x i32> %val, ptr addrspace(3) %out, align 4 924 ret void 925} 926; Copies a 16-byte <4 x i32> between addrspace(3) pointers at align 8 - all configurations share one ds_read2_b64/ds_write2_b64 check sequence. 927define amdgpu_kernel 
void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { 928; GCN-LABEL: ds16align8: 929; GCN: ; %bb.0: 930; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 931; GCN-NEXT: s_waitcnt lgkmcnt(0) 932; GCN-NEXT: v_mov_b32_e32 v0, s0 933; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 934; GCN-NEXT: v_mov_b32_e32 v4, s1 935; GCN-NEXT: s_waitcnt lgkmcnt(0) 936; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 937; GCN-NEXT: s_endpgm 938 %val = load <4 x i32>, ptr addrspace(3) %in, align 8 939 store <4 x i32> %val, ptr addrspace(3) %out, align 8 940 ret void 941} 942; Copies a 16-byte <4 x i32> between addrspace(3) pointers at align 16 - all configurations share one ds_read_b128/ds_write_b128 check sequence. 943define amdgpu_kernel void @ds16align16(ptr addrspace(3) %in, ptr addrspace(3) %out) { 944; GCN-LABEL: ds16align16: 945; GCN: ; %bb.0: 946; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 947; GCN-NEXT: s_waitcnt lgkmcnt(0) 948; GCN-NEXT: v_mov_b32_e32 v0, s0 949; GCN-NEXT: ds_read_b128 v[0:3], v0 950; GCN-NEXT: v_mov_b32_e32 v4, s1 951; GCN-NEXT: s_waitcnt lgkmcnt(0) 952; GCN-NEXT: ds_write_b128 v4, v[0:3] 953; GCN-NEXT: s_endpgm 954 %val = load <4 x i32>, ptr addrspace(3) %in, align 16 955 store <4 x i32> %val, ptr addrspace(3) %out, align 16 956 ret void 957} 958