; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s

; FIXME: We don't get cases where the address was an SGPR because we
; get a copy to the address register for each one.

@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8

; Two float loads from @lds at element indices tid and tid+8; the checks
; expect them to be merged into one ds_read2_b32 with offset1:8.
define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 {
; CI-LABEL: simple_read2_f32:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; Element distance 255: the checks still expect a merged ds_read2_b32
; (offset1:255).
define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 {
; CI-LABEL: simple_read2_f32_max_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:255
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_max_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:255
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 255
  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; Element distance 257: the checks expect two separate ds_read_b32
; instructions (the second at byte offset 1028) instead of a merged read.
define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 {
; CI-LABEL: simple_read2_f32_too_far:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:1028
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_too_far:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:1028
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 257
  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; Four loads at tid+0/tid+8 and tid+11/tid+27: the checks expect two
; ds_read2_b32 instructions, one per pair.
define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 {
; CI-LABEL: simple_read2_f32_x2:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v3, v4
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 0
  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
  %sum.0 = fadd float %val0, %val1

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
  %val2 = load float, ptr addrspace(3) %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
  %val3 = load float, ptr addrspace(3) %arrayidx3, align 4
  %sum.1 = fadd float %val2, %val3

  %sum = fadd float %sum.0, %sum.1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %idx.0
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; Make sure there is an instruction between the two sets of reads.
define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 {
; CI-LABEL: simple_read2_f32_x2_barrier:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_barrier
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v3, v4
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_x2_barrier:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_barrier
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 0
  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
  %sum.0 = fadd float %val0, %val1

  ; The barrier separates the first pair of loads from the second pair; the
  ; checks expect s_barrier between the two ds_read2_b32 instructions.
  call void @llvm.amdgcn.s.barrier() #2

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
  %val2 = load float, ptr addrspace(3) %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
  %val3 = load float, ptr addrspace(3) %arrayidx3, align 4
  %sum.1 = fadd float %val2, %val3

  %sum = fadd float %sum.0, %sum.1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %idx.0
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; For some reason adding something to the base address for the first
; element results in only folding the inner pair.
; NOTE(review): the assertions below show both pairs merged into
; ds_read2_b32 (offset0:2 offset1:8 and offset0:11 offset1:27), so the
; sentence above appears to describe older behaviour -- confirm and update.
define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %out) #0 {
; CI-LABEL: simple_read2_f32_x2_nonzero_base:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:2 offset1:8
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v3, v4
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_x2_nonzero_base:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:8
; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add_f32_e32 v1, v2, v3
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v4, v0, s[0:1] offset:8
; GFX9-NEXT: s_endpgm
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %idx.0 = add nsw i32 %tid.x, 2
  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
  %sum.0 = fadd float %val0, %val1

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
  %val2 = load float, ptr addrspace(3) %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
  %val3 = load float, ptr addrspace(3) %arrayidx3, align 4
  %sum.1 = fadd float %val2, %val3

  %sum = fadd float %sum.0, %sum.1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %idx.0
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; Be careful of vectors of pointers. We don't know if the 2 pointers
; in the vectors are really the same base, so this is not safe to
; merge.
; Base pointers come from different subregister of same super
; register. We can't safely merge this.
define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 {
; CI-LABEL: read2_ptr_is_subreg_arg_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s3
; CI-NEXT: ds_read_b32 v1, v1 offset:32
; CI-NEXT: ds_read_b32 v2, v2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: read2_ptr_is_subreg_arg_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
; GFX9-NEXT: ds_read_b32 v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  ; Note: both insertelements below write lane 0, so lane 0 ends up holding 8
  ; and lane 1 stays undef. The expected reads (offset:32 on the first, no
  ; offset on the second) reflect that; changing the lane index would
  ; require regenerating the assertions.
  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
  %gep = getelementptr inbounds float, <2 x ptr addrspace(3)> %lds.ptr, <2 x i32> %index.1
  %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
  %gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
  %val0 = load float, ptr addrspace(3) %gep.0, align 4
  %val1 = load float, ptr addrspace(3) %gep.1, align 4
  %add.x = add nsw i32 %x.i, 8
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; Apply a constant scalar offset after the pointer vector extract. We
; are rejecting merges that have the same, constant 0 offset, so make
; sure we are really rejecting it because of the different
; subregisters.
define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 {
; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s3
; CI-NEXT: ds_read_b32 v1, v1 offset:32
; CI-NEXT: ds_read_b32 v2, v2 offset:32
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
; GFX9-NEXT: ds_read_b32 v2, v2 offset:32
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
  %gep = getelementptr inbounds float, <2 x ptr addrspace(3)> %lds.ptr, <2 x i32> %index.1
  %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
  %gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1

  ; Apply an additional offset after the vector that will be more obviously folded.
  %gep.1.offset = getelementptr float, ptr addrspace(3) %gep.1, i32 8

  %val0 = load float, ptr addrspace(3) %gep.0, align 4
  %val1 = load float, ptr addrspace(3) %gep.1.offset, align 4
  %add.x = add nsw i32 %x.i, 8
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; Pointer vector whose two lanes are both built from the same global (@lds):
; here the checks do expect a merged ds_read2_b32 (offset1:8).
define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 {
; CI-LABEL: read2_ptr_is_subreg_f32:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: read2_ptr_is_subreg_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %ptr.0 = insertelement <2 x ptr addrspace(3)> undef, ptr addrspace(3) @lds, i32 0
  %ptr.1 = insertelement <2 x ptr addrspace(3)> %ptr.0, ptr addrspace(3) @lds, i32 1
  %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1
  %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8>
  %gep = getelementptr inbounds [512 x float], <2 x ptr addrspace(3)> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
  %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
  %gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
  %val0 = load float, ptr addrspace(3) %gep.0, align 4
  %val1 = load float, ptr addrspace(3) %gep.1, align 4
  %add.x = add nsw i32 %x.i, 8
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; The first of the two loads is volatile: the checks expect two separate
; ds_read_b32 instructions rather than a merged read.
define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 {
; CI-LABEL: simple_read2_f32_volatile_0:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:32
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_volatile_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:32
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
  %val0 = load volatile float, ptr addrspace(3) %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; The second of the two loads is volatile: again two separate ds_read_b32
; instructions are expected.
define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 {
; CI-LABEL: simple_read2_f32_volatile_1:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v1, v0
; CI-NEXT: ds_read_b32 v2, v0 offset:32
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f32_e32 v2, v1, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f32_volatile_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: ds_read_b32 v2, v0 offset:32
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
  %val1 = load volatile float, ptr addrspace(3) %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; Can't fold since not correctly aligned.
; This holds for the bonaire run and for the gfx900 run with
; -unaligned-access-mode; for the gfx900 run with +unaligned-access-mode the
; assertions of the following function expect a merged ds_read2_b32.
; Two align-1 float loads at element indices tid and tid+8 from a pointer
; argument. The assertions expect byte-wise ds_read_u8 sequences except in the
; run with +unaligned-access-mode, where a single ds_read2_b32 is expected.
define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
; CI-LABEL: unaligned_read2_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[4:5], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0
; CI-NEXT: ds_read_u8 v2, v1 offset:34
; CI-NEXT: ds_read_u8 v3, v1 offset:32
; CI-NEXT: ds_read_u8 v4, v1 offset:3
; CI-NEXT: ds_read_u8 v5, v1 offset:2
; CI-NEXT: ds_read_u8 v6, v1 offset:1
; CI-NEXT: ds_read_u8 v7, v1
; CI-NEXT: ds_read_u8 v8, v1 offset:33
; CI-NEXT: ds_read_u8 v1, v1 offset:35
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; CI-NEXT: v_or_b32_e32 v4, v4, v5
; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8
; CI-NEXT: v_or_b32_e32 v1, v1, v2
; CI-NEXT: v_or_b32_e32 v6, v6, v7
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v3, v5, v3
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v4, v4, v6
; CI-NEXT: v_or_b32_e32 v1, v1, v3
; CI-NEXT: v_add_f32_e32 v2, v4, v1
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: unaligned_read2_f32:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s2, v0
; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1
; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:1
; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:2
; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v1 offset:3
; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v1 offset:32
; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v1 offset:33
; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v1 offset:34
; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v1 offset:35
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(6)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(4)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3
; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: unaligned_read2_f32:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[4:5], 0x8
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2
; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-UNALIGNED-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %x.i
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 1
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 1
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; Two align-1 float loads at byte offsets 5 and 9 from the per-lane base.
; The assertions expect ds_read_u8 sequences except in the run with
; +unaligned-access-mode, where a single ds_read_b64 at offset:5 is expected.
define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
; CI-LABEL: unaligned_offset_read2_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[4:5], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0
; CI-NEXT: ds_read_u8 v2, v1 offset:11
; CI-NEXT: ds_read_u8 v3, v1 offset:9
; CI-NEXT: ds_read_u8 v4, v1 offset:8
; CI-NEXT: ds_read_u8 v5, v1 offset:7
; CI-NEXT: ds_read_u8 v6, v1 offset:6
; CI-NEXT: ds_read_u8 v7, v1 offset:5
; CI-NEXT: ds_read_u8 v8, v1 offset:10
; CI-NEXT: ds_read_u8 v1, v1 offset:12
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; CI-NEXT: v_or_b32_e32 v4, v4, v5
; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8
; CI-NEXT: v_or_b32_e32 v1, v1, v2
; CI-NEXT: v_or_b32_e32 v6, v6, v7
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v3, v5, v3
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v4, v4, v6
; CI-NEXT: v_or_b32_e32 v1, v1, v3
; CI-NEXT: v_add_f32_e32 v2, v4, v1
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s2, v0
; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 offset:5
; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:6
; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:7
; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v1 offset:8
; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v1 offset:9
; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v1 offset:10
; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v1 offset:11
; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v1 offset:12
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(6)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(4)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3
; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[4:5], 0x8
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2
; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 offset:5
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-UNALIGNED-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %base = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %x.i
  %addr0.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 5
  %val0 = load float, ptr addrspace(3) %addr0.i8, align 1
  %addr1.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 9
  %val1 = load float, ptr addrspace(3) %addr1.i8, align 1
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; Two align-2 float loads at element indices tid and tid+8. The assertions
; expect ds_read_u16 pieces except in the run with +unaligned-access-mode,
; where a single ds_read2_b32 is expected.
define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
; CI-LABEL: misaligned_2_simple_read2_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[4:5], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0
; CI-NEXT: ds_read_u16 v2, v1 offset:32
; CI-NEXT: ds_read_u16 v3, v1 offset:2
; CI-NEXT: ds_read_u16 v4, v1
; CI-NEXT: ds_read_u16 v1, v1 offset:34
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v3, v3, v4
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v1, v1, v2
; CI-NEXT: v_add_f32_e32 v2, v3, v1
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dword s0, s[4:5], 0x8
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0
; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1
; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2
; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32
; GFX9-ALIGNED-NEXT: ds_read_u16 v1, v1 offset:34
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4
; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1
; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[4:5], 0x8
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2
; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-UNALIGNED-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %x.i
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 2
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 2
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; Double-precision variant: two loads from @lds.f64 at element indices tid
; and tid+8 are expected to merge into one ds_read2_b64 (offset1:8).
define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 {
; CI-LABEL: simple_read2_f64:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:8
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v5, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; CI-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
  store double %sum, ptr addrspace(1) %out.gep, align 8
  ret void
}

define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 {
; CI-LABEL: simple_read2_f64_max_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:255
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v5, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; CI-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_read2_f64_max_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:255
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 255
  %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32
%add.x 834 %val1 = load double, ptr addrspace(3) %arrayidx1, align 8 835 %sum = fadd double %val0, %val1 836 %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i 837 store double %sum, ptr addrspace(1) %out.gep, align 8 838 ret void 839} 840 841define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { 842; CI-LABEL: simple_read2_f64_too_far: 843; CI: ; %bb.0: 844; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 845; CI-NEXT: s_mov_b32 m0, -1 846; CI-NEXT: ds_read_b64 v[1:2], v0 847; CI-NEXT: ds_read_b64 v[3:4], v0 offset:2056 848; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 849; CI-NEXT: s_mov_b32 s3, 0xf000 850; CI-NEXT: s_mov_b32 s2, 0 851; CI-NEXT: s_waitcnt lgkmcnt(0) 852; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4] 853; CI-NEXT: v_mov_b32_e32 v1, 0 854; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 855; CI-NEXT: s_endpgm 856; 857; GFX9-LABEL: simple_read2_f64_too_far: 858; GFX9: ; %bb.0: 859; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 860; GFX9-NEXT: ds_read_b64 v[0:1], v4 861; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:2056 862; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 863; GFX9-NEXT: s_waitcnt lgkmcnt(0) 864; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 865; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 866; GFX9-NEXT: s_endpgm 867 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 868 %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i 869 %val0 = load double, ptr addrspace(3) %arrayidx0, align 8 870 %add.x = add nsw i32 %x.i, 257 871 %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x 872 %val1 = load double, ptr addrspace(3) %arrayidx1, align 8 873 %sum = fadd double %val0, %val1 874 %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i 875 store double %sum, ptr addrspace(1) %out.gep, align 8 876 ret void 877} 878 879; Alignment only 4 880define amdgpu_kernel void @misaligned_read2_f64(ptr 
addrspace(1) %out, ptr addrspace(3) %lds) #0 { 881; CI-LABEL: misaligned_read2_f64: 882; CI: ; %bb.0: 883; CI-NEXT: s_load_dword s0, s[4:5], 0x2 884; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 885; CI-NEXT: s_mov_b32 m0, -1 886; CI-NEXT: s_mov_b32 s3, 0xf000 887; CI-NEXT: s_mov_b32 s2, 0 888; CI-NEXT: s_waitcnt lgkmcnt(0) 889; CI-NEXT: v_add_i32_e32 v3, vcc, s0, v0 890; CI-NEXT: ds_read2_b32 v[1:2], v3 offset1:1 891; CI-NEXT: ds_read2_b32 v[3:4], v3 offset0:14 offset1:15 892; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 893; CI-NEXT: s_waitcnt lgkmcnt(0) 894; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4] 895; CI-NEXT: v_mov_b32_e32 v1, 0 896; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 897; CI-NEXT: s_endpgm 898; 899; GFX9-LABEL: misaligned_read2_f64: 900; GFX9: ; %bb.0: 901; GFX9-NEXT: s_load_dword s0, s[4:5], 0x8 902; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 903; GFX9-NEXT: s_waitcnt lgkmcnt(0) 904; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 905; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 906; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:14 offset1:15 907; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 908; GFX9-NEXT: s_waitcnt lgkmcnt(0) 909; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 910; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 911; GFX9-NEXT: s_endpgm 912 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 913 %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i 914 %val0 = load double, ptr addrspace(3) %arrayidx0, align 4 915 %add.x = add nsw i32 %x.i, 7 916 %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x 917 %val1 = load double, ptr addrspace(3) %arrayidx1, align 4 918 %sum = fadd double %val0, %val1 919 %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i 920 store double %sum, ptr addrspace(1) %out.gep, align 4 921 ret void 922} 923 924@foo = addrspace(3) global [4 x i32] undef, align 4 925 926define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) 
{ 927; CI-LABEL: load_constant_adjacent_offsets: 928; CI: ; %bb.0: 929; CI-NEXT: v_mov_b32_e32 v0, 0 930; CI-NEXT: s_mov_b32 m0, -1 931; CI-NEXT: ds_read_b64 v[0:1], v0 932; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 933; CI-NEXT: s_mov_b32 s3, 0xf000 934; CI-NEXT: s_mov_b32 s2, -1 935; CI-NEXT: s_waitcnt lgkmcnt(0) 936; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 937; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 938; CI-NEXT: s_endpgm 939; 940; GFX9-LABEL: load_constant_adjacent_offsets: 941; GFX9: ; %bb.0: 942; GFX9-NEXT: v_mov_b32_e32 v2, 0 943; GFX9-NEXT: ds_read_b64 v[0:1], v2 944; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 945; GFX9-NEXT: s_waitcnt lgkmcnt(0) 946; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 947; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 948; GFX9-NEXT: s_endpgm 949 %val0 = load i32, ptr addrspace(3) @foo, align 4 950 %val1 = load i32, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 1), align 4 951 %sum = add i32 %val0, %val1 952 store i32 %sum, ptr addrspace(1) %out, align 4 953 ret void 954} 955 956define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) { 957; CI-LABEL: load_constant_disjoint_offsets: 958; CI: ; %bb.0: 959; CI-NEXT: v_mov_b32_e32 v0, 0 960; CI-NEXT: s_mov_b32 m0, -1 961; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:2 962; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 963; CI-NEXT: s_mov_b32 s3, 0xf000 964; CI-NEXT: s_mov_b32 s2, -1 965; CI-NEXT: s_waitcnt lgkmcnt(0) 966; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 967; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 968; CI-NEXT: s_endpgm 969; 970; GFX9-LABEL: load_constant_disjoint_offsets: 971; GFX9: ; %bb.0: 972; GFX9-NEXT: v_mov_b32_e32 v2, 0 973; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:2 974; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 975; GFX9-NEXT: s_waitcnt lgkmcnt(0) 976; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 977; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 978; GFX9-NEXT: s_endpgm 979 %val0 = load i32, ptr 
addrspace(3) @foo, align 4 980 %val1 = load i32, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 2), align 4 981 %sum = add i32 %val0, %val1 982 store i32 %sum, ptr addrspace(1) %out, align 4 983 ret void 984} 985 986@bar = addrspace(3) global [4 x i64] undef, align 4 987 988define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) %out) { 989; CI-LABEL: load_misaligned64_constant_offsets: 990; CI: ; %bb.0: 991; CI-NEXT: v_mov_b32_e32 v0, 0 992; CI-NEXT: s_mov_b32 m0, -1 993; CI-NEXT: ds_read_b128 v[0:3], v0 994; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 995; CI-NEXT: s_mov_b32 s3, 0xf000 996; CI-NEXT: s_mov_b32 s2, -1 997; CI-NEXT: s_waitcnt lgkmcnt(0) 998; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 999; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 1000; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1001; CI-NEXT: s_endpgm 1002; 1003; GFX9-LABEL: load_misaligned64_constant_offsets: 1004; GFX9: ; %bb.0: 1005; GFX9-NEXT: v_mov_b32_e32 v4, 0 1006; GFX9-NEXT: ds_read_b128 v[0:3], v4 1007; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1008; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1009; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 1010; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 1011; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 1012; GFX9-NEXT: s_endpgm 1013 %val0 = load i64, ptr addrspace(3) @bar, align 4 1014 %val1 = load i64, ptr addrspace(3) getelementptr inbounds ([4 x i64], ptr addrspace(3) @bar, i32 0, i32 1), align 4 1015 %sum = add i64 %val0, %val1 1016 store i64 %sum, ptr addrspace(1) %out, align 8 1017 ret void 1018} 1019 1020@bar.large = addrspace(3) global [4096 x i64] undef, align 4 1021 1022define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspace(1) %out) { 1023; CI-LABEL: load_misaligned64_constant_large_offsets: 1024; CI: ; %bb.0: 1025; CI-NEXT: v_mov_b32_e32 v2, 0 1026; CI-NEXT: s_mov_b32 m0, -1 1027; CI-NEXT: ds_read_b64 v[0:1], v2 offset:16384 1028; CI-NEXT: 
ds_read_b64 v[2:3], v2 offset:32760 1029; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1030; CI-NEXT: s_mov_b32 s3, 0xf000 1031; CI-NEXT: s_mov_b32 s2, -1 1032; CI-NEXT: s_waitcnt lgkmcnt(0) 1033; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1034; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 1035; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1036; CI-NEXT: s_endpgm 1037; 1038; GFX9-LABEL: load_misaligned64_constant_large_offsets: 1039; GFX9: ; %bb.0: 1040; GFX9-NEXT: v_mov_b32_e32 v4, 0 1041; GFX9-NEXT: ds_read_b64 v[0:1], v4 offset:16384 1042; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:32760 1043; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1044; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1045; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 1046; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 1047; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 1048; GFX9-NEXT: s_endpgm 1049 %val0 = load i64, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 2048), align 4 1050 %val1 = load i64, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 4095), align 4 1051 %sum = add i64 %val0, %val1 1052 store i64 %sum, ptr addrspace(1) %out, align 8 1053 ret void 1054} 1055 1056@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 1057@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 1058 1059define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb) #0 { 1060; CI-LABEL: sgemm_inner_loop_read2_sequence: 1061; CI: ; %bb.0: 1062; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1063; CI-NEXT: s_lshl_b32 s4, s8, 2 1064; CI-NEXT: s_add_i32 s5, s4, 0xc20 1065; CI-NEXT: s_addk_i32 s4, 0xc60 1066; CI-NEXT: v_mov_b32_e32 v0, s5 1067; CI-NEXT: v_mov_b32_e32 v2, s4 1068; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1 1069; CI-NEXT: s_mov_b32 m0, -1 1070; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1071; CI-NEXT: ds_read2_b32 
v[2:3], v2 offset1:1 1072; CI-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 1073; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 1074; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 1075; CI-NEXT: s_waitcnt lgkmcnt(0) 1076; CI-NEXT: v_add_f32_e32 v0, v0, v1 1077; CI-NEXT: v_add_f32_e32 v0, v0, v2 1078; CI-NEXT: v_add_f32_e32 v0, v0, v3 1079; CI-NEXT: v_add_f32_e32 v0, v0, v4 1080; CI-NEXT: v_add_f32_e32 v0, v0, v5 1081; CI-NEXT: v_add_f32_e32 v0, v0, v6 1082; CI-NEXT: v_add_f32_e32 v0, v0, v7 1083; CI-NEXT: v_add_f32_e32 v0, v0, v8 1084; CI-NEXT: s_mov_b32 s3, 0xf000 1085; CI-NEXT: s_mov_b32 s2, -1 1086; CI-NEXT: v_add_f32_e32 v0, v0, v9 1087; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1088; CI-NEXT: s_endpgm 1089; 1090; GFX9-LABEL: sgemm_inner_loop_read2_sequence: 1091; GFX9: ; %bb.0: 1092; GFX9-NEXT: s_lshl_b32 s2, s8, 2 1093; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 1094; GFX9-NEXT: s_addk_i32 s2, 0xc60 1095; GFX9-NEXT: v_mov_b32_e32 v0, s3 1096; GFX9-NEXT: v_mov_b32_e32 v2, s2 1097; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v1 1098; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1099; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 1100; GFX9-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 1101; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 1102; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 1103; GFX9-NEXT: s_waitcnt lgkmcnt(4) 1104; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 1105; GFX9-NEXT: s_waitcnt lgkmcnt(3) 1106; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 1107; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 1108; GFX9-NEXT: s_waitcnt lgkmcnt(2) 1109; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 1110; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1111; GFX9-NEXT: v_add_f32_e32 v0, v0, v5 1112; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1113; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 1114; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 1115; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 1116; GFX9-NEXT: v_mov_b32_e32 v10, 0 1117; GFX9-NEXT: v_add_f32_e32 v0, v0, v9 1118; GFX9-NEXT: global_store_dword v10, v0, s[0:1] 
1119; GFX9-NEXT: s_endpgm 1120 %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 1121 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 1122 %arrayidx44 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %x.i 1123 %tmp16 = load float, ptr addrspace(3) %arrayidx44, align 4 1124 %add47 = add nsw i32 %x.i, 1 1125 %arrayidx48 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add47 1126 %tmp17 = load float, ptr addrspace(3) %arrayidx48, align 4 1127 %add51 = add nsw i32 %x.i, 16 1128 %arrayidx52 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add51 1129 %tmp18 = load float, ptr addrspace(3) %arrayidx52, align 4 1130 %add55 = add nsw i32 %x.i, 17 1131 %arrayidx56 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add55 1132 %tmp19 = load float, ptr addrspace(3) %arrayidx56, align 4 1133 %arrayidx60 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %y.i 1134 %tmp20 = load float, ptr addrspace(3) %arrayidx60, align 4 1135 %add63 = add nsw i32 %y.i, 1 1136 %arrayidx64 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add63 1137 %tmp21 = load float, ptr addrspace(3) %arrayidx64, align 4 1138 %add67 = add nsw i32 %y.i, 32 1139 %arrayidx68 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add67 1140 %tmp22 = load float, ptr addrspace(3) %arrayidx68, align 4 1141 %add71 = add nsw i32 %y.i, 33 1142 %arrayidx72 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add71 1143 %tmp23 = load float, ptr addrspace(3) %arrayidx72, align 4 1144 %add75 = add nsw i32 %y.i, 64 1145 %arrayidx76 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add75 1146 %tmp24 = load float, ptr addrspace(3) %arrayidx76, align 4 1147 %add79 = add nsw i32 %y.i, 65 1148 %arrayidx80 = getelementptr inbounds [776 x float], ptr addrspace(3) 
@sgemm.lB, i32 0, i32 %add79 1149 %tmp25 = load float, ptr addrspace(3) %arrayidx80, align 4 1150 %sum.0 = fadd float %tmp16, %tmp17 1151 %sum.1 = fadd float %sum.0, %tmp18 1152 %sum.2 = fadd float %sum.1, %tmp19 1153 %sum.3 = fadd float %sum.2, %tmp20 1154 %sum.4 = fadd float %sum.3, %tmp21 1155 %sum.5 = fadd float %sum.4, %tmp22 1156 %sum.6 = fadd float %sum.5, %tmp23 1157 %sum.7 = fadd float %sum.6, %tmp24 1158 %sum.8 = fadd float %sum.7, %tmp25 1159 store float %sum.8, ptr addrspace(1) %C, align 4 1160 ret void 1161} 1162 1163define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { 1164; CI-LABEL: misaligned_read2_v2i32: 1165; CI: ; %bb.0: 1166; CI-NEXT: s_load_dword s2, s[4:5], 0x2 1167; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1168; CI-NEXT: s_mov_b32 m0, -1 1169; CI-NEXT: s_mov_b32 s3, 0xf000 1170; CI-NEXT: s_waitcnt lgkmcnt(0) 1171; CI-NEXT: v_mov_b32_e32 v0, s2 1172; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1173; CI-NEXT: s_mov_b32 s2, -1 1174; CI-NEXT: s_waitcnt lgkmcnt(0) 1175; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1176; CI-NEXT: s_endpgm 1177; 1178; GFX9-LABEL: misaligned_read2_v2i32: 1179; GFX9: ; %bb.0: 1180; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 1181; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1182; GFX9-NEXT: v_mov_b32_e32 v2, 0 1183; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1184; GFX9-NEXT: v_mov_b32_e32 v0, s2 1185; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1186; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1187; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1188; GFX9-NEXT: s_endpgm 1189 %load = load <2 x i32>, ptr addrspace(3) %in, align 4 1190 store <2 x i32> %load, ptr addrspace(1) %out, align 8 1191 ret void 1192} 1193 1194define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { 1195; CI-LABEL: misaligned_read2_i64: 1196; CI: ; %bb.0: 1197; CI-NEXT: s_load_dword s2, s[4:5], 0x2 1198; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1199; CI-NEXT: 
s_mov_b32 m0, -1 1200; CI-NEXT: s_mov_b32 s3, 0xf000 1201; CI-NEXT: s_waitcnt lgkmcnt(0) 1202; CI-NEXT: v_mov_b32_e32 v0, s2 1203; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1204; CI-NEXT: s_mov_b32 s2, -1 1205; CI-NEXT: s_waitcnt lgkmcnt(0) 1206; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1207; CI-NEXT: s_endpgm 1208; 1209; GFX9-LABEL: misaligned_read2_i64: 1210; GFX9: ; %bb.0: 1211; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 1212; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1213; GFX9-NEXT: v_mov_b32_e32 v2, 0 1214; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1215; GFX9-NEXT: v_mov_b32_e32 v0, s2 1216; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 1217; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1218; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1219; GFX9-NEXT: s_endpgm 1220 %load = load i64, ptr addrspace(3) %in, align 4 1221 store i64 %load, ptr addrspace(1) %out, align 8 1222 ret void 1223} 1224 1225define amdgpu_kernel void @ds_read_diff_base_interleaving( 1226; CI-LABEL: ds_read_diff_base_interleaving: 1227; CI: ; %bb.0: ; %bb 1228; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 1229; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1230; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1231; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1232; CI-NEXT: s_mov_b32 m0, -1 1233; CI-NEXT: s_waitcnt lgkmcnt(0) 1234; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v1 1235; CI-NEXT: v_add_i32_e32 v3, vcc, s1, v0 1236; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v1 1237; CI-NEXT: v_add_i32_e32 v6, vcc, s3, v0 1238; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 1239; CI-NEXT: ds_read2_b32 v[2:3], v3 offset1:4 1240; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 1241; CI-NEXT: ds_read2_b32 v[6:7], v6 offset1:4 1242; CI-NEXT: s_mov_b32 s7, 0xf000 1243; CI-NEXT: s_mov_b32 s6, -1 1244; CI-NEXT: s_waitcnt lgkmcnt(2) 1245; CI-NEXT: v_mul_f32_e32 v0, v0, v2 1246; CI-NEXT: v_add_f32_e32 v0, 2.0, v0 1247; CI-NEXT: s_waitcnt lgkmcnt(0) 1248; CI-NEXT: v_mul_f32_e32 v2, v4, v6 1249; CI-NEXT: v_sub_f32_e32 v0, v0, v2 1250; CI-NEXT: 
v_mul_f32_e32 v1, v1, v3 1251; CI-NEXT: v_sub_f32_e32 v0, v0, v1 1252; CI-NEXT: v_mul_f32_e32 v1, v5, v7 1253; CI-NEXT: v_sub_f32_e32 v0, v0, v1 1254; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:40 1255; CI-NEXT: s_endpgm 1256; 1257; GFX9-LABEL: ds_read_diff_base_interleaving: 1258; GFX9: ; %bb.0: ; %bb 1259; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 1260; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1261; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1262; GFX9-NEXT: v_mov_b32_e32 v8, 0 1263; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1264; GFX9-NEXT: v_add_u32_e32 v2, s0, v1 1265; GFX9-NEXT: v_add_u32_e32 v3, s1, v0 1266; GFX9-NEXT: v_add_u32_e32 v4, s2, v1 1267; GFX9-NEXT: v_add_u32_e32 v6, s3, v0 1268; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 1269; GFX9-NEXT: ds_read2_b32 v[2:3], v3 offset1:4 1270; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 1271; GFX9-NEXT: ds_read2_b32 v[6:7], v6 offset1:4 1272; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1273; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1274; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 1275; GFX9-NEXT: v_add_f32_e32 v0, 2.0, v0 1276; GFX9-NEXT: v_mul_f32_e32 v2, v4, v6 1277; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 1278; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 1279; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 1280; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 1281; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 1282; GFX9-NEXT: global_store_dword v8, v0, s[0:1] offset:40 1283; GFX9-NEXT: s_endpgm 1284 ptr addrspace(1) nocapture %arg, 1285 ptr addrspace(3) %arg1, 1286 ptr addrspace(3) %arg2, 1287 ptr addrspace(3) %arg3, 1288 ptr addrspace(3) %arg4) #0 { ; uses #0 (nounwind) rather than the readnone group #1: this kernel loads from LDS and stores to %arg 1289bb: 1290 %tmp = getelementptr float, ptr addrspace(1) %arg, i64 10 1291 %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.x() #2 1292 %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 1293 %tmp7 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg1, i32 0, i32 %tmp6, i32 0 1294 %tmp8 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg2, i32 0, i32 0, i32 %tmp5 1295 %tmp9 = getelementptr [4 x [4 x float]], 
ptr addrspace(3) %arg3, i32 0, i32 %tmp6, i32 0 1296 %tmp10 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg4, i32 0, i32 0, i32 %tmp5 1297 %tmp11 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg1, i32 0, i32 %tmp6, i32 1 1298 %tmp12 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg2, i32 0, i32 1, i32 %tmp5 1299 %tmp13 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg3, i32 0, i32 %tmp6, i32 1 1300 %tmp14 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg4, i32 0, i32 1, i32 %tmp5 1301 %tmp15 = load float, ptr addrspace(3) %tmp7 1302 %tmp16 = load float, ptr addrspace(3) %tmp8 1303 %tmp17 = fmul float %tmp15, %tmp16 1304 %tmp18 = fadd float 2.000000e+00, %tmp17 1305 %tmp19 = load float, ptr addrspace(3) %tmp9 1306 %tmp20 = load float, ptr addrspace(3) %tmp10 1307 %tmp21 = fmul float %tmp19, %tmp20 1308 %tmp22 = fsub float %tmp18, %tmp21 1309 %tmp23 = load float, ptr addrspace(3) %tmp11 1310 %tmp24 = load float, ptr addrspace(3) %tmp12 1311 %tmp25 = fmul float %tmp23, %tmp24 1312 %tmp26 = fsub float %tmp22, %tmp25 1313 %tmp27 = load float, ptr addrspace(3) %tmp13 1314 %tmp28 = load float, ptr addrspace(3) %tmp14 1315 %tmp29 = fmul float %tmp27, %tmp28 1316 %tmp30 = fsub float %tmp26, %tmp29 1317 store float %tmp30, ptr addrspace(1) %tmp 1318 ret void 1319} 1320 1321define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspace(3) %arg) { 1322; CI-LABEL: ds_read_call_read: 1323; CI: ; %bb.0: 1324; CI-NEXT: s_getpc_b64 s[40:41] 1325; CI-NEXT: s_mov_b32 s40, s0 1326; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0 1327; CI-NEXT: s_mov_b32 s14, s10 1328; CI-NEXT: v_lshlrev_b32_e32 v3, 2, v0 1329; CI-NEXT: s_mov_b32 m0, -1 1330; CI-NEXT: s_mov_b32 s12, s8 1331; CI-NEXT: s_waitcnt lgkmcnt(0) 1332; CI-NEXT: s_add_u32 s40, s40, s11 1333; CI-NEXT: s_mov_b64 s[10:11], s[6:7] 1334; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 1335; CI-NEXT: s_load_dword s6, s[4:5], 0x2 1336; CI-NEXT: s_addc_u32 s41, s41, 0 1337; CI-NEXT: 
s_add_u32 s8, s4, 12 1338; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 1339; CI-NEXT: s_mov_b32 s13, s9 1340; CI-NEXT: s_waitcnt lgkmcnt(0) 1341; CI-NEXT: v_add_i32_e32 v40, vcc, s6, v3 1342; CI-NEXT: ds_read_b32 v41, v40 1343; CI-NEXT: s_addc_u32 s9, s5, 0 1344; CI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 1345; CI-NEXT: v_or_b32_e32 v0, v0, v1 1346; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 1347; CI-NEXT: s_mov_b64 s[6:7], s[2:3] 1348; CI-NEXT: s_mov_b64 s[0:1], s[40:41] 1349; CI-NEXT: s_mov_b32 s17, void_func_void@abs32@hi 1350; CI-NEXT: s_mov_b32 s16, void_func_void@abs32@lo 1351; CI-NEXT: v_or_b32_e32 v31, v0, v2 1352; CI-NEXT: s_mov_b64 s[2:3], s[42:43] 1353; CI-NEXT: s_mov_b32 s32, 0 1354; CI-NEXT: s_mov_b32 s39, 0xf000 1355; CI-NEXT: s_mov_b32 s38, -1 1356; CI-NEXT: s_swappc_b64 s[30:31], s[16:17] 1357; CI-NEXT: ds_read_b32 v0, v40 offset:4 1358; CI-NEXT: s_waitcnt lgkmcnt(0) 1359; CI-NEXT: v_add_i32_e32 v0, vcc, v41, v0 1360; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 1361; CI-NEXT: s_endpgm 1362; 1363; GFX9-LABEL: ds_read_call_read: 1364; GFX9: ; %bb.0: 1365; GFX9-NEXT: s_getpc_b64 s[36:37] 1366; GFX9-NEXT: s_mov_b32 s36, s0 1367; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0 1368; GFX9-NEXT: s_mov_b32 s14, s10 1369; GFX9-NEXT: s_mov_b32 s12, s8 1370; GFX9-NEXT: s_mov_b32 s13, s9 1371; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 1372; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1373; GFX9-NEXT: s_add_u32 s36, s36, s11 1374; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] 1375; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 1376; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 1377; GFX9-NEXT: s_addc_u32 s37, s37, 0 1378; GFX9-NEXT: s_add_u32 s8, s4, 12 1379; GFX9-NEXT: s_addc_u32 s9, s5, 0 1380; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1381; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s6 1382; GFX9-NEXT: ds_read_b32 v42, v41 1383; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 1384; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] 1385; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] 1386; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] 1387; GFX9-NEXT: 
s_mov_b32 s17, void_func_void@abs32@hi 1388; GFX9-NEXT: s_mov_b32 s16, void_func_void@abs32@lo 1389; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 1390; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] 1391; GFX9-NEXT: s_mov_b32 s32, 0 1392; GFX9-NEXT: v_mov_b32_e32 v40, 0 1393; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] 1394; GFX9-NEXT: ds_read_b32 v0, v41 offset:4 1395; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1396; GFX9-NEXT: v_add_u32_e32 v0, v42, v0 1397; GFX9-NEXT: global_store_dword v40, v0, s[34:35] 1398; GFX9-NEXT: s_endpgm 1399 %x = call i32 @llvm.amdgcn.workitem.id.x() 1400 %arrayidx0 = getelementptr i32, ptr addrspace(3) %arg, i32 %x 1401 %arrayidx1 = getelementptr i32, ptr addrspace(3) %arrayidx0, i32 1 1402 %v0 = load i32, ptr addrspace(3) %arrayidx0, align 4 1403 call void @void_func_void() 1404 %v1 = load i32, ptr addrspace(3) %arrayidx1, align 4 1405 %r = add i32 %v0, %v1 1406 store i32 %r, ptr addrspace(1) %out, align 4 1407 ret void 1408} 1409 1410define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, ptr addrspace(3) %inptr) { 1411; CI-LABEL: ds_read_interp_read: 1412; CI: ; %bb.0: 1413; CI-NEXT: s_mov_b32 m0, -1 1414; CI-NEXT: ds_read_b32 v2, v0 1415; CI-NEXT: s_mov_b32 m0, s0 1416; CI-NEXT: v_interp_mov_f32 v1, p10, attr0.x 1417; CI-NEXT: s_mov_b32 m0, -1 1418; CI-NEXT: ds_read_b32 v0, v0 offset:16 1419; CI-NEXT: s_waitcnt lgkmcnt(0) 1420; CI-NEXT: v_add_f32_e32 v1, v0, v1 1421; CI-NEXT: v_mov_b32_e32 v0, v2 1422; CI-NEXT: ; return to shader part epilog 1423; 1424; GFX9-LABEL: ds_read_interp_read: 1425; GFX9: ; %bb.0: 1426; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:4 1427; GFX9-NEXT: s_mov_b32 m0, s0 1428; GFX9-NEXT: s_nop 0 1429; GFX9-NEXT: v_interp_mov_f32_e32 v2, p10, attr0.x 1430; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1431; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 1432; GFX9-NEXT: ; return to shader part epilog 1433 %v0 = load float, ptr addrspace(3) %inptr, align 4 1434 %intrp = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 %prims) 1435 %ptr1 = 
getelementptr float, ptr addrspace(3) %inptr, i32 4 1436 %v1 = load float, ptr addrspace(3) %ptr1, align 4 1437 %v1b = fadd float %v1, %intrp 1438 %r0 = insertelement <2 x float> undef, float %v0, i32 0 1439 %r1 = insertelement <2 x float> %r0, float %v1b, i32 1 1440 ret <2 x float> %r1 1441} 1442 1443@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 1444 1445define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) { 1446; CI-LABEL: read2_v2i32_align1_odd_offset: 1447; CI: ; %bb.0: ; %entry 1448; CI-NEXT: v_mov_b32_e32 v0, 0 1449; CI-NEXT: s_mov_b32 m0, -1 1450; CI-NEXT: ds_read_u8 v1, v0 offset:70 1451; CI-NEXT: ds_read_u8 v2, v0 offset:72 1452; CI-NEXT: ds_read_u8 v3, v0 offset:71 1453; CI-NEXT: ds_read_u8 v4, v0 offset:69 1454; CI-NEXT: ds_read_u8 v5, v0 offset:68 1455; CI-NEXT: s_waitcnt lgkmcnt(4) 1456; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1457; CI-NEXT: s_waitcnt lgkmcnt(3) 1458; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 1459; CI-NEXT: s_waitcnt lgkmcnt(2) 1460; CI-NEXT: v_or_b32_e32 v2, v2, v3 1461; CI-NEXT: s_waitcnt lgkmcnt(1) 1462; CI-NEXT: v_or_b32_e32 v1, v1, v4 1463; CI-NEXT: ds_read_u8 v4, v0 offset:67 1464; CI-NEXT: ds_read_u8 v6, v0 offset:66 1465; CI-NEXT: ds_read_u8 v0, v0 offset:65 1466; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1467; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1468; CI-NEXT: v_or_b32_e32 v1, v2, v1 1469; CI-NEXT: s_waitcnt lgkmcnt(0) 1470; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 1471; CI-NEXT: v_or_b32_e32 v0, v2, v0 1472; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 1473; CI-NEXT: v_or_b32_e32 v2, v2, v4 1474; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1475; CI-NEXT: s_mov_b32 s3, 0xf000 1476; CI-NEXT: s_mov_b32 s2, -1 1477; CI-NEXT: v_or_b32_e32 v0, v2, v0 1478; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1479; CI-NEXT: s_endpgm 1480; 1481; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset: 1482; GFX9-ALIGNED: ; %bb.0: ; %entry 1483; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0 1484; 
GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:70 1485; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:65 1486; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:66 1487; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:67 1488; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:68 1489; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:69 1490; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:72 1491; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:71 1492; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(7) 1493; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1494; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1495; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 1496; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 1497; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v7 1498; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1499; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v0 1500; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v4 1501; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v3 1502; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v6 1503; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1504; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 1505; GFX9-ALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1506; GFX9-ALIGNED-NEXT: s_endpgm 1507; 1508; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset: 1509; GFX9-UNALIGNED: ; %bb.0: ; %entry 1510; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0 1511; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1512; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v2 offset:65 1513; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 1514; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1515; GFX9-UNALIGNED-NEXT: s_endpgm 1516entry: 1517 %load = load <2 x i32>, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @v2i32_align1, i32 65), align 1 1518 store <2 x i32> %load, ptr addrspace(1) %out 1519 ret void 1520} 1521 1522declare void @void_func_void() #3 1523 1524declare i32 
@llvm.amdgcn.workgroup.id.x() #1 1525declare i32 @llvm.amdgcn.workgroup.id.y() #1 1526declare i32 @llvm.amdgcn.workitem.id.x() #1 1527declare i32 @llvm.amdgcn.workitem.id.y() #1 1528 1529declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) nounwind readnone 1530 1531declare void @llvm.amdgcn.s.barrier() #2 1532 1533attributes #0 = { nounwind } 1534attributes #1 = { nounwind readnone speculatable } 1535attributes #2 = { convergent nounwind } 1536attributes #3 = { nounwind noinline } 1537 1538!llvm.module.flags = !{!0} 1539!0 = !{i32 1, !"amdhsa_code_object_version", i32 500} 1540