; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8


; GCN-LABEL: @simple_read2st64_f32_0_1
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read2st64_b32 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], v{{[0-9]+}} offset1:1
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; CI: buffer_store_dword [[RESULT]]
; GFX9: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @simple_read2st64_f32_0_1(ptr addrspace(1) %out) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 64
  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; GCN-LABEL: @simple_read2st64_f32_1_2
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read2st64_b32 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], v{{[0-9]+}} offset0:1 offset1:2
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; CI: buffer_store_dword [[RESULT]]
; GFX9: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @simple_read2st64_f32_1_2(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x.0
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 128
  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x.1
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; GCN-LABEL: @simple_read2st64_f32_max_offset
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read2st64_b32 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], v{{[0-9]+}} offset0:1 offset1:255
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; CI: buffer_store_dword [[RESULT]]
; GFX9: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @simple_read2st64_f32_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x.0
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 16320
  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x.1
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}
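
; The st64 forms step in units of 64 elements per offset field, and each
; offset field is 8 bits wide, so offset1:255 in the max_offset test above
; reaches element index 255 * 64 = 16320. The next test uses index 16384
; (byte offset 16384 * 4 = 0x10000), which is out of reach, so the loads
; are not merged and the second address is formed with an explicit add.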

; GCN-LABEL: @simple_read2st64_f32_over_max_offset
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: ds_read2st64_b32
; GCN-DAG: v_add_{{i|u}}32_e32 [[BIGADD:v[0-9]+]], {{(vcc, )?}}0x10000, {{v[0-9]+}}
; GCN-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
; GCN: s_endpgm
define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x.0
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 16384
  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x.1
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; GCN-LABEL: @odd_invalid_read2st64_f32_0
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: ds_read2st64_b32
; GCN: s_endpgm
define amdgpu_kernel void @odd_invalid_read2st64_f32_0(ptr addrspace(1) %out) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 63
  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; GCN-LABEL: @odd_invalid_read2st64_f32_1
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: ds_read2st64_b32
; GCN: s_endpgm
define amdgpu_kernel void @odd_invalid_read2st64_f32_1(ptr addrspace(1) %out) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x.0
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 127
  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x.1
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
  store float %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; GCN-LABEL: @simple_read2st64_f64_0_1
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read2st64_b64 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], v{{[0-9]+}} offset1:1
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]]
; CI: buffer_store_dwordx2 [[RESULT]]
; GFX9: global_store_dwordx2 v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @simple_read2st64_f64_0_1(ptr addrspace(1) %out) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 64
  %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
  store double %sum, ptr addrspace(1) %out.gep, align 8
  ret void
}
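
; For the b64 form each offset step covers 64 * 8 = 512 bytes, so the
; offset0:1 offset1:2 pair checked in the next test corresponds to the
; element indices 64 and 128 used in its IR.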

; GCN-LABEL: @simple_read2st64_f64_1_2
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read2st64_b64 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], v{{[0-9]+}} offset0:1 offset1:2
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]]

; CI: buffer_store_dwordx2 [[RESULT]]
; GFX9: global_store_dwordx2 v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @simple_read2st64_f64_1_2(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.0
  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 128
  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.1
  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
  store double %sum, ptr addrspace(1) %out.gep, align 8
  ret void
}

; Alignment only: the f64 loads are only 4-byte aligned, so they are split
; into ds_read2_b32 pairs rather than using the st64 form.

; GCN-LABEL: @misaligned_read2st64_f64
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
; GCN: s_endpgm
define amdgpu_kernel void @misaligned_read2st64_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
  %val0 = load double, ptr addrspace(3) %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 64
  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x
  %val1 = load double, ptr addrspace(3) %arrayidx1, align 4
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
  store double %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff; the
; largest usable offset1 for the b64 form is 127 (127 * 64 * 8 = 0xfe00).
; GCN-LABEL: @simple_read2st64_f64_max_offset
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read2st64_b64 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], v{{[0-9]+}} offset0:4 offset1:127
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]]

; CI: buffer_store_dwordx2 [[RESULT]]
; GFX9: global_store_dwordx2 v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @simple_read2st64_f64_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 256
  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.0
  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 8128
  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.1
  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
  store double %sum, ptr addrspace(1) %out.gep, align 8
  ret void
}
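
; In the over-max test below, element index 8192 is a byte offset of
; 8192 * 8 = 0x10000, one past what the 16-bit DS offset can encode, so a
; plain ds_read_b64 plus an explicit add is expected instead of the
; merged st64 form.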

; GCN-LABEL: @simple_read2st64_f64_over_max_offset
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: ds_read2st64_b64
; GCN-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
; GCN-DAG: v_add_{{i|u}}32_e32 [[BIGADD:v[0-9]+]], {{(vcc, )?}}0x10000, {{v[0-9]+}}
; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
; GCN: s_endpgm
define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.0
  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 8192
  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.1
  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
  store double %sum, ptr addrspace(1) %out.gep, align 8
  ret void
}

; GCN-LABEL: @invalid_read2st64_f64_odd_offset
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: ds_read2st64_b64
; GCN: s_endpgm
define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.0
  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 8129
  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.1
  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
  store double %sum, ptr addrspace(1) %out.gep, align 8
  ret void
}

; The stride of 8 elements is 8 * 8 = 64 bytes. Only the stride in bytes is
; divisible by 64; the stride in elements must also be a multiple of 64 for
; read2st64 to apply, so a plain ds_read2_b64 is used instead.

; GCN-LABEL: @byte_size_only_divisible_64_read2_f64
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: ds_read2st64_b64
; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
; GCN: s_endpgm
define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x
  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
  store double %sum, ptr addrspace(1) %out.gep, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workitem.id.y() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }