; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s

; Each kernel below computes an address with a constant (or register) offset in
; the entry block and uses it in a conditionally executed block. The checks on
; the codegenprepare output verify whether the address computation is kept in
; the entry block or re-materialized next to its memory user, and the checks on
; the llc output verify how the offset is encoded in the selected instruction,
; for the tahiti, bonaire, tonga and gfx900 subtargets.

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

; OPT-LABEL: @test_sink_global_small_offset_i32(
; OPT-CI-NOT: getelementptr i32, ptr addrspace(1) %in
; OPT-VI: getelementptr i32, ptr addrspace(1) %in
; OPT: br i1
; OPT-CI: getelementptr i8,

; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
define amdgpu_kernel void @test_sink_global_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
  %in.gep = getelementptr i32, ptr addrspace(1) %in, i64 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, ptr addrspace(1) %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset(
; OPT: %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 65535
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}

; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0xf000{{$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
; GCN: {{^}}.LBB1_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 99999
  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 65535
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, ptr addrspace(1) %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, [[ZERO]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
; GCN: {{^}}.LBB2_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 1024
  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 4095
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, ptr addrspace(1) %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0x1000{{$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]$}}
; GCN: {{^}}.LBB3_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 99999
  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 4096
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, ptr addrspace(1) %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_scratch_small_offset_i32(
; OPT-NOT: getelementptr [512 x i32]
; OPT: br i1
; OPT: getelementptr i8,

; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
; GCN: s_and_saveexec_b64
; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4088{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4088 glc{{$}}
; GCN: {{^}}.LBB4_2:
define amdgpu_kernel void @test_sink_scratch_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
entry:
  %alloca = alloca [512 x i32], align 4, addrspace(5)
  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i64 999998
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i64 999999
  %add.arg = add i32 %arg, 8
  %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1022
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  store volatile i32 123, ptr addrspace(5) %alloca.gep
  %tmp1 = load volatile i32, ptr addrspace(5) %alloca.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep.0
  %load = load volatile i32, ptr addrspace(5) %alloca.gep
  store i32 %load, ptr addrspace(1) %out.gep.1
  br label %done

done:
  ret void
}

; This used to be a special case when the scavenge slot was
; fixed at offset 0.
; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
; OPT-NOT: getelementptr [512 x i32]
; OPT: br i1
; OPT: getelementptr i8,

; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
; GCN: s_and_saveexec_b64
; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092 glc{{$}}
; GCN: {{^.LBB[0-9]+}}_2:

define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
entry:
  %alloca = alloca [512 x i32], align 4, addrspace(5)
  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i64 999998
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i64 999999
  %add.arg = add i32 %arg, 8
  %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1023
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  store volatile i32 123, ptr addrspace(5) %alloca.gep
  %tmp1 = load volatile i32, ptr addrspace(5) %alloca.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep.0
  %load = load volatile i32, ptr addrspace(5) %alloca.gep
  store i32 %load, ptr addrspace(1) %out.gep.1
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_no_sink_scratch_large_offset_i32(
; OPT: %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1024
; OPT: br i1
; OPT-NOT: ptrtoint

; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
; GCN: s_and_saveexec_b64
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc{{$}}
; GCN: {{^.LBB[0-9]+}}_2:
define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
entry:
  %alloca = alloca [512 x i32], align 4, addrspace(5)
  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i64 999998
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i64 999999
  %add.arg = add i32 %arg, 8
  %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1024
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  store volatile i32 123, ptr addrspace(5) %alloca.gep
  %tmp1 = load volatile i32, ptr addrspace(5) %alloca.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep.0
  %load = load volatile i32, ptr addrspace(5) %alloca.gep
  store i32 %load, ptr addrspace(1) %out.gep.1
  br label %done

done:
  ret void
}

; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
; GCN: s_and_saveexec_b64
; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN: {{^.LBB[0-9]+}}_2:
define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %offset) {
entry:
  %offset.ext = zext i32 %offset to i64
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
  %in.gep = getelementptr i32, ptr addrspace(1) %in, i64 %offset.ext
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, ptr addrspace(1) %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_constant_small_offset_i32
; OPT-NOT: getelementptr i32, ptr addrspace(4)
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, ptr addrspace(4) %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32
; OPT-NOT: getelementptr i32, ptr addrspace(4)
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 255
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, ptr addrspace(4) %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32
; OPT-SI: getelementptr i32, ptr addrspace(4)
; OPT-CI-NOT: getelementptr i32, ptr addrspace(4)
; OPT-VI-NOT: getelementptr i32, ptr addrspace(4)
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
; GCN: s_and_saveexec_b64
; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x400

; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 256
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, ptr addrspace(4) %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32
; OPT-SI: getelementptr i32, ptr addrspace(4)
; OPT-CI-NOT: getelementptr i32, ptr addrspace(4)
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
; GCN: s_and_saveexec_b64
; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}}
; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}

; VI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, -4{{$}}
; VI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}

; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffffff{{$}}

; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 4294967295
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, ptr addrspace(4) %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32
; OPT: getelementptr i32, ptr addrspace(4)
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
; GCN: s_and_saveexec_b64
; GCN: s_add_u32
; GCN: s_addc_u32
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 17179869181
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, ptr addrspace(4) %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_i32:
; GCN: s_and_saveexec_b64
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}

; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x3ffff{{$}}
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}

; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 262143
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, ptr addrspace(4) %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32
; OPT-SI: getelementptr i32, ptr addrspace(4)
; OPT-CI-NOT: getelementptr i32, ptr addrspace(4)
; OPT-VI: getelementptr i32, ptr addrspace(4)
; OPT: br i1

; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
; GCN: s_and_saveexec_b64
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}

; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x40000{{$}}

; VI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}

; GCN: s_or_b64 exec, exec
define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 262144
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, ptr addrspace(4) %in.gep
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

%struct.foo = type { [3 x float], [3 x float] }

; OPT-LABEL: @sink_ds_address(
; OPT: getelementptr inbounds i8,

; GCN-LABEL: {{^}}sink_ds_address:
; GCN: s_load_dword [[SREG1:s[0-9]+]],
; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
define amdgpu_kernel void @sink_ds_address(ptr addrspace(3) nocapture %ptr) nounwind {
entry:
  %x = getelementptr inbounds %struct.foo, ptr addrspace(3) %ptr, i32 0, i32 1, i32 0
  %y = getelementptr inbounds %struct.foo, ptr addrspace(3) %ptr, i32 0, i32 1, i32 2
  br label %bb32

bb32:
  %a = load float, ptr addrspace(3) %x, align 4
  %b = load float, ptr addrspace(3) %y, align 4
  %cmp = fcmp one float %a, %b
  br i1 %cmp, label %bb34, label %bb33

bb33:
  unreachable

bb34:
  unreachable
}

; Address offset is not a multiple of 4. This is a valid mubuf offset,
; but not smrd.

; OPT-LABEL: @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(
; OPT: br i1 %tmp0,
; OPT: if:
; OPT: getelementptr i8, {{.*}} 4095
define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(ptr addrspace(1) %out, ptr addrspace(4) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 1024
  %in.gep = getelementptr i8, ptr addrspace(4) %in, i64 4095
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i32, ptr addrspace(4) %in.gep, align 1
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_local_small_offset_atomicrmw_i32(
; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
; OPT: %tmp1 = atomicrmw add ptr addrspace(3) %sunkaddr, i32 2 seq_cst
define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = atomicrmw add ptr addrspace(3) %in.gep, i32 2 seq_cst
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(3) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_local_small_offset_cmpxchg_i32(
; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
; OPT: %tmp1.struct = cmpxchg ptr addrspace(3) %sunkaddr, i32 undef, i32 2 seq_cst monotonic
define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1.struct = cmpxchg ptr addrspace(3) %in.gep, i32 undef, i32 2 seq_cst monotonic
  %tmp1 = extractvalue { i32, i1 } %tmp1.struct, 0
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(3) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_wrong_operand_local_small_offset_cmpxchg_i32(
; OPT: %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
; OPT: br i1
; OPT: cmpxchg ptr addrspace(3) undef, ptr addrspace(3) %in.gep, ptr addrspace(3) undef seq_cst monotonic
define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
entry:
  %out.gep = getelementptr ptr addrspace(3), ptr addrspace(3) %out, i32 999999
  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1.struct = cmpxchg ptr addrspace(3) undef, ptr addrspace(3) %in.gep, ptr addrspace(3) undef seq_cst monotonic
  %tmp1 = extractvalue { ptr addrspace(3), i1 } %tmp1.struct, 0
  br label %endif

endif:
  %x = phi ptr addrspace(3) [ %tmp1, %if ], [ null, %entry ]
  store ptr addrspace(3) %x, ptr addrspace(3) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
; OPT-SICIVI: %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4096
; OPT-SICIVI: br
; OPT-SICIVI: %tmp1 = load i8, ptr addrspace(1) %in.gep

; OPT-GFX9: br
; OPT-GFX9: %sunkaddr = getelementptr i8, ptr addrspace(1) %in, i64 -4096
; OPT-GFX9: load i8, ptr addrspace(1) %sunkaddr

; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GFX9: global_load_sbyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:-4096{{$}}
define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 1024
  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4096
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, ptr addrspace(1) %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset(
; OPT: %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4097
; OPT: br
; OPT: load i8, ptr addrspace(1) %in.gep

; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset:
define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 99999
  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4097
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = load i8, ptr addrspace(1) %in.gep
  %tmp2 = sext i8 %tmp1 to i32
  br label %endif

endif:
  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_small_offset_ds_append(
; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %sunkaddr, i1 false)
define amdgpu_kernel void @test_sink_small_offset_ds_append(ptr addrspace(3) %out, ptr addrspace(3) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %in.gep, i1 false)
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(3) %out.gep
  br label %done

done:
  ret void
}

; OPT-LABEL: @test_sink_small_offset_ds_consume(
; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %sunkaddr, i1 false)
define amdgpu_kernel void @test_sink_small_offset_ds_consume(ptr addrspace(3) %out, ptr addrspace(3) %in) {
entry:
  %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp0 = icmp eq i32 %tid, 0
  br i1 %tmp0, label %endif, label %if

if:
  %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %in.gep, i1 false)
  br label %endif

endif:
  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
  store i32 %x, ptr addrspace(3) %out.gep
  br label %done

done:
  ret void
}

declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
declare i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) nocapture, i1 immarg) #3
declare i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) nocapture, i1 immarg) #3

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind argmemonly }
attributes #3 = { argmemonly convergent nounwind willreturn }