; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn -mcpu=gfx900 -amdgpu-aa -amdgpu-aa-wrapper -amdgpu-annotate-uniform -S < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefix=GCN %s

; Check that a barrier or fence between loads is not considered a clobber
; for the purpose of converting vector loads into scalar loads.

@LDS = linkonce_odr hidden local_unnamed_addr addrspace(3) global i32 undef

; GCN-LABEL: {{^}}simple_barrier:
; GCN: s_load_dword s
; GCN: s_waitcnt lgkmcnt(0)
; GCN: s_barrier
; GCN: s_waitcnt lgkmcnt(0)
; GCN: ; wave barrier
; GCN-NOT: global_load_dword
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @simple_barrier(ptr addrspace(1) %arg) {
; CHECK-LABEL: @simple_barrier(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  tail call void @llvm.amdgcn.wave.barrier()
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
  store i32 %i3, ptr addrspace(1) %i4, align 4
  ret void
}

; GCN-LABEL: {{^}}memory_phi_no_clobber:
; GCN: s_load_dword s
; GCN: s_waitcnt lgkmcnt(0)
; GCN: s_waitcnt lgkmcnt(0)
; GCN: s_barrier
; GCN-NOT: global_load_dword
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @memory_phi_no_clobber(ptr addrspace(1) %arg) {
; CHECK-LABEL: @memory_phi_no_clobber(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
; CHECK:       if.then:
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    br label [[IF_END:%.*]], !amdgpu.uniform !0
; CHECK:       if.else:
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
; CHECK:       if.end:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  br i1 undef, label %if.then, label %if.else

if.then:
  tail call void @llvm.amdgcn.s.barrier()
  br label %if.end

if.else:
  fence syncscope("workgroup") release
  br label %if.end

if.end:
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
  store i32 %i3, ptr addrspace(1) %i4, align 4
  ret void
}

; GCN-LABEL: {{^}}memory_phi_clobber1:
; GCN: s_load_dword s
; GCN: s_barrier
; GCN: global_store_dword
; GCN: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @memory_phi_clobber1(ptr addrspace(1) %arg) {
; CHECK-LABEL: @memory_phi_clobber1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
; CHECK:       if.then:
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[GEP]], align 4
; CHECK-NEXT:    br label [[IF_END:%.*]], !amdgpu.uniform !0
; CHECK:       if.else:
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
; CHECK:       if.end:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  br i1 undef, label %if.then, label %if.else

if.then:
  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
  store i32 1, ptr addrspace(1) %gep, align 4
  br label %if.end

if.else:
  tail call void @llvm.amdgcn.s.barrier()
  br label %if.end

if.end:
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
  store i32 %i3, ptr addrspace(1) %i4, align 4
  ret void
}

; GCN-LABEL: {{^}}memory_phi_clobber2:
; GCN-DAG: s_load_dword s
; GCN-DAG: global_store_dword
; GCN: s_barrier
; GCN: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @memory_phi_clobber2(ptr addrspace(1) %arg) {
; CHECK-LABEL: @memory_phi_clobber2(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
; CHECK:       if.then:
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    br label [[IF_END:%.*]], !amdgpu.uniform !0
; CHECK:       if.else:
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[GEP]], align 4
; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
; CHECK:       if.end:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  br i1 undef, label %if.then, label %if.else

if.then:
  tail call void @llvm.amdgcn.s.barrier()
  br label %if.end

if.else:
  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
  store i32 1, ptr addrspace(1) %gep, align 4
  br label %if.end

if.end:
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
  store i32 %i3, ptr addrspace(1) %i4, align 4
  ret void
}

; GCN-LABEL: {{^}}no_clobbering_loop1:
; GCN: s_load_dword s
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @no_clobbering_loop1(ptr addrspace(1) %arg, i1 %cc) {
; CHECK-LABEL: @no_clobbering_loop1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
; CHECK:       while.cond:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
; CHECK-NEXT:    br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
; CHECK:       end:
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  br label %while.cond

while.cond:
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
  store i32 %i3, ptr addrspace(1) %i4, align 4
  tail call void @llvm.amdgcn.wave.barrier()
  br i1 %cc, label %while.cond, label %end

end:
  ret void
}

; GCN-LABEL: {{^}}no_clobbering_loop2:
; GCN: s_load_dword s
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @no_clobbering_loop2(ptr addrspace(1) noalias %arg, ptr addrspace(1) noalias %out, i32 %n) {
; CHECK-LABEL: @no_clobbering_loop2(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
; CHECK:       while.cond:
; CHECK-NEXT:    [[C:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
; CHECK-NEXT:    [[ACC:%.*]] = phi i32 [ [[I]], [[BB]] ], [ [[I3:%.*]], [[WHILE_COND]] ]
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i32 [[C]], !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I3]] = add i32 [[I2]], [[ACC]]
; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[C]], 1
; CHECK-NEXT:    [[CC:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
; CHECK-NEXT:    br i1 [[CC]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
; CHECK:       end:
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  br label %while.cond

while.cond:
  %c = phi i32 [ 0, %bb ], [ %inc, %while.cond ]
  %acc = phi i32 [ %i, %bb ], [ %i3, %while.cond ]
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %c
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %acc
  tail call void @llvm.amdgcn.wave.barrier()
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, %n
  br i1 %cc, label %while.cond, label %end

end:
  store i32 %i3, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}clobbering_loop:
; GCN: s_load_dword s
; GCN: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @clobbering_loop(ptr addrspace(1) %arg, ptr addrspace(1) %out, i1 %cc) {
; CHECK-LABEL: @clobbering_loop(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
; CHECK:       while.cond:
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT:%.*]], i64 1
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
; CHECK-NEXT:    br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
; CHECK:       end:
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  br label %while.cond

while.cond:
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
  store i32 %i3, ptr addrspace(1) %i4, align 4
  tail call void @llvm.amdgcn.wave.barrier()
  br i1 %cc, label %while.cond, label %end

end:
  ret void
}

; GCN-LABEL: {{^}}clobber_by_atomic_load:
; GCN: s_load_dword s
; GCN: global_load_dword {{.*}} glc
; GCN: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @clobber_by_atomic_load(ptr addrspace(1) %arg) {
; CHECK-LABEL: @clobber_by_atomic_load(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2, !amdgpu.uniform !0
; CHECK-NEXT:    [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[GEP]] seq_cst, align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3, !amdgpu.uniform !0
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 4
; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %i = load i32, ptr addrspace(1) %arg, align 4
  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
  %val = load atomic i32, ptr addrspace(1) %gep seq_cst, align 4
  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
  %i2 = load i32, ptr addrspace(1) %i1, align 4
  %i3 = add i32 %i2, %i
  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 4
  store i32 %i3, ptr addrspace(1) %i4, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_store:
; GCN: ds_write_b32
; GCN: s_barrier
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: @no_alias_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store i32 0, ptr addrspace(3) @LDS, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store i32 0, ptr addrspace(3) @LDS, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}may_alias_store:
; GCN: global_store_dword
; GCN: s_barrier
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @may_alias_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: @may_alias_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store i32 0, ptr addrspace(1) %out, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_volatile_store:
; GCN: ds_write_b32
; GCN: s_barrier
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_volatile_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: @no_alias_volatile_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store volatile i32 0, ptr addrspace(3) @LDS, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store volatile i32 0, ptr addrspace(3) @LDS, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_atomic_rmw_relaxed:
; GCN: ds_add_u32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw_relaxed(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: @no_alias_atomic_rmw_relaxed(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 monotonic, align 4
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 monotonic
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_atomic_cmpxchg:
; GCN: ds_cmpst_b32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_cmpxchg(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %swap) {
; CHECK-LABEL: @no_alias_atomic_cmpxchg(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = cmpxchg ptr addrspace(3) @LDS, i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = cmpxchg ptr addrspace(3) @LDS, i32 7, i32 %swap seq_cst monotonic
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_atomic_rmw:
; GCN: ds_add_u32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: @no_alias_atomic_rmw(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}may_alias_atomic_cmpxchg:
; GCN: global_atomic_cmpswap
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @may_alias_atomic_cmpxchg(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %swap) {
; CHECK-LABEL: @may_alias_atomic_cmpxchg(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = cmpxchg ptr addrspace(1) [[OUT:%.*]], i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = cmpxchg ptr addrspace(1) %out, i32 7, i32 %swap seq_cst monotonic
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}may_alias_atomic_rmw:
; GCN: global_atomic_add
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @may_alias_atomic_rmw(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: @may_alias_atomic_rmw(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(1) [[OUT:%.*]], i32 5 syncscope("agent") seq_cst, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %unused = atomicrmw add ptr addrspace(1) %out, i32 5 syncscope("agent") seq_cst
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_atomic_rmw_then_clobber:
; GCN: global_store_dword
; GCN: global_store_dword
; GCN: ds_add_u32
; GCN: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw_then_clobber(ptr addrspace(1) %in, ptr addrspace(1) %out, ptr addrspace(1) noalias %noalias) {
; CHECK-LABEL: @no_alias_atomic_rmw_then_clobber(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    store i32 2, ptr addrspace(1) [[NOALIAS:%.*]], align 4
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store i32 1, ptr addrspace(1) %out, align 4
  store i32 2, ptr addrspace(1) %noalias, align 4
  %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}no_alias_atomic_rmw_then_no_alias_store:
; GCN: global_store_dword
; GCN: ds_add_u32
; GCN: s_load_dword s
; GCN-NOT: global_load_dword
; GCN: global_store_dword
define protected amdgpu_kernel void @no_alias_atomic_rmw_then_no_alias_store(ptr addrspace(1) %in, ptr addrspace(1) %out, ptr addrspace(1) noalias %noalias) {
; CHECK-LABEL: @no_alias_atomic_rmw_then_no_alias_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    store i32 2, ptr addrspace(1) [[NOALIAS:%.*]], align 4
; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst, align 4
; CHECK-NEXT:    fence syncscope("workgroup") release
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    fence syncscope("workgroup") acquire
; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  store i32 2, ptr addrspace(1) %noalias, align 4
  %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %ld = load i32, ptr addrspace(1) %in, align 4
  store i32 %ld, ptr addrspace(1) %out, align 4
  ret void
}

declare void @llvm.amdgcn.s.barrier()
declare void @llvm.amdgcn.wave.barrier()