; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds< %s | FileCheck -check-prefix=NOLDS %s

; Pointer comparisons (icmp) whose operands are derived from an alloca that
; is a candidate for promotion to LDS. The first invocation runs the pass
; with its default settings; the second one passes
; -disable-promote-alloca-to-lds, and there every alloca is expected to stay
; in addrspace(5).

; instcombine would normally fold this into a comparison of the GEP
; indices.

; Both icmp operands are GEPs off the same alloca. With LDS promotion enabled
; the alloca is expected to be replaced by an addrspace(3) global indexed by a
; value computed from the work-item ids, and the icmp is expected to compare
; addrspace(3) pointers.
define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; CHECK-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0:![0-9]+]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1:![0-9]+]], !invariant.load [[META0]]
; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT:    [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 [[TMP14]]
; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[B:%.*]]
; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(3) [[PTR0]], [[PTR1]]
; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; NOLDS-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
; NOLDS-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
; NOLDS-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[B:%.*]]
; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], [[PTR1]]
; NOLDS-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; NOLDS-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; NOLDS-NEXT:    ret void
;
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
  %ptr1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
  %cmp = icmp eq ptr addrspace(5) %ptr0, %ptr1
  %zext = zext i1 %cmp to i32
  store volatile i32 %zext, ptr addrspace(1) %out
  ret void
}

; The alloca-derived pointer is compared against null on the right-hand side.
; With LDS promotion enabled the alloca is still expected to be promoted, and
; the null operand is expected to become an addrspace(3) null.
define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; CHECK-LABEL: @lds_promoted_alloca_icmp_null_rhs(
; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT:    [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 [[TMP14]]
; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(3) [[PTR0]], null
; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; NOLDS-LABEL: @lds_promoted_alloca_icmp_null_rhs(
; NOLDS-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], null
; NOLDS-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; NOLDS-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; NOLDS-NEXT:    ret void
;
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
  %cmp = icmp eq ptr addrspace(5) %ptr0, null
  %zext = zext i1 %cmp to i32
  store volatile i32 %zext, ptr addrspace(1) %out
  ret void
}

; Same as the previous test, but with null as the left-hand icmp operand.
define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; CHECK-LABEL: @lds_promoted_alloca_icmp_null_lhs(
; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT:    [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 [[TMP14]]
; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(3) null, [[PTR0]]
; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; NOLDS-LABEL: @lds_promoted_alloca_icmp_null_lhs(
; NOLDS-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) null, [[PTR0]]
; NOLDS-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; NOLDS-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; NOLDS-NEXT:    ret void
;
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
  %cmp = icmp eq ptr addrspace(5) null, %ptr0
  %zext = zext i1 %cmp to i32
  store volatile i32 %zext, ptr addrspace(1) %out
  ret void
}

; The alloca-derived pointer is compared against a pointer returned by an
; external call. In this case the alloca is expected to be left untouched in
; addrspace(5) under both invocations.
define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; CHECK-LABEL: @lds_promoted_alloca_icmp_unknown_ptr(
; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
; CHECK-NEXT:    [[PTR1:%.*]] = call ptr addrspace(5) @get_unknown_pointer()
; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], [[PTR1]]
; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; NOLDS-LABEL: @lds_promoted_alloca_icmp_unknown_ptr(
; NOLDS-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
; NOLDS-NEXT:    [[PTR1:%.*]] = call ptr addrspace(5) @get_unknown_pointer()
; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], [[PTR1]]
; NOLDS-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; NOLDS-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; NOLDS-NEXT:    ret void
;
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
  %ptr1 = call ptr addrspace(5) @get_unknown_pointer()
  %cmp = icmp eq ptr addrspace(5) %ptr0, %ptr1
  %zext = zext i1 %cmp to i32
  store volatile i32 %zext, ptr addrspace(1) %out
  ret void
}

; External function with no visible body; it supplies the pointer operand
; used by @lds_promoted_alloca_icmp_unknown_ptr.
declare ptr addrspace(5) @get_unknown_pointer() #0

attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }