; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds < %s | FileCheck -check-prefix=NOLDS %s

; This would normally be folded by instcombine into a compare of the GEP
; indices.
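;
; A rough sketch (illustrative only, not checked by this test) of that fold:
; when both GEPs index the same base object with the same element type, e.g.
;
;   %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
;   %ptr1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
;   %cmp = icmp eq ptr addrspace(5) %ptr0, %ptr1
;
; instcombine would reduce the pointer comparison to a compare of the indices:
;
;   %cmp = icmp eq i32 %a, %b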

define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; CHECK-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0:![0-9]+]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1:![0-9]+]], !invariant.load [[META0]]
; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT:    [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 [[TMP14]]
; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[B:%.*]]
; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(3) [[PTR0]], [[PTR1]]
; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; NOLDS-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
; NOLDS-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
; NOLDS-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[B:%.*]]
; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], [[PTR1]]
; NOLDS-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; NOLDS-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; NOLDS-NEXT:    ret void
;
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
  %ptr1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
  %cmp = icmp eq ptr addrspace(5) %ptr0, %ptr1
  %zext = zext i1 %cmp to i32
  store volatile i32 %zext, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; CHECK-LABEL: @lds_promoted_alloca_icmp_null_rhs(
; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT:    [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 [[TMP14]]
; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(3) [[PTR0]], null
; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; NOLDS-LABEL: @lds_promoted_alloca_icmp_null_rhs(
; NOLDS-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], null
; NOLDS-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; NOLDS-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; NOLDS-NEXT:    ret void
;
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
  %cmp = icmp eq ptr addrspace(5) %ptr0, null
  %zext = zext i1 %cmp to i32
  store volatile i32 %zext, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; CHECK-LABEL: @lds_promoted_alloca_icmp_null_lhs(
; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT:    [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 [[TMP14]]
; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(3) null, [[PTR0]]
; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; NOLDS-LABEL: @lds_promoted_alloca_icmp_null_lhs(
; NOLDS-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) null, [[PTR0]]
; NOLDS-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; NOLDS-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; NOLDS-NEXT:    ret void
;
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
  %cmp = icmp eq ptr addrspace(5) null, %ptr0
  %zext = zext i1 %cmp to i32
  store volatile i32 %zext, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; CHECK-LABEL: @lds_promoted_alloca_icmp_unknown_ptr(
; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
; CHECK-NEXT:    [[PTR1:%.*]] = call ptr addrspace(5) @get_unknown_pointer()
; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], [[PTR1]]
; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; NOLDS-LABEL: @lds_promoted_alloca_icmp_unknown_ptr(
; NOLDS-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
; NOLDS-NEXT:    [[PTR1:%.*]] = call ptr addrspace(5) @get_unknown_pointer()
; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], [[PTR1]]
; NOLDS-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
; NOLDS-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
; NOLDS-NEXT:    ret void
;
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
  %ptr1 = call ptr addrspace(5) @get_unknown_pointer()
  %cmp = icmp eq ptr addrspace(5) %ptr0, %ptr1
  %zext = zext i1 %cmp to i32
  store volatile i32 %zext, ptr addrspace(1) %out
  ret void
}

declare ptr addrspace(5) @get_unknown_pointer() #0

attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }