; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=SI,SICI,ALL %s
; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=CI,SICI,ALL %s
; RUN: opt -S -mcpu=gfx1010 -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=GFX10PLUS,ALL %s
; RUN: opt -S -mcpu=gfx1100 -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=GFX10PLUS,ALL %s

; Checks what amdgpu-promote-alloca does with a small addrspace(5) array in
; kernels that carry different "amdgpu-flat-work-group-size" and
; "amdgpu-waves-per-eu" attributes. The checks observe the result either as a
; "<kernel>.stack" global in addrspace(3) being present/absent, or as the
; alloca being present/absent in the output.
;
; Four opt invocations are verified, all with
; -disable-promote-alloca-to-vector:
;   no -mcpu       -> check prefixes SI, SICI, ALL
;   -mcpu=tonga    -> check prefixes CI, SICI, ALL
;   -mcpu=gfx1010  -> check prefixes GFX10PLUS, ALL
;   -mcpu=gfx1100  -> check prefixes GFX10PLUS, ALL
;
; NOTE: keep check-prefix names followed by a colon out of free-form comments
; in this file; FileCheck would treat such text as a directive.

; Attribute group #0 (work-group size 63,63): the run without -mcpu must not
; emit the 63-element .stack global, the tonga run must emit it.
; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] poison, align 4
; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] poison, align 4

define amdgpu_kernel void @promote_alloca_size_63(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
  %0 = load i32, ptr addrspace(1) %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
  store i32 4, ptr addrspace(5) %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
  store i32 5, ptr addrspace(5) %arrayidx3, align 4
  %2 = load i32, ptr addrspace(5) %stack, align 4
  store i32 %2, ptr addrspace(1) %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
  %3 = load i32, ptr addrspace(5) %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
  store i32 %3, ptr addrspace(1) %arrayidx13
  ret void
}

; Attribute group #1 (waves-per-eu 1,3; work-group size 256,256): every run
; must emit a 256-element .stack global.
; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] poison, align 4

define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 {
entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
  %0 = load i32, ptr addrspace(1) %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
  store i32 4, ptr addrspace(5) %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
  store i32 5, ptr addrspace(5) %arrayidx3, align 4
  %2 = load i32, ptr addrspace(5) %stack, align 4
  store i32 %2, ptr addrspace(1) %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
  %3 = load i32, ptr addrspace(5) %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
  store i32 %3, ptr addrspace(1) %arrayidx13
  ret void
}

; The kernel name says 1600, but attribute group #2 requests work-group size
; 1024,1024 (waves-per-eu 1,9). The tonga and gfx10+ runs expect a
; 1024-element .stack global; the run without -mcpu expects none.
; SI-NOT: @promote_alloca_size_1600.stack
; CI: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] poison, align 4
; GFX10PLUS: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] poison, align 4

define amdgpu_kernel void @promote_alloca_size_1600(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #2 {
entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
  %0 = load i32, ptr addrspace(1) %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
  store i32 4, ptr addrspace(5) %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
  store i32 5, ptr addrspace(5) %arrayidx3, align 4
  %2 = load i32, ptr addrspace(5) %stack, align 4
  store i32 %2, ptr addrspace(1) %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
  %3 = load i32, ptr addrspace(5) %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
  store i32 %3, ptr addrspace(1) %arrayidx13
  ret void
}

; Attribute group #3 sets only waves-per-eu 1,10 (no work-group size). The
; tonga run expects the [5 x i32] alloca to be gone, the run without -mcpu
; expects it to remain.
; ALL-LABEL: @occupancy_0(
; CI-NOT: alloca [5 x i32]
; SI: alloca [5 x i32]
define amdgpu_kernel void @occupancy_0(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #3 {
entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
  %0 = load i32, ptr addrspace(1) %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
  store i32 4, ptr addrspace(5) %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
  store i32 5, ptr addrspace(5) %arrayidx3, align 4
  %2 = load i32, ptr addrspace(5) %stack, align 4
  store i32 %2, ptr addrspace(1) %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
  %3 = load i32, ptr addrspace(5) %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
  store i32 %3, ptr addrspace(1) %arrayidx13
  ret void
}

; Attribute group #4 is textually identical to #3 (waves-per-eu 1,10); the
; expectations match those of @occupancy_0.
; ALL-LABEL: @occupancy_max(
; CI-NOT: alloca [5 x i32]
; SI: alloca [5 x i32]
define amdgpu_kernel void @occupancy_max(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #4 {
entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
  %0 = load i32, ptr addrspace(1) %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
  store i32 4, ptr addrspace(5) %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
  store i32 5, ptr addrspace(5) %arrayidx3, align 4
  %2 = load i32, ptr addrspace(5) %stack, align 4
  store i32 %2, ptr addrspace(1) %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
  %3 = load i32, ptr addrspace(5) %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
  store i32 %3, ptr addrspace(1) %arrayidx13
  ret void
}

; Attribute group #5 (waves-per-eu 1,6; work-group size 64,64) with a
; [42 x i8] array. Only the run without -mcpu and the tonga run have a label
; for this kernel: the former expects an alloca, the latter expects none.
; SI-LABEL: @occupancy_6(
; CI-LABEL: @occupancy_6(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_6(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 {
entry:
  %stack = alloca [42 x i8], align 4, addrspace(5)
  %tmp = load i8, ptr addrspace(1) %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
  store i8 4, ptr addrspace(5) %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
  store i8 5, ptr addrspace(5) %arrayidx3, align 1
  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
  store i8 %tmp2, ptr addrspace(1) %out, align 1
  %arrayidx12 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 1
  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
  ret void
}

; Same attribute group #5, array one byte larger ([43 x i8]). The run without
; -mcpu and the tonga run both expect the alloca to remain; the gfx10+ runs
; expect no alloca.
; ALL-LABEL: @occupancy_6_over(
; SICI: alloca [43 x i8]
; GFX10PLUS-NOT: alloca

define amdgpu_kernel void @occupancy_6_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 {
entry:
  %stack = alloca [43 x i8], align 4, addrspace(5)
  %tmp = load i8, ptr addrspace(1) %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
  store i8 4, ptr addrspace(5) %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
  store i8 5, ptr addrspace(5) %arrayidx3, align 1
  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
  store i8 %tmp2, ptr addrspace(1) %out, align 1
  %arrayidx12 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 1
  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
  ret void
}

; Attribute group #6 (waves-per-eu 1,8; work-group size 64,64) with a
; [32 x i8] array: the run without -mcpu expects an alloca, the tonga run
; expects none.
; SI-LABEL: @occupancy_8(
; CI-LABEL: @occupancy_8(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_8(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 {
entry:
  %stack = alloca [32 x i8], align 4, addrspace(5)
  %tmp = load i8, ptr addrspace(1) %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
  store i8 4, ptr addrspace(5) %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
  store i8 5, ptr addrspace(5) %arrayidx3, align 1
  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
  store i8 %tmp2, ptr addrspace(1) %out, align 1
  %arrayidx12 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 1
  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
  ret void
}

; Same attribute group #6, array one byte larger ([33 x i8]). The run without
; -mcpu and the tonga run both expect the alloca to remain; the gfx10+ runs
; expect no alloca.
; ALL-LABEL: @occupancy_8_over(
; SICI: alloca [33 x i8]
; GFX10PLUS-NOT: alloca

define amdgpu_kernel void @occupancy_8_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 {
entry:
  %stack = alloca [33 x i8], align 4, addrspace(5)
  %tmp = load i8, ptr addrspace(1) %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
  store i8 4, ptr addrspace(5) %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
  store i8 5, ptr addrspace(5) %arrayidx3, align 1
  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
  store i8 %tmp2, ptr addrspace(1) %out, align 1
  %arrayidx12 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 1
  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
  ret void
}

; Attribute group #7 (waves-per-eu 1,9; work-group size 64,64) with a
; [28 x i8] array: the run without -mcpu expects an alloca, the tonga run
; expects none.
; SI-LABEL: @occupancy_9(
; CI-LABEL: @occupancy_9(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_9(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #7 {
entry:
  %stack = alloca [28 x i8], align 4, addrspace(5)
  %tmp = load i8, ptr addrspace(1) %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
  store i8 4, ptr addrspace(5) %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
  store i8 5, ptr addrspace(5) %arrayidx3, align 1
  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
  store i8 %tmp2, ptr addrspace(1) %out, align 1
  %arrayidx12 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 1
  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
  ret void
}

; Same attribute group #7, array one byte larger ([29 x i8]). The run without
; -mcpu and the tonga run both expect the alloca to remain; the gfx10+ runs
; expect no alloca.
; ALL-LABEL: @occupancy_9_over(
; SICI: alloca [29 x i8]
; GFX10PLUS-NOT: alloca

define amdgpu_kernel void @occupancy_9_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #7 {
entry:
  %stack = alloca [29 x i8], align 4, addrspace(5)
  %tmp = load i8, ptr addrspace(1) %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
  store i8 4, ptr addrspace(5) %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
  store i8 5, ptr addrspace(5) %arrayidx3, align 1
  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
  store i8 %tmp2, ptr addrspace(1) %out, align 1
  %arrayidx12 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 1
  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
  ret void
}

; Attribute groups referenced by the kernels above (#N on each define).
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="63,63" }
attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="256,256" }
attributes #2 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="1024,1024" }
attributes #3 = { nounwind "amdgpu-waves-per-eu"="1,10" }
attributes #4 = { nounwind "amdgpu-waves-per-eu"="1,10" }
attributes #5 = { nounwind "amdgpu-waves-per-eu"="1,6" "amdgpu-flat-work-group-size"="64,64" }
attributes #6 = { nounwind "amdgpu-waves-per-eu"="1,8" "amdgpu-flat-work-group-size"="64,64" }
attributes #7 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="64,64" }