1; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s 2; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=ASM %s 3 4target datalayout = "A5" 5 6@all_lds = internal unnamed_addr addrspace(3) global [16384 x i32] undef, align 4 7@some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4 8@some_dynamic_lds = external hidden addrspace(3) global [0 x i32], align 4 9 10@initializer_user_some = addrspace(1) global i32 ptrtoint (ptr addrspace(3) @some_lds to i32), align 4 11@initializer_user_all = addrspace(1) global i32 ptrtoint (ptr addrspace(3) @all_lds to i32), align 4 12 13; This function cannot promote to using LDS because of the size of the 14; constant expression use in the function, which was previously not 15; detected. 16; IR-LABEL: @constant_expression_uses_all_lds( 17; IR: alloca 18 19; ASM-LABEL: constant_expression_uses_all_lds: 20; ASM: .amdhsa_group_segment_fixed_size 65536 21define amdgpu_kernel void @constant_expression_uses_all_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 { 22entry: 23 %stack = alloca [4 x i32], align 4, addrspace(5) 24 %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 25 %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 26 %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 27 store i32 9, ptr addrspace(5) %stack 28 store i32 10, ptr addrspace(5) %gep1 29 store i32 99, ptr addrspace(5) %gep2 30 store i32 43, ptr addrspace(5) %gep3 31 %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx 32 %load = load i32, ptr addrspace(5) %arrayidx, align 4 33 store i32 %load, ptr addrspace(1) %out 34 35 store volatile i32 ptrtoint (ptr addrspace(3) @all_lds to i32), ptr addrspace(1) undef 36 ret void 37} 38 39; Has a constant expression use through a single level of constant 40; expression, but not enough LDS to block promotion 41 42; IR-LABEL: @constant_expression_uses_some_lds( 43; IR-NOT: alloca 44 45; ASM-LABEL: {{^}}constant_expression_uses_some_lds: 46; ASM: .amdhsa_group_segment_fixed_size 4224{{$}} 47define amdgpu_kernel void @constant_expression_uses_some_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 { 48entry: 49 %stack = alloca [4 x i32], align 4, addrspace(5) 50 %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 51 %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 52 %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 53 store i32 9, ptr addrspace(5) %stack 54 store i32 10, ptr addrspace(5) %gep1 55 store i32 99, ptr addrspace(5) %gep2 56 store i32 43, ptr addrspace(5) %gep3 57 %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx 58 %load = load i32, ptr addrspace(5) %arrayidx, align 4 59 store i32 %load, ptr addrspace(1) %out 60 store volatile i32 ptrtoint (ptr addrspace(3) @some_lds to i32), ptr addrspace(1) undef 61 ret void 62} 63 64; Has a constant expression use through a single level of constant 65; expression, but usage of dynamic LDS should block promotion 66 67; IR-LABEL: @constant_expression_uses_some_dynamic_lds( 68; IR: alloca 69 70; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds: 71; ASM: .amdhsa_group_segment_fixed_size 0{{$}} 72define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 { 73entry: 74 %stack = alloca [4 x i32], align 4, addrspace(5) 75 %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 76 %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 77 %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 78 store i32 9, ptr addrspace(5) %stack 79 store i32 10, ptr addrspace(5) %gep1 80 store i32 99, ptr addrspace(5) %gep2 81 store i32 43, ptr addrspace(5) %gep3 82 %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx 83 %load = load i32, ptr addrspace(5) %arrayidx, align 4 84 store i32 %load, ptr addrspace(1) %out 85 store i32 1234, ptr addrspacecast (ptr addrspace(3) @some_dynamic_lds to ptr), align 4 86 ret void 87} 88 89declare void @callee(ptr) 90 91; IR-LABEL: @constant_expression_uses_all_lds_multi_level( 92; IR: alloca 93 94; ASM-LABEL: {{^}}constant_expression_uses_all_lds_multi_level: 95; ASM: .amdhsa_group_segment_fixed_size 65536{{$}} 96define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 { 97entry: 98 %stack = alloca [4 x i32], align 4, addrspace(5) 99 %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 100 %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 101 %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 102 store i32 9, ptr addrspace(5) %stack 103 store i32 10, ptr addrspace(5) %gep1 104 store i32 99, ptr addrspace(5) %gep2 105 store i32 43, ptr addrspace(5) %gep3 106 %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx 107 %load = load i32, ptr addrspace(5) %arrayidx, align 4 108 store i32 %load, ptr addrspace(1) %out 109 call void @callee(ptr addrspacecast (ptr addrspace(3) getelementptr inbounds ([16384 x i32], ptr addrspace(3) @all_lds, i32 0, i32 8) to ptr)) 110 ret void 111} 112 113; IR-LABEL: @constant_expression_uses_some_lds_multi_level( 114; IR-NOT: alloca 115; IR: llvm.amdgcn.workitem.id 116 117; ASM-LABEL: {{^}}constant_expression_uses_some_lds_multi_level: 118; ASM: .amdhsa_group_segment_fixed_size 4224{{$}} 119define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 { 120entry: 121 %stack = alloca [4 x i32], align 4, addrspace(5) 122 %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 123 %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 124 %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 125 store i32 9, ptr addrspace(5) %stack 126 store i32 10, ptr addrspace(5) %gep1 127 store i32 99, ptr addrspace(5) %gep2 128 store i32 43, ptr addrspace(5) %gep3 129 %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx 130 %load = load i32, ptr addrspace(5) %arrayidx, align 4 131 store i32 %load, ptr addrspace(1) %out 132 call void @callee(ptr addrspacecast (ptr addrspace(3) getelementptr inbounds ([32 x i32], ptr addrspace(3) @some_lds, i32 0, i32 8) to ptr)) 133 ret void 134} 135 136; IR-LABEL: @constant_expression_uses_some_dynamic_lds_multi_level( 137; IR: alloca 138 139; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds_multi_level: 140; ASM: .amdhsa_group_segment_fixed_size 0{{$}} 141define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 { 142entry: 143 %stack = alloca [4 x i32], align 4, addrspace(5) 144 %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 145 %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 146 %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 147 store i32 9, ptr addrspace(5) %stack 148 store i32 10, ptr addrspace(5) %gep1 149 store i32 99, ptr addrspace(5) %gep2 150 store i32 43, ptr addrspace(5) %gep3 151 %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx 152 %load = load i32, ptr addrspace(5) %arrayidx, align 4 153 store i32 %load, ptr addrspace(1) %out 154 call void @callee(ptr addrspacecast (ptr addrspace(3) @some_dynamic_lds to ptr)) 155 ret void 156} 157 158; IR-LABEL: @constant_expression_uses_some_lds_global_initializer( 159; IR-NOT: alloca 160; IR: llvm.amdgcn.workitem.id 161 162; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer: 163; ASM: .amdhsa_group_segment_fixed_size 4096{{$}} 164define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(ptr addrspace(1) nocapture %out, i32 %idx) #0 { 165entry: 166 %stack = alloca [4 x i32], align 4, addrspace(5) 167 %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 168 %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 169 %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 170 store i32 9, ptr addrspace(5) %stack 171 store i32 10, ptr addrspace(5) %gep1 172 store i32 99, ptr addrspace(5) %gep2 173 store i32 43, ptr addrspace(5) %gep3 174 %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx 175 %load = load i32, ptr addrspace(5) %arrayidx, align 4 176 store i32 %load, ptr addrspace(1) %out 177 178 store volatile i32 ptrtoint (ptr addrspace(1) @initializer_user_some to i32), ptr addrspace(1) undef 179 ret void 180} 181 182; We can't actually handle LDS initializers in global initializers, 183; but this should count as usage. 184 185; IR-LABEL: @constant_expression_uses_all_lds_global_initializer( 186; IR: alloca 187 188; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer: 189; ASM: .group_segment_fixed_size: 65536 190define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(ptr addrspace(1) nocapture %out, i32 %idx) #0 { 191entry: 192 %stack = alloca [4 x i32], align 4, addrspace(5) 193 %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 194 %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 195 %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 196 store i32 9, ptr addrspace(5) %stack 197 store i32 10, ptr addrspace(5) %gep1 198 store i32 99, ptr addrspace(5) %gep2 199 store i32 43, ptr addrspace(5) %gep3 200 %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx 201 %load = load i32, ptr addrspace(5) %arrayidx, align 4 202 store i32 %load, ptr addrspace(1) %out 203 store volatile i32 ptrtoint (ptr addrspace(1) @initializer_user_all to i32), ptr addrspace(1) undef 204 ret void 205} 206 207attributes #0 = { "amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="256,256" } 208