; RUN: llc -mtriple=r600 -mcpu=redwood -disable-promote-alloca-to-vector < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600-VECT -check-prefix=FUNC
; RUN: opt -S -mtriple=r600-unknown-unknown -mcpu=redwood -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=OPT %s
target datalayout = "A5"

declare i32 @llvm.r600.read.tidig.x() nounwind readnone

; Stores to two dynamically indexed elements of a private [5 x i32] array
; (indices are loaded from %in), then copies elements 0 and 1 to %out.

; FUNC-LABEL: {{^}}mova_same_clause:

; R600: LDS_WRITE
; R600: LDS_WRITE
; R600: LDS_READ
; R600: LDS_READ

; OPT: call range(i32 0, 257) i32 @llvm.r600.read.local.size.y()
; OPT: call range(i32 0, 257) i32 @llvm.r600.read.local.size.z()
; OPT: call range(i32 0, 256) i32 @llvm.r600.read.tidig.x()
; OPT: call range(i32 0, 256) i32 @llvm.r600.read.tidig.y()
; OPT: call range(i32 0, 256) i32 @llvm.r600.read.tidig.z()

define amdgpu_kernel void @mova_same_clause(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
  ; First dynamic index comes from in[0].
  %0 = load i32, ptr addrspace(1) %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
  store i32 4, ptr addrspace(5) %arrayidx1, align 4
  ; Second dynamic index comes from in[1].
  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
  store i32 5, ptr addrspace(5) %arrayidx3, align 4
  ; Read back the constant-indexed elements 0 and 1 and write them to out[0..1].
  %2 = load i32, ptr addrspace(5) %stack, align 4
  store i32 %2, ptr addrspace(1) %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
  %3 = load i32, ptr addrspace(5) %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
  store i32 %3, ptr addrspace(1) %arrayidx13
  ret void
}

; This test checks that the stack offset is calculated
; correctly for structs.
; All register loads/stores should be optimized away, so there shouldn't be
; any MOVA instructions.
;
; XXX: This generated code has unnecessary MOVs, we should be able to optimize
; this.

; FUNC-LABEL: {{^}}multiple_structs:
; R600-NOT: MOVA_INT
%struct.point = type { i32, i32 }

define amdgpu_kernel void @multiple_structs(ptr addrspace(1) %out) #0 {
entry:
  ; Two independent private structs; every access below uses a constant offset.
  %a = alloca %struct.point, addrspace(5)
  %b = alloca %struct.point, addrspace(5)
  %a.y.ptr = getelementptr inbounds %struct.point, ptr addrspace(5) %a, i32 0, i32 1
  %b.y.ptr = getelementptr inbounds %struct.point, ptr addrspace(5) %b, i32 0, i32 1
  store i32 0, ptr addrspace(5) %a
  store i32 1, ptr addrspace(5) %a.y.ptr
  store i32 2, ptr addrspace(5) %b
  store i32 3, ptr addrspace(5) %b.y.ptr
  ; Only the first field of each struct is read back and summed.
  %a.indirect = load i32, ptr addrspace(5) %a
  %b.indirect = load i32, ptr addrspace(5) %b
  %0 = add i32 %a.indirect, %b.indirect
  store i32 %0, ptr addrspace(1) %out
  ret void
}

; Test direct access of a private array inside a loop. The private array
; loads and stores should be lowered to copies, so there shouldn't be any
; MOVA instructions.
; FUNC-LABEL: {{^}}direct_loop:
; R600-NOT: MOVA_INT

define amdgpu_kernel void @direct_loop(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
  %prv_array_const = alloca [2 x i32], addrspace(5)
  %prv_array = alloca [2 x i32], addrspace(5)
  %a = load i32, ptr addrspace(1) %in
  %b_src_ptr = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
  %b = load i32, ptr addrspace(1) %b_src_ptr
  store i32 %a, ptr addrspace(5) %prv_array_const
  %b_dst_ptr = getelementptr inbounds [2 x i32], ptr addrspace(5) %prv_array_const, i32 0, i32 1
  store i32 %b, ptr addrspace(5) %b_dst_ptr
  br label %for.body

for.body:
  %inc = phi i32 [0, %entry], [%count, %for.body]
  %x = load i32, ptr addrspace(5) %prv_array_const
  ; NOTE(review): on the first iteration %prv_array is loaded before any store
  ; to it in this function; presumably intentional for a codegen-only test.
  %y = load i32, ptr addrspace(5) %prv_array
  %xy = add i32 %x, %y
  store i32 %xy, ptr addrspace(5) %prv_array
  %count = add i32 %inc, 1
  %done = icmp eq i32 %count, 4095
  br i1 %done, label %for.end, label %for.body

for.end:
  %value = load i32, ptr addrspace(5) %prv_array
  store i32 %value, ptr addrspace(1) %out
  ret void
}

; FUNC-LABEL: {{^}}short_array:

; R600-VECT: MOVA_INT
define amdgpu_kernel void @short_array(ptr addrspace(1) %out, i32 %index) #0 {
entry:
  %0 = alloca [2 x i16], addrspace(5)
  %1 = getelementptr inbounds [2 x i16], ptr addrspace(5) %0, i32 0, i32 1
  store i16 0, ptr addrspace(5) %0
  store i16 1, ptr addrspace(5) %1
  ; Dynamically indexed read of the i16 array, sign-extended for the store.
  %2 = getelementptr inbounds [2 x i16], ptr addrspace(5) %0, i32 0, i32 %index
  %3 = load i16, ptr addrspace(5) %2
  %4 = sext i16 %3 to i32
  store i32 %4, ptr addrspace(1) %out
  ret void
}

; FUNC-LABEL: {{^}}char_array:

; R600-VECT: MOVA_INT
define amdgpu_kernel void @char_array(ptr addrspace(1) %out, i32 %index) #0 {
entry:
  %0 = alloca [2 x i8], addrspace(5)
  %1 = getelementptr inbounds [2 x i8], ptr addrspace(5) %0, i32 0, i32 1
  store i8 0, ptr addrspace(5) %0
  store i8 1, ptr addrspace(5) %1
  ; Dynamically indexed read of the i8 array, sign-extended for the store.
  %2 = getelementptr inbounds [2 x i8], ptr addrspace(5) %0, i32 0, i32 %index
  %3 = load i8, ptr addrspace(5) %2
  %4 = sext i8 %3 to i32
  store i32 %4, ptr addrspace(1) %out
  ret void

}

; Make sure we don't overwrite workitem information with private memory

; FUNC-LABEL: {{^}}work_item_info:
; R600-NOT: MOV T0.X
; Additional check in case the move ends up in the last slot
; R600-NOT: MOV * T0.X
define amdgpu_kernel void @work_item_info(ptr addrspace(1) %out, i32 %in) #0 {
entry:
  %0 = alloca [2 x i32], addrspace(5)
  %1 = getelementptr inbounds [2 x i32], ptr addrspace(5) %0, i32 0, i32 1
  store i32 0, ptr addrspace(5) %0
  store i32 1, ptr addrspace(5) %1
  %2 = getelementptr inbounds [2 x i32], ptr addrspace(5) %0, i32 0, i32 %in
  %3 = load i32, ptr addrspace(5) %2
  ; The work-item id is read after the private array has been written.
  %4 = call i32 @llvm.r600.read.tidig.x()
  %5 = add i32 %3, %4
  store i32 %5, ptr addrspace(1) %out
  ret void
}

; Test that two stack objects are not stored in the same register
; The second stack object should be in T3.X
; FUNC-LABEL: {{^}}no_overlap:
define amdgpu_kernel void @no_overlap(ptr addrspace(1) %out, i32 %in) #0 {
entry:
  %0 = alloca [3 x i8], align 1, addrspace(5)
  %1 = alloca [2 x i8], align 1, addrspace(5)
  %2 = getelementptr inbounds [3 x i8], ptr addrspace(5) %0, i32 0, i32 1
  %3 = getelementptr inbounds [3 x i8], ptr addrspace(5) %0, i32 0, i32 2
  %4 = getelementptr inbounds [2 x i8], ptr addrspace(5) %1, i32 0, i32 1
  store i8 0, ptr addrspace(5) %0
  store i8 1, ptr addrspace(5) %2
  store i8 2, ptr addrspace(5) %3
  store i8 1, ptr addrspace(5) %1
  store i8 0, ptr addrspace(5) %4
  ; Both arrays are read at the same dynamic index and the results are summed.
  %5 = getelementptr inbounds [3 x i8], ptr addrspace(5) %0, i32 0, i32 %in
  %6 = getelementptr inbounds [2 x i8], ptr addrspace(5) %1, i32 0, i32 %in
  %7 = load i8, ptr addrspace(5) %5
  %8 = load i8, ptr addrspace(5) %6
  %9 = add i8 %7, %8
  %10 = sext i8 %9 to i32
  store i32 %10, ptr addrspace(1) %out
  ret void
}

; The nested-array and struct-array kernels below, through select_private,
; have no check lines of their own in this file.

define amdgpu_kernel void @char_array_array(ptr addrspace(1) %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x [2 x i8]], addrspace(5)
  %gep1 = getelementptr inbounds [2 x [2 x i8]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
  store i8 0, ptr addrspace(5) %alloca
  store i8 1, ptr addrspace(5) %gep1
  %gep2 = getelementptr inbounds [2 x [2 x i8]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index
  %load = load i8, ptr addrspace(5) %gep2
  %sext = sext i8 %load to i32
  store i32 %sext, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @i32_array_array(ptr addrspace(1) %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x [2 x i32]], addrspace(5)
  %gep1 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
  store i32 0, ptr addrspace(5) %alloca
  store i32 1, ptr addrspace(5) %gep1
  %gep2 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index
  %load = load i32, ptr addrspace(5) %gep2
  store i32 %load, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @i64_array_array(ptr addrspace(1) %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x [2 x i64]], addrspace(5)
  %gep1 = getelementptr inbounds [2 x [2 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
  store i64 0, ptr addrspace(5) %alloca
  store i64 1, ptr addrspace(5) %gep1
  %gep2 = getelementptr inbounds [2 x [2 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index
  %load = load i64, ptr addrspace(5) %gep2
  store i64 %load, ptr addrspace(1) %out
  ret void
}

%struct.pair32 = type { i32, i32 }

define amdgpu_kernel void @struct_array_array(ptr addrspace(1) %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5)
  %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0, i32 1
  %gep1 = getelementptr inbounds [2 x [2 x %struct.pair32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1, i32 1
  store i32 0, ptr addrspace(5) %gep0
  store i32 1, ptr addrspace(5) %gep1
  %gep2 = getelementptr inbounds [2 x [2 x %struct.pair32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index, i32 0
  %load = load i32, ptr addrspace(5) %gep2
  store i32 %load, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @struct_pair32_array(ptr addrspace(1) %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x %struct.pair32], addrspace(5)
  %gep0 = getelementptr inbounds [2 x %struct.pair32], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
  %gep1 = getelementptr inbounds [2 x %struct.pair32], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
  store i32 0, ptr addrspace(5) %gep0
  store i32 1, ptr addrspace(5) %gep1
  %gep2 = getelementptr inbounds [2 x %struct.pair32], ptr addrspace(5) %alloca, i32 0, i32 %index, i32 0
  %load = load i32, ptr addrspace(5) %gep2
  store i32 %load, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @select_private(ptr addrspace(1) %out, i32 %in) nounwind {
entry:
  %tmp = alloca [2 x i32], addrspace(5)
  %tmp2 = getelementptr inbounds [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
  store i32 0, ptr addrspace(5) %tmp
  store i32 1, ptr addrspace(5) %tmp2
  ; The loaded address is chosen between the two elements by a select.
  %cmp = icmp eq i32 %in, 0
  %sel = select i1 %cmp, ptr addrspace(5) %tmp, ptr addrspace(5) %tmp2
  %load = load i32, ptr addrspace(5) %sel
  store i32 %load, ptr addrspace(1) %out
  ret void
}

; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it
; finds one, it should stop trying to promote.
; FUNC-LABEL: {{^}}ptrtoint:
; NOTE(review): no RUN line visible in this file enables the SI check prefix,
; so the three SI-prefixed lines below appear to be inert here - confirm
; whether they should be dropped or moved to a test that targets SI.
; SI-NOT: ds_write
; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
define amdgpu_kernel void @ptrtoint(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
  %alloca = alloca [16 x i32], addrspace(5)
  %tmp0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
  store i32 5, ptr addrspace(5) %tmp0
  ; Round-trip the alloca address through an integer: ptrtoint, add 5, inttoptr.
  %tmp1 = ptrtoint ptr addrspace(5) %alloca to i32
  %tmp2 = add i32 %tmp1, 5
  %tmp3 = inttoptr i32 %tmp2 to ptr addrspace(5)
  %tmp4 = getelementptr inbounds i32, ptr addrspace(5) %tmp3, i32 %b
  %tmp5 = load i32, ptr addrspace(5) %tmp4
  store i32 %tmp5, ptr addrspace(1) %out
  ret void
}

attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" }