; Regression test for the load-store-vectorizer pass on an AMDGPU target.
; Each function places pointer-typed accesses next to other pointer, integer,
; double or vector accesses; the FileCheck patterns above each function state
; whether a single vector access is expected in the pass output.
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s

; Per this data layout, addrspace(1) pointers are 64 bits wide (p1:64:64) and
; addrspace(3) pointers are 32 bits wide (p3:32:32).
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

; Declared but not called by any function shown in this file.
declare i32 @llvm.amdgcn.workitem.id.x() #1

; Two adjacent 64-bit addrspace(1) pointer loads and two adjacent null-pointer
; stores. Expected output: one <2 x i64> load whose elements are converted
; back with inttoptr, and one <2 x i64> zero store.
; CHECK-LABEL: @merge_v2p1i8(
; CHECK: load <2 x i64>
; CHECK: inttoptr i64 %{{[^ ]+}} to ptr addrspace(1)
; CHECK: inttoptr i64 %{{[^ ]+}} to ptr addrspace(1)
; CHECK: store <2 x i64> zeroinitializer
define amdgpu_kernel void @merge_v2p1i8(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture readonly %b) #0 {
entry:
  ; Addresses of the second pointer-sized slot in each buffer.
  %a.1 = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %a, i64 1
  %b.1 = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %b, i64 1

  ; The loaded values have no users; only the memory accesses matter here.
  %ld.c = load ptr addrspace(1), ptr addrspace(1) %b, align 4
  %ld.c.idx.1 = load ptr addrspace(1), ptr addrspace(1) %b.1, align 4

  store ptr addrspace(1) null, ptr addrspace(1) %a, align 4
  store ptr addrspace(1) null, ptr addrspace(1) %a.1, align 4

  ret void
}

; Same shape as the previous function, but with 32-bit addrspace(3) pointers,
; so the expected vector type is <2 x i32>.
; CHECK-LABEL: @merge_v2p3i8(
; CHECK: load <2 x i32>
; CHECK: inttoptr i32 %{{[^ ]+}} to ptr addrspace(3)
; CHECK: inttoptr i32 %{{[^ ]+}} to ptr addrspace(3)
; CHECK: store <2 x i32> zeroinitializer
define amdgpu_kernel void @merge_v2p3i8(ptr addrspace(3) nocapture %a, ptr addrspace(3) nocapture readonly %b) #0 {
entry:
  %a.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 1
  %b.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 1

  %ld.c = load ptr addrspace(3), ptr addrspace(3) %b, align 4
  %ld.c.idx.1 = load ptr addrspace(3), ptr addrspace(3) %b.1, align 4

  store ptr addrspace(3) null, ptr addrspace(3) %a, align 4
  store ptr addrspace(3) null, ptr addrspace(3) %a.1, align 4

  ret void
}

; An i32, a 32-bit pointer and a <2 x i32> at pointer-sized slots 0, 1 and 2.
; Expected output: one <4 x i32> load and one <4 x i32> store.
; CHECK-LABEL: @merge_ptr_i32(
; CHECK: load <4 x i32>
; CHECK: store <4 x i32>
define amdgpu_kernel void @merge_ptr_i32(ptr addrspace(3) nocapture %a, ptr addrspace(3) nocapture readonly %b) #0 {
entry:
  ; Slots are indexed in units of a 32-bit addrspace(3) pointer.
  %a.0 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 0
  %a.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 1
  %a.2 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 2

  %b.0 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 0
  %b.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 1
  %b.2 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 2

  %ld.0 = load i32, ptr addrspace(3) %b.0, align 16
  %ld.1 = load ptr addrspace(3), ptr addrspace(3) %b.1, align 4
  %ld.2 = load <2 x i32>, ptr addrspace(3) %b.2, align 8

  store i32 0, ptr addrspace(3) %a.0, align 16
  store ptr addrspace(3) null, ptr addrspace(3) %a.1, align 4
  store <2 x i32> <i32 0, i32 0>, ptr addrspace(3) %a.2, align 8

  ret void
}

; Variant of the previous function with the <2 x i32> access first (slot 0),
; followed by the pointer at slot 2 and the i32 at slot 3.
; CHECK-LABEL: @merge_ptr_i32_vec_first(
; CHECK: load <4 x i32>
; CHECK: store <4 x i32>
define amdgpu_kernel void @merge_ptr_i32_vec_first(ptr addrspace(3) nocapture %a, ptr addrspace(3) nocapture readonly %b) #0 {
entry:
  %a.0 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 0
  %a.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 2
  %a.2 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i64 3

  %b.0 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 0
  %b.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 2
  %b.2 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %b, i64 3

  %ld.0 = load <2 x i32>, ptr addrspace(3) %b.0, align 16
  %ld.1 = load ptr addrspace(3), ptr addrspace(3) %b.1, align 8
  %ld.2 = load i32, ptr addrspace(3) %b.2, align 4

  store <2 x i32> <i32 0, i32 0>, ptr addrspace(3) %a.0, align 16
  store ptr addrspace(3) null, ptr addrspace(3) %a.1, align 8
  store i32 0, ptr addrspace(3) %a.2, align 4

  ret void
}

; i64 load followed by a 64-bit pointer load. Expected output: one <2 x i64>
; load, with element 1 converted to a pointer via inttoptr.
; CHECK-LABEL: @merge_load_i64_ptr64(
; CHECK: load <2 x i64>
; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
; CHECK: inttoptr i64 [[ELT1]] to ptr addrspace(1)
define amdgpu_kernel void @merge_load_i64_ptr64(ptr addrspace(1) nocapture %a) #0 {
entry:
  %a.1 = getelementptr inbounds i64, ptr addrspace(1) %a, i64 1

  %ld.0 = load i64, ptr addrspace(1) %a
  %ld.1 = load ptr addrspace(1), ptr addrspace(1) %a.1

  ret void
}

; 64-bit pointer load followed by an i64 load. Expected output: one <2 x i64>
; load, with element 0 converted to a pointer via inttoptr.
; CHECK-LABEL: @merge_load_ptr64_i64(
; CHECK: load <2 x i64>
; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
; CHECK: inttoptr i64 [[ELT0]] to ptr addrspace(1)
define amdgpu_kernel void @merge_load_ptr64_i64(ptr addrspace(1) nocapture %a) #0 {
entry:
  %a.1 = getelementptr inbounds i64, ptr addrspace(1) %a, i64 1

  %ld.0 = load ptr addrspace(1), ptr addrspace(1) %a
  %ld.1 = load i64, ptr addrspace(1) %a.1

  ret void
}

; 64-bit pointer store followed by an i64 store. Expected output: %ptr0 is
; converted with ptrtoint, inserted as element 0, and stored as <2 x i64>.
; CHECK-LABEL: @merge_store_ptr64_i64(
; CHECK: [[ELT0:%[^ ]+]] = ptrtoint ptr addrspace(1) %ptr0 to i64
; CHECK: insertelement <2 x i64> poison, i64 [[ELT0]], i32 0
; CHECK: store <2 x i64>
define amdgpu_kernel void @merge_store_ptr64_i64(ptr addrspace(1) nocapture %a, ptr addrspace(1) %ptr0, i64 %val1) #0 {
entry:
  %a.1 = getelementptr inbounds i64, ptr addrspace(1) %a, i64 1


  store ptr addrspace(1) %ptr0, ptr addrspace(1) %a
  store i64 %val1, ptr addrspace(1) %a.1

  ret void
}

; i64 store followed by a 64-bit pointer store. Expected output: %ptr1 is
; converted with ptrtoint, inserted as element 1, and stored as <2 x i64>.
; CHECK-LABEL: @merge_store_i64_ptr64(
; CHECK: [[ELT1:%[^ ]+]] = ptrtoint ptr addrspace(1) %ptr1 to i64
; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1]], i32 1
; CHECK: store <2 x i64>
define amdgpu_kernel void @merge_store_i64_ptr64(ptr addrspace(1) nocapture %a, i64 %val0, ptr addrspace(1) %ptr1) #0 {
entry:
  %a.1 = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %a, i64 1

  store i64 %val0, ptr addrspace(1) %a
  store ptr addrspace(1) %ptr1, ptr addrspace(1) %a.1

  ret void
}

; 32-bit counterpart of merge_load_i64_ptr64: i32 load then addrspace(3)
; pointer load, expected to become one <2 x i32> load.
; CHECK-LABEL: @merge_load_i32_ptr32(
; CHECK: load <2 x i32>
; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 1
; CHECK: inttoptr i32 [[ELT1]] to ptr addrspace(3)
define amdgpu_kernel void @merge_load_i32_ptr32(ptr addrspace(3) nocapture %a) #0 {
entry:
  %a.1 = getelementptr inbounds i32, ptr addrspace(3) %a, i32 1

  %ld.0 = load i32, ptr addrspace(3) %a
  %ld.1 = load ptr addrspace(3), ptr addrspace(3) %a.1

  ret void
}

; 32-bit counterpart of merge_load_ptr64_i64: addrspace(3) pointer load then
; i32 load, expected to become one <2 x i32> load.
; CHECK-LABEL: @merge_load_ptr32_i32(
; CHECK: load <2 x i32>
; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 0
; CHECK: inttoptr i32 [[ELT0]] to ptr addrspace(3)
define amdgpu_kernel void @merge_load_ptr32_i32(ptr addrspace(3) nocapture %a) #0 {
entry:
  %a.1 = getelementptr inbounds i32, ptr addrspace(3) %a, i32 1

  %ld.0 = load ptr addrspace(3), ptr addrspace(3) %a
  %ld.1 = load i32, ptr addrspace(3) %a.1

  ret void
}

; 32-bit counterpart of merge_store_ptr64_i64: pointer store then i32 store,
; expected to become one <2 x i32> store with %ptr0 as element 0.
; CHECK-LABEL: @merge_store_ptr32_i32(
; CHECK: [[ELT0:%[^ ]+]] = ptrtoint ptr addrspace(3) %ptr0 to i32
; CHECK: insertelement <2 x i32> poison, i32 [[ELT0]], i32 0
; CHECK: store <2 x i32>
define amdgpu_kernel void @merge_store_ptr32_i32(ptr addrspace(3) nocapture %a, ptr addrspace(3) %ptr0, i32 %val1) #0 {
entry:
  %a.1 = getelementptr inbounds i32, ptr addrspace(3) %a, i32 1

  store ptr addrspace(3) %ptr0, ptr addrspace(3) %a
  store i32 %val1, ptr addrspace(3) %a.1

  ret void
}

; 32-bit counterpart of merge_store_i64_ptr64: i32 store then pointer store,
; expected to become one <2 x i32> store with %ptr1 as element 1.
; CHECK-LABEL: @merge_store_i32_ptr32(
; CHECK: [[ELT1:%[^ ]+]] = ptrtoint ptr addrspace(3) %ptr1 to i32
; CHECK: insertelement <2 x i32> %{{[^ ]+}}, i32 [[ELT1]], i32 1
; CHECK: store <2 x i32>
define amdgpu_kernel void @merge_store_i32_ptr32(ptr addrspace(3) nocapture %a, i32 %val0, ptr addrspace(3) %ptr1) #0 {
entry:
  %a.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %a, i32 1

  store i32 %val0, ptr addrspace(3) %a
  store ptr addrspace(3) %ptr1, ptr addrspace(3) %a.1

  ret void
}

; Negative test: a 32-bit addrspace(3) pointer value and an i64 are stored
; through addrspace(1). Both stores are expected to remain scalar.
; CHECK-LABEL: @no_merge_store_ptr32_i64(
; CHECK: store ptr addrspace(3)
; CHECK: store i64
define amdgpu_kernel void @no_merge_store_ptr32_i64(ptr addrspace(1) nocapture %a, ptr addrspace(3) %ptr0, i64 %val1) #0 {
entry:
  %a.1 = getelementptr inbounds i64, ptr addrspace(1) %a, i64 1


  store ptr addrspace(3) %ptr0, ptr addrspace(1) %a
  store i64 %val1, ptr addrspace(1) %a.1

  ret void
}

; Negative test: an i64 store followed by a store of a 32-bit addrspace(3)
; pointer value. Both stores are expected to remain scalar.
; CHECK-LABEL: @no_merge_store_i64_ptr32(
; CHECK: store i64
; CHECK: store ptr addrspace(3)
define amdgpu_kernel void @no_merge_store_i64_ptr32(ptr addrspace(1) nocapture %a, i64 %val0, ptr addrspace(3) %ptr1) #0 {
entry:
  ; The element type here is a 32-bit pointer, so %a.1 is %a plus 4 bytes.
  %a.1 = getelementptr inbounds ptr addrspace(3), ptr addrspace(1) %a, i64 1

  store i64 %val0, ptr addrspace(1) %a
  store ptr addrspace(3) %ptr1, ptr addrspace(1) %a.1

  ret void
}

; Negative test: an i64 load followed by a load of a 32-bit addrspace(3)
; pointer value. Both loads are expected to remain scalar.
; CHECK-LABEL: @no_merge_load_i64_ptr32(
; CHECK: load i64,
; CHECK: load ptr addrspace(3),
define amdgpu_kernel void @no_merge_load_i64_ptr32(ptr addrspace(1) nocapture %a) #0 {
entry:
  %a.1 = getelementptr inbounds i64, ptr addrspace(1) %a, i64 1

  %ld.0 = load i64, ptr addrspace(1) %a
  %ld.1 = load ptr addrspace(3), ptr addrspace(1) %a.1

  ret void
}

; Negative test: a load of a 32-bit addrspace(3) pointer value followed by an
; i64 load. Both loads are expected to remain scalar.
; CHECK-LABEL: @no_merge_load_ptr32_i64(
; CHECK: load ptr addrspace(3),
; CHECK: load i64,
define amdgpu_kernel void @no_merge_load_ptr32_i64(ptr addrspace(1) nocapture %a) #0 {
entry:
  %a.1 = getelementptr inbounds i64, ptr addrspace(1) %a, i64 1

  %ld.0 = load ptr addrspace(3), ptr addrspace(1) %a
  %ld.1 = load i64, ptr addrspace(1) %a.1

  ret void
}

; Two adjacent <2 x ptr addrspace(1)> loads and stores. The patterns below pin
; the current output, in which both loads and both stores stay separate.
; NOTE(review): the cause of the non-merge is not visible in this file.
; XXX - This isn't merged for some reason
; CHECK-LABEL: @merge_v2p1i8_v2p1i8(
; CHECK: load <2 x ptr addrspace(1)>
; CHECK: load <2 x ptr addrspace(1)>
; CHECK: store <2 x ptr addrspace(1)>
; CHECK: store <2 x ptr addrspace(1)>
define amdgpu_kernel void @merge_v2p1i8_v2p1i8(ptr addrspace(1) nocapture noalias %a, ptr addrspace(1) nocapture readonly noalias %b) #0 {
entry:
  %a.1 = getelementptr inbounds <2 x ptr addrspace(1)>, ptr addrspace(1) %a, i64 1
  %b.1 = getelementptr inbounds <2 x ptr addrspace(1)>, ptr addrspace(1) %b, i64 1

  %ld.c = load <2 x ptr addrspace(1)>, ptr addrspace(1) %b, align 4
  %ld.c.idx.1 = load <2 x ptr addrspace(1)>, ptr addrspace(1) %b.1, align 4

  store <2 x ptr addrspace(1)> zeroinitializer, ptr addrspace(1) %a, align 4
  store <2 x ptr addrspace(1)> zeroinitializer, ptr addrspace(1) %a.1, align 4
  ret void
}

; 64-bit pointer load followed by a double load. Expected output: one
; <2 x i64> load; element 0 goes through inttoptr, element 1 through bitcast.
; CHECK-LABEL: @merge_load_ptr64_f64(
; CHECK: load <2 x i64>
; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
; CHECK: [[ELT0_INT:%[^ ]+]] = inttoptr i64 [[ELT0]] to ptr addrspace(1)
; CHECK: [[ELT1_INT:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
; CHECK: bitcast i64 [[ELT1_INT]] to double
define amdgpu_kernel void @merge_load_ptr64_f64(ptr addrspace(1) nocapture %a) #0 {
entry:
  %a.1 = getelementptr inbounds double, ptr addrspace(1) %a, i64 1

  %ld.0 = load ptr addrspace(1), ptr addrspace(1) %a
  %ld.1 = load double, ptr addrspace(1) %a.1

  ret void
}

; Double load followed by a 64-bit pointer load. Expected output: one
; <2 x i64> load; element 0 goes through bitcast, element 1 through inttoptr.
; CHECK-LABEL: @merge_load_f64_ptr64(
; CHECK: load <2 x i64>
; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
; CHECK: bitcast i64 [[ELT0]] to double
; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
; CHECK: inttoptr i64 [[ELT1]] to ptr addrspace(1)
define amdgpu_kernel void @merge_load_f64_ptr64(ptr addrspace(1) nocapture %a) #0 {
entry:
  %a.1 = getelementptr inbounds double, ptr addrspace(1) %a, i64 1

  %ld.0 = load double, ptr addrspace(1) %a
  %ld.1 = load ptr addrspace(1), ptr addrspace(1) %a.1

  ret void
}

; 64-bit pointer store followed by a double store. Expected output: %ptr0 via
; ptrtoint as element 0, %val1 via bitcast as element 1, one <2 x i64> store.
; CHECK-LABEL: @merge_store_ptr64_f64(
; CHECK: [[ELT0_INT:%[^ ]+]] = ptrtoint ptr addrspace(1) %ptr0 to i64
; CHECK: insertelement <2 x i64> poison, i64 [[ELT0_INT]], i32 0
; CHECK: [[ELT1_INT:%[^ ]+]] = bitcast double %val1 to i64
; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1
; CHECK: store <2 x i64>
define amdgpu_kernel void @merge_store_ptr64_f64(ptr addrspace(1) nocapture %a, ptr addrspace(1) %ptr0, double %val1) #0 {
entry:
  %a.1 = getelementptr inbounds double, ptr addrspace(1) %a, i64 1

  store ptr addrspace(1) %ptr0, ptr addrspace(1) %a
  store double %val1, ptr addrspace(1) %a.1

  ret void
}

; Double store followed by a 64-bit pointer store. Expected output: %val0 via
; bitcast as element 0, %ptr1 via ptrtoint as element 1, one <2 x i64> store.
; CHECK-LABEL: @merge_store_f64_ptr64(
; CHECK: [[ELT0_INT:%[^ ]+]] = bitcast double %val0 to i64
; CHECK: insertelement <2 x i64> poison, i64 [[ELT0_INT]], i32 0
; CHECK: [[ELT1_INT:%[^ ]+]] = ptrtoint ptr addrspace(1) %ptr1 to i64
; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1
; CHECK: store <2 x i64>
define amdgpu_kernel void @merge_store_f64_ptr64(ptr addrspace(1) nocapture %a, double %val0, ptr addrspace(1) %ptr1) #0 {
entry:
  %a.1 = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %a, i64 1

  store double %val0, ptr addrspace(1) %a
  store ptr addrspace(1) %ptr1, ptr addrspace(1) %a.1

  ret void
}

; #0 is attached to every kernel above; #1 only to the intrinsic declaration.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }