; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s

; Each kernel below performs two scalar loads and two scalar stores whose
; addresses differ by exactly one element, with the 64-bit GEP index produced
; by sign- or zero-extending a 32-bit value. The expectations verify that the
; load-store-vectorizer still recognizes the accesses as adjacent and merges
; each pair into a single two-element vector access.

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

declare i32 @llvm.amdgcn.workitem.id.x() #1

; Simplest sext case: one sign-extended workitem id indexes both %a and %c,
; and the second access of each pair is a constant "+1 element" GEP on top of
; the first address.
; CHECK-LABEL: @basic_merge_sext_index(
; CHECK: sext i32 %id.x to i64
; CHECK: load <2 x float>
; CHECK: store <2 x float> zeroinitializer
define amdgpu_kernel void @basic_merge_sext_index(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture %b, ptr addrspace(1) nocapture readonly %c) #0 {
entry:
  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
  %sext.id.x = sext i32 %id.x to i64
  %a.idx.x = getelementptr inbounds float, ptr addrspace(1) %a, i64 %sext.id.x
  %c.idx.x = getelementptr inbounds float, ptr addrspace(1) %c, i64 %sext.id.x
  %a.idx.x.1 = getelementptr inbounds float, ptr addrspace(1) %a.idx.x, i64 1
  %c.idx.x.1 = getelementptr inbounds float, ptr addrspace(1) %c.idx.x, i64 1

  %ld.c = load float, ptr addrspace(1) %c.idx.x, align 4
  %ld.c.idx.1 = load float, ptr addrspace(1) %c.idx.x.1, align 4

  store float 0.0, ptr addrspace(1) %a.idx.x, align 4
  store float 0.0, ptr addrspace(1) %a.idx.x.1, align 4

  %add = fadd float %ld.c, %ld.c.idx.1
  store float %add, ptr addrspace(1) %b, align 4
  ret void
}

; Same shape as @basic_merge_sext_index, but the shared index is
; zero-extended instead of sign-extended.
; CHECK-LABEL: @basic_merge_zext_index(
; CHECK: zext i32 %id.x to i64
; CHECK: load <2 x float>
; CHECK: store <2 x float>
define amdgpu_kernel void @basic_merge_zext_index(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture %b, ptr addrspace(1) nocapture readonly %c) #0 {
entry:
  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
  %zext.id.x = zext i32 %id.x to i64
  %a.idx.x = getelementptr inbounds float, ptr addrspace(1) %a, i64 %zext.id.x
  %c.idx.x = getelementptr inbounds float, ptr addrspace(1) %c, i64 %zext.id.x
  %a.idx.x.1 = getelementptr inbounds float, ptr addrspace(1) %a.idx.x, i64 1
  %c.idx.x.1 = getelementptr inbounds float, ptr addrspace(1) %c.idx.x, i64 1

  %ld.c = load float, ptr addrspace(1) %c.idx.x, align 4
  %ld.c.idx.1 = load float, ptr addrspace(1) %c.idx.x.1, align 4
  store float 0.0, ptr addrspace(1) %a.idx.x, align 4
  store float 0.0, ptr addrspace(1) %a.idx.x.1, align 4

  %add = fadd float %ld.c, %ld.c.idx.1
  store float %add, ptr addrspace(1) %b, align 4
  ret void
}

; Here the "+1" is applied in 32 bits *before* the zext, so the two GEPs use
; two distinct extended indices. The shl leaves the low bits of %shl clear,
; which is what makes the disjoint or with 1 an increment by one. Loads and
; stores are interleaved; the pointer arguments are noalias.
; CHECK-LABEL: @merge_op_zext_index(
; CHECK: load <2 x float>
; CHECK: store <2 x float>
define amdgpu_kernel void @merge_op_zext_index(ptr addrspace(1) nocapture noalias %a, ptr addrspace(1) nocapture noalias %b, ptr addrspace(1) nocapture readonly noalias %c) #0 {
entry:
  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
  %shl = shl i32 %id.x, 2
  %zext.id.x = zext i32 %shl to i64
  %a.0 = getelementptr inbounds float, ptr addrspace(1) %a, i64 %zext.id.x
  %c.0 = getelementptr inbounds float, ptr addrspace(1) %c, i64 %zext.id.x

  %id.x.1 = or disjoint i32 %shl, 1
  %id.x.1.ext = zext i32 %id.x.1 to i64

  %a.1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 %id.x.1.ext
  %c.1 = getelementptr inbounds float, ptr addrspace(1) %c, i64 %id.x.1.ext

  %ld.c.0 = load float, ptr addrspace(1) %c.0, align 4
  store float 0.0, ptr addrspace(1) %a.0, align 4
  %ld.c.1 = load float, ptr addrspace(1) %c.1, align 4
  store float 0.0, ptr addrspace(1) %a.1, align 4

  %add = fadd float %ld.c.0, %ld.c.1
  store float %add, ptr addrspace(1) %b, align 4
  ret void
}

; Sign-extending counterpart of @merge_op_zext_index: the "+1" is applied in
; 32 bits and both indices are then widened with sext.
; CHECK-LABEL: @merge_op_sext_index(
; CHECK: load <2 x float>
; CHECK: store <2 x float>
define amdgpu_kernel void @merge_op_sext_index(ptr addrspace(1) nocapture noalias %a, ptr addrspace(1) nocapture noalias %b, ptr addrspace(1) nocapture readonly noalias %c) #0 {
entry:
  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
  %shl = shl i32 %id.x, 2
  %sext.id.x = sext i32 %shl to i64
  %a.0 = getelementptr inbounds float, ptr addrspace(1) %a, i64 %sext.id.x
  %c.0 = getelementptr inbounds float, ptr addrspace(1) %c, i64 %sext.id.x

  %id.x.1 = or disjoint i32 %shl, 1
  %id.x.1.ext = sext i32 %id.x.1 to i64

  %a.1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 %id.x.1.ext
  %c.1 = getelementptr inbounds float, ptr addrspace(1) %c, i64 %id.x.1.ext

  %ld.c.0 = load float, ptr addrspace(1) %c.0, align 4
  store float 0.0, ptr addrspace(1) %a.0, align 4
  %ld.c.1 = load float, ptr addrspace(1) %c.1, align 4
  store float 0.0, ptr addrspace(1) %a.1, align 4

  %add = fadd float %ld.c.0, %ld.c.1
  store float %add, ptr addrspace(1) %b, align 4
  ret void
}

; This case fails to vectorize if not using the extra extension
; handling in isConsecutiveAccess.
;
; The index comes from a 64-bit loop induction variable that is truncated to
; i32, shifted, and zero-extended back to i64; the neighbouring index is
; formed with a disjoint or in 32 bits before its own zext. The merged
; accesses are expected inside the loop block. The %arst and %aoeu arguments
; are not referenced by the body.

; CHECK-LABEL: @zext_trunc_phi_1(
; CHECK: loop:
; CHECK: load <2 x i32>
; CHECK: store <2 x i32>
define amdgpu_kernel void @zext_trunc_phi_1(ptr addrspace(1) nocapture noalias %a, ptr addrspace(1) nocapture noalias %b, ptr addrspace(1) nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 {
entry:
  %cmp0 = icmp eq i32 %n, 0
  br i1 %cmp0, label %exit, label %loop

loop:
  %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ]
  %trunc.iv = trunc i64 %indvars.iv to i32
  %idx = shl i32 %trunc.iv, 4

  %idx.ext = zext i32 %idx to i64
  %c.0 = getelementptr inbounds i32, ptr addrspace(1) %c, i64 %idx.ext
  %a.0 = getelementptr inbounds i32, ptr addrspace(1) %a, i64 %idx.ext

  %idx.1 = or disjoint i32 %idx, 1
  %idx.1.ext = zext i32 %idx.1 to i64
  %c.1 = getelementptr inbounds i32, ptr addrspace(1) %c, i64 %idx.1.ext
  %a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i64 %idx.1.ext

  %ld.c.0 = load i32, ptr addrspace(1) %c.0, align 4
  store i32 %ld.c.0, ptr addrspace(1) %a.0, align 4
  %ld.c.1 = load i32, ptr addrspace(1) %c.1, align 4
  store i32 %ld.c.1, ptr addrspace(1) %a.1, align 4

  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32

  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %exit, label %loop

exit:
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }