xref: /llvm-project/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll (revision eecb99c5f66c8491766628a2925587e20f3b1dbd)
; Test driver: both opt invocations below run the load-store-vectorizer pass
; for the amdgcn-amd-amdhsa triple and pipe the textual IR to FileCheck. The
; second invocation additionally selects basic-aa as the alias-analysis
; pipeline and spells the pass as function(load-store-vectorizer).
1; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
2; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
3
; Module-wide data layout string used by every function in this file.
4target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
5
; Intrinsic whose i32 result is the starting index value in the
; basic_merge_* and merge_op_* kernels below.
6declare i32 @llvm.amdgcn.workitem.id.x() #1
7
; basic_merge_sext_index: the element index is the workitem id sign-extended
; to i64. The second element of each pair is addressed by a further GEP of +1
; off the first element's pointer, so the two float loads from %c and the two
; float stores of 0.0 to %a each touch adjacent elements. The directives
; expect the sext to remain, one <2 x float> load, and one <2 x float>
; zeroinitializer store.
8; CHECK-LABEL: @basic_merge_sext_index(
9; CHECK: sext i32 %id.x to i64
10; CHECK: load <2 x float>
11; CHECK: store <2 x float> zeroinitializer
12define amdgpu_kernel void @basic_merge_sext_index(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture %b, ptr addrspace(1) nocapture readonly %c) #0 {
13entry:
  ; Shared 64-bit index derived from the 32-bit workitem id.
14  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
15  %sext.id.x = sext i32 %id.x to i64
  ; Element [idx] of %a and %c, then element [idx + 1] via a constant GEP.
16  %a.idx.x = getelementptr inbounds float, ptr addrspace(1) %a, i64 %sext.id.x
17  %c.idx.x = getelementptr inbounds float, ptr addrspace(1) %c, i64 %sext.id.x
18  %a.idx.x.1 = getelementptr inbounds float, ptr addrspace(1) %a.idx.x, i64 1
19  %c.idx.x.1 = getelementptr inbounds float, ptr addrspace(1) %c.idx.x, i64 1
20
  ; Both loads are issued before either store; the pointer arguments carry no
  ; noalias attribute in this kernel.
21  %ld.c = load float, ptr addrspace(1) %c.idx.x, align 4
22  %ld.c.idx.1 = load float, ptr addrspace(1) %c.idx.x.1, align 4
23
24  store float 0.0, ptr addrspace(1) %a.idx.x, align 4
25  store float 0.0, ptr addrspace(1) %a.idx.x.1, align 4
26
  ; Use both loaded values so the loads stay live.
27  %add = fadd float %ld.c, %ld.c.idx.1
28  store float %add, ptr addrspace(1) %b, align 4
29  ret void
30}
31
; basic_merge_zext_index: same shape as basic_merge_sext_index, except that
; the workitem id is zero-extended to i64. The directives expect the zext to
; remain, one <2 x float> load, and one <2 x float> store.
32; CHECK-LABEL: @basic_merge_zext_index(
33; CHECK: zext i32 %id.x to i64
34; CHECK: load <2 x float>
35; CHECK: store <2 x float>
36define amdgpu_kernel void @basic_merge_zext_index(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture %b, ptr addrspace(1) nocapture readonly %c) #0 {
37entry:
  ; Shared 64-bit index derived from the 32-bit workitem id.
38  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
39  %zext.id.x = zext i32 %id.x to i64
  ; Element [idx] of %a and %c, then element [idx + 1] via a constant GEP.
40  %a.idx.x = getelementptr inbounds float, ptr addrspace(1) %a, i64 %zext.id.x
41  %c.idx.x = getelementptr inbounds float, ptr addrspace(1) %c, i64 %zext.id.x
42  %a.idx.x.1 = getelementptr inbounds float, ptr addrspace(1) %a.idx.x, i64 1
43  %c.idx.x.1 = getelementptr inbounds float, ptr addrspace(1) %c.idx.x, i64 1
44
  ; Both loads are issued before either store; the pointer arguments carry no
  ; noalias attribute in this kernel.
45  %ld.c = load float, ptr addrspace(1) %c.idx.x, align 4
46  %ld.c.idx.1 = load float, ptr addrspace(1) %c.idx.x.1, align 4
47  store float 0.0, ptr addrspace(1) %a.idx.x, align 4
48  store float 0.0, ptr addrspace(1) %a.idx.x.1, align 4
49
  ; Use both loaded values so the loads stay live.
50  %add = fadd float %ld.c, %ld.c.idx.1
51  store float %add, ptr addrspace(1) %b, align 4
52  ret void
53}
54
; merge_op_zext_index: here the +1 offset is applied in i32, before the
; extension. The first index is zext(id << 2) and the second is
; zext((id << 2) | 1), so the two addresses are formed by independent GEPs
; off the same base rather than by a constant GEP of +1. All three pointer
; arguments are noalias, and the loads and stores are interleaved. The
; directives expect one <2 x float> load and one <2 x float> store.
55; CHECK-LABEL: @merge_op_zext_index(
56; CHECK: load <2 x float>
57; CHECK: store <2 x float>
58define amdgpu_kernel void @merge_op_zext_index(ptr addrspace(1) nocapture noalias %a, ptr addrspace(1) nocapture noalias %b, ptr addrspace(1) nocapture readonly noalias %c) #0 {
59entry:
  ; First index: workitem id shifted left by 2, then zero-extended.
60  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
61  %shl = shl i32 %id.x, 2
62  %zext.id.x = zext i32 %shl to i64
63  %a.0 = getelementptr inbounds float, ptr addrspace(1) %a, i64 %zext.id.x
64  %c.0 = getelementptr inbounds float, ptr addrspace(1) %c, i64 %zext.id.x
65
  ; Second index: the shl leaves the low two bits clear, so the disjoint or
  ; with 1 yields %shl + 1 in i32; it is then zero-extended separately.
66  %id.x.1 = or disjoint i32 %shl, 1
67  %id.x.1.ext = zext i32 %id.x.1 to i64
68
69  %a.1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 %id.x.1.ext
70  %c.1 = getelementptr inbounds float, ptr addrspace(1) %c, i64 %id.x.1.ext
71
  ; Load/store pairs are interleaved: load 0, store 0, load 1, store 1.
72  %ld.c.0 = load float, ptr addrspace(1) %c.0, align 4
73  store float 0.0, ptr addrspace(1) %a.0, align 4
74  %ld.c.1 = load float, ptr addrspace(1) %c.1, align 4
75  store float 0.0, ptr addrspace(1) %a.1, align 4
76
  ; Use both loaded values so the loads stay live.
77  %add = fadd float %ld.c.0, %ld.c.1
78  store float %add, ptr addrspace(1) %b, align 4
79  ret void
80}
81
; merge_op_sext_index: sign-extending variant of merge_op_zext_index. The
; first index is sext(id << 2) and the second is sext((id << 2) | 1), each
; extended separately and fed to its own GEP off the same base. All three
; pointer arguments are noalias, and the loads and stores are interleaved.
; The directives expect one <2 x float> load and one <2 x float> store.
82; CHECK-LABEL: @merge_op_sext_index(
83; CHECK: load <2 x float>
84; CHECK: store <2 x float>
85define amdgpu_kernel void @merge_op_sext_index(ptr addrspace(1) nocapture noalias %a, ptr addrspace(1) nocapture noalias %b, ptr addrspace(1) nocapture readonly noalias %c) #0 {
86entry:
  ; First index: workitem id shifted left by 2, then sign-extended. Despite
  ; its name, %zext.id.x is produced by a sext in this function.
87  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
88  %shl = shl i32 %id.x, 2
89  %zext.id.x = sext i32 %shl to i64
90  %a.0 = getelementptr inbounds float, ptr addrspace(1) %a, i64 %zext.id.x
91  %c.0 = getelementptr inbounds float, ptr addrspace(1) %c, i64 %zext.id.x
92
  ; Second index: the shl leaves the low two bits clear, so the disjoint or
  ; with 1 yields %shl + 1 in i32; it is then sign-extended separately.
93  %id.x.1 = or disjoint i32 %shl, 1
94  %id.x.1.ext = sext i32 %id.x.1 to i64
95
96  %a.1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 %id.x.1.ext
97  %c.1 = getelementptr inbounds float, ptr addrspace(1) %c, i64 %id.x.1.ext
98
  ; Load/store pairs are interleaved: load 0, store 0, load 1, store 1.
99  %ld.c.0 = load float, ptr addrspace(1) %c.0, align 4
100  store float 0.0, ptr addrspace(1) %a.0, align 4
101  %ld.c.1 = load float, ptr addrspace(1) %c.1, align 4
102  store float 0.0, ptr addrspace(1) %a.1, align 4
103
  ; Use both loaded values so the loads stay live.
104  %add = fadd float %ld.c.0, %ld.c.1
105  store float %add, ptr addrspace(1) %b, align 4
106  ret void
107}
108
109; This case fails to vectorize if not using the extra extension
110; handling in isConsecutiveAccess.
111
; zext_trunc_phi_1: loop whose i64 induction variable is truncated to i32,
; shifted left by 4, and zero-extended again to form the first index; the
; second index is the zero-extension of that i32 value or'ed with 1. Each
; iteration copies two adjacent i32 elements from %c to %a. The directives
; expect, inside the loop block, one <2 x i32> load and one <2 x i32> store.
; The %arst and %aoeu parameters are not referenced in the body.
112; CHECK-LABEL: @zext_trunc_phi_1(
113; CHECK: loop:
114; CHECK: load <2 x i32>
115; CHECK: store <2 x i32>
116define amdgpu_kernel void @zext_trunc_phi_1(ptr addrspace(1) nocapture noalias %a, ptr addrspace(1) nocapture noalias %b, ptr addrspace(1) nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 {
117entry:
  ; Skip the loop entirely when the trip count is zero.
118  %cmp0 = icmp eq i32 %n, 0
119  br i1 %cmp0, label %exit, label %loop
120
121loop:
  ; 64-bit induction variable starting at 0, narrowed to i32 and scaled by 16.
122  %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ]
123  %trunc.iv = trunc i64 %indvars.iv to i32
124  %idx = shl i32 %trunc.iv, 4
125
  ; First element: zext of the scaled i32 index.
126  %idx.ext = zext i32 %idx to i64
127  %c.0 = getelementptr inbounds i32, ptr addrspace(1) %c, i64 %idx.ext
128  %a.0 = getelementptr inbounds i32, ptr addrspace(1) %a, i64 %idx.ext
129
  ; Second element: the shl leaves the low four bits clear, so the disjoint
  ; or with 1 yields %idx + 1 in i32; it is zero-extended separately.
130  %idx.1 = or disjoint i32 %idx, 1
131  %idx.1.ext = zext i32 %idx.1 to i64
132  %c.1 = getelementptr inbounds i32, ptr addrspace(1) %c, i64 %idx.1.ext
133  %a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i64 %idx.1.ext
134
  ; Copy both elements; loads and stores are interleaved.
135  %ld.c.0 = load i32, ptr addrspace(1) %c.0, align 4
136  store i32 %ld.c.0, ptr addrspace(1) %a.0, align 4
137  %ld.c.1 = load i32, ptr addrspace(1) %c.1, align 4
138  store i32 %ld.c.1, ptr addrspace(1) %a.1, align 4
139
  ; Advance the induction variable; the exit test compares its truncated
  ; value against %n.
140  %indvars.iv.next = add i64 %indvars.iv, 1
141  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
142
143  %exitcond = icmp eq i32 %lftr.wideiv, %n
144  br i1 %exitcond, label %exit, label %loop
145
146exit:
147  ret void
148}
149
; Attribute groups: #0 is attached to every kernel definition in this file,
; #1 to the llvm.amdgcn.workitem.id.x declaration.
150attributes #0 = { nounwind }
151attributes #1 = { nounwind readnone }
152