xref: /llvm-project/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll (revision 1833d418a04123916c1dbeb0c41c8bc7d06b779b)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake-avx512 | FileCheck %s
3
; Aggregate shared by the tests in this file: three i32 arrays of 8, 8 and 16
; elements. @_Z4testP1S stores to field 0 and loads from fields 1 and 2; the
; other tests only use it to index field 0 of %p for their stores.
4%struct.S = type { [8 x i32], [8 x i32], [16 x i32] }
5
; @_Z4testP1S: eight scalar "load, load, add nsw, store" chains over one %struct.S.
; For each k in 0..7 the input computes field0[k] = field2[j] + field1[k]. The
; field-1 loads and field-0 stores walk consecutive elements, while the field-2
; index j runs 15, 7, 6, 4, 12, 13, 14, 5.
; The autogenerated assertions below expect a single <8 x i32> add and store:
; field 1 is read with one <8 x i32> load, and field 2 with a <4 x i32> load at
; element 4 plus <2 x i32> loads at elements 12 and 14, which are then combined
; through shufflevector and llvm.vector.insert into the lane order listed above.
6define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr {
7; CHECK-LABEL: @_Z4testP1S(
8; CHECK-NEXT:  entry:
9; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0
10; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4
11; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12
12; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14
13; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4
14; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4
15; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4
16; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX20]], align 4
17; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
18; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP3]], <8 x i32> <i32 1, i32 7, i32 6, i32 4, i32 poison, i32 poison, i32 0, i32 5>
19; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP0]], i64 4)
20; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP2]]
21; CHECK-NEXT:    store <8 x i32> [[TMP7]], ptr [[P]], align 4
22; CHECK-NEXT:    ret void
23;
24entry:
  ; Lane 0: field1[0] + field2[15], stored through %p itself (field 0, element 0).
25  %arrayidx = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 0
26  %i = load i32, ptr %arrayidx, align 4
27  %arrayidx1 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 15
28  %i1 = load i32, ptr %arrayidx1, align 4
29  %add = add nsw i32 %i1, %i
30  store i32 %add, ptr %p, align 4
  ; Lane 1: field1[1] + field2[7] -> field0[1].
31  %arrayidx4 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 1
32  %i2 = load i32, ptr %arrayidx4, align 4
33  %arrayidx6 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 7
34  %i3 = load i32, ptr %arrayidx6, align 4
35  %add7 = add nsw i32 %i3, %i2
36  %arrayidx9 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 1
37  store i32 %add7, ptr %arrayidx9, align 4
  ; Lane 2: field1[2] + field2[6] -> field0[2].
38  %arrayidx11 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 2
39  %i4 = load i32, ptr %arrayidx11, align 4
40  %arrayidx13 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 6
41  %i5 = load i32, ptr %arrayidx13, align 4
42  %add14 = add nsw i32 %i5, %i4
43  %arrayidx16 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 2
44  store i32 %add14, ptr %arrayidx16, align 4
  ; Lane 3: field1[3] + field2[4] -> field0[3].
45  %arrayidx18 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 3
46  %i6 = load i32, ptr %arrayidx18, align 4
47  %arrayidx20 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 4
48  %i7 = load i32, ptr %arrayidx20, align 4
49  %add21 = add nsw i32 %i7, %i6
50  %arrayidx23 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 3
51  store i32 %add21, ptr %arrayidx23, align 4
  ; Lane 4: field1[4] + field2[12] -> field0[4].
52  %arrayidx25 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 4
53  %i8 = load i32, ptr %arrayidx25, align 4
54  %arrayidx27 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 12
55  %i9 = load i32, ptr %arrayidx27, align 4
56  %add28 = add nsw i32 %i9, %i8
57  %arrayidx30 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 4
58  store i32 %add28, ptr %arrayidx30, align 4
  ; Lane 5: field1[5] + field2[13] -> field0[5].
59  %arrayidx32 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 5
60  %i10 = load i32, ptr %arrayidx32, align 4
61  %arrayidx34 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 13
62  %i11 = load i32, ptr %arrayidx34, align 4
63  %add35 = add nsw i32 %i11, %i10
64  %arrayidx37 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 5
65  store i32 %add35, ptr %arrayidx37, align 4
  ; Lane 6: field1[6] + field2[14] -> field0[6].
66  %arrayidx39 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 6
67  %i12 = load i32, ptr %arrayidx39, align 4
68  %arrayidx41 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 14
69  %i13 = load i32, ptr %arrayidx41, align 4
70  %add42 = add nsw i32 %i13, %i12
71  %arrayidx44 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 6
72  store i32 %add42, ptr %arrayidx44, align 4
  ; Lane 7: field1[7] + field2[5] -> field0[7].
73  %arrayidx46 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 7
74  %i14 = load i32, ptr %arrayidx46, align 4
75  %arrayidx48 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 5
76  %i15 = load i32, ptr %arrayidx48, align 4
77  %add49 = add nsw i32 %i15, %i14
78  %arrayidx51 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 7
79  store i32 %add49, ptr %arrayidx51, align 4
80  ret void
81}
82
83; Test for 2 load groups of 4 elements each, loaded from different base pointers.
84; Neither loaded group is in order, so there are a few specific points to check:
85; (1) these groups are detected, (2) reordering shuffles are generated, and
86; (3) these loads are vectorized as part of a tree that is seeded by stores
87; and with VF=8.
88
; @test_unordered_splits: copies eight i32 values from two local arrays into
; eight consecutive i32 slots starting at %p.
; Store lanes 0-3 take %p1 elements 5, 4, 6, 7; lanes 4-7 take %p2 elements
; 15, 13, 14, 12. Each group covers four consecutive elements of its array,
; but not in ascending order.
; The assertions below expect one <4 x i32> load per array (from element 4 of
; %p1 and element 12 of %p2), both placed into an <8 x i32> with
; llvm.vector.insert, one shufflevector with mask <1,0,2,3,7,5,6,4> restoring
; the lane order above, and a single <8 x i32> store.
; Note: neither alloca is written before it is read in this function.
89define dso_local void @test_unordered_splits(ptr nocapture %p) local_unnamed_addr {
90; CHECK-LABEL: @test_unordered_splits(
91; CHECK-NEXT:  entry:
92; CHECK-NEXT:    [[P1:%.*]] = alloca [16 x i32], align 16
93; CHECK-NEXT:    [[P2:%.*]] = alloca [16 x i32], align 16
94; CHECK-NEXT:    [[G10:%.*]] = getelementptr inbounds [16 x i32], ptr [[P1]], i32 0, i64 4
95; CHECK-NEXT:    [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 12
96; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[G10]], align 4
97; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[G20]], align 4
98; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP0]], i64 0)
99; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP2]], <4 x i32> [[TMP1]], i64 4)
100; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
101; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr [[P:%.*]], align 4
102; CHECK-NEXT:    ret void
103;
104entry:
105  %p1 = alloca [16 x i32], align 16
106  %p2 = alloca [16 x i32], align 16
  ; Element pointers: %g10..%g13 address %p1[4..7], %g20..%g23 address %p2[12..15].
107  %g10 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 4
108  %g11 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 5
109  %g12 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 6
110  %g13 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 7
111  %g20 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 12
112  %g21 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 13
113  %g22 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 14
114  %g23 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 15
  ; Lanes 0-3: %p1 elements 5, 4, 6, 7 (the first two are swapped relative to memory order).
115  %i1 = load i32, ptr %g11, align 4
116  store i32 %i1, ptr %p, align 4
117  %i3 = load i32, ptr %g10, align 4
118  %arrayidx9 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 1
119  store i32 %i3, ptr %arrayidx9, align 4
120  %i5 = load i32, ptr %g12, align 4
121  %arrayidx16 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 2
122  store i32 %i5, ptr %arrayidx16, align 4
123  %i7 = load i32, ptr %g13, align 4
124  %arrayidx23 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 3
125  store i32 %i7, ptr %arrayidx23, align 4
  ; Lanes 4-7: %p2 elements 15, 13, 14, 12 (first and last swapped relative to memory order).
126  %i9 = load i32, ptr %g23, align 4
127  %arrayidx30 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 4
128  store i32 %i9, ptr %arrayidx30, align 4
129  %i11 = load i32, ptr %g21, align 4
130  %arrayidx37 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 5
131  store i32 %i11, ptr %arrayidx37, align 4
132  %i13 = load i32, ptr %g22, align 4
133  %arrayidx44 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 6
134  store i32 %i13, ptr %arrayidx44, align 4
135  %i15 = load i32, ptr %g20, align 4
136  %arrayidx51 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 7
137  store i32 %i15, ptr %arrayidx51, align 4
138  ret void
139}
140
; @test_cost_splits: copies eight i32 values from four local arrays into eight
; consecutive i32 slots starting at %p.
; Each array contributes two adjacent elements, already in ascending order:
; %p1[4..5] -> lanes 0-1, %p2[6..7] -> lanes 2-3, %p3[12..13] -> lanes 4-5,
; %p4[14..15] -> lanes 6-7.
; The assertions below expect four <2 x i32> loads, placed at offsets 0, 2, 4
; and 6 of an <8 x i32> with llvm.vector.insert (no shufflevector), and a
; single <8 x i32> store.
; Note: none of the allocas is written before it is read in this function.
141define dso_local void @test_cost_splits(ptr nocapture %p) local_unnamed_addr {
142; CHECK-LABEL: @test_cost_splits(
143; CHECK-NEXT:  entry:
144; CHECK-NEXT:    [[P1:%.*]] = alloca [16 x i32], align 16
145; CHECK-NEXT:    [[P2:%.*]] = alloca [16 x i32], align 16
146; CHECK-NEXT:    [[P3:%.*]] = alloca [16 x i32], align 16
147; CHECK-NEXT:    [[P4:%.*]] = alloca [16 x i32], align 16
148; CHECK-NEXT:    [[G10:%.*]] = getelementptr inbounds [16 x i32], ptr [[P1]], i32 0, i64 4
149; CHECK-NEXT:    [[G12:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 6
150; CHECK-NEXT:    [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P3]], i32 0, i64 12
151; CHECK-NEXT:    [[G22:%.*]] = getelementptr inbounds [16 x i32], ptr [[P4]], i32 0, i64 14
152; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[G10]], align 4
153; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[G12]], align 4
154; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[G20]], align 4
155; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[G22]], align 4
156; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> poison, <2 x i32> [[TMP0]], i64 0)
157; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP4]], <2 x i32> [[TMP1]], i64 2)
158; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 4)
159; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 6)
160; CHECK-NEXT:    store <8 x i32> [[TMP7]], ptr [[P:%.*]], align 4
161; CHECK-NEXT:    ret void
162;
163entry:
164  %p1 = alloca [16 x i32], align 16
165  %p2 = alloca [16 x i32], align 16
166  %p3 = alloca [16 x i32], align 16
167  %p4 = alloca [16 x i32], align 16
  ; Two adjacent element pointers per array: %p1[4..5], %p2[6..7], %p3[12..13], %p4[14..15].
168  %g10 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 4
169  %g11 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 5
170  %g12 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 6
171  %g13 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 7
172  %g20 = getelementptr inbounds [16 x i32], ptr %p3, i32 0, i64 12
173  %g21 = getelementptr inbounds [16 x i32], ptr %p3, i32 0, i64 13
174  %g22 = getelementptr inbounds [16 x i32], ptr %p4, i32 0, i64 14
175  %g23 = getelementptr inbounds [16 x i32], ptr %p4, i32 0, i64 15
  ; Lanes 0-1 from %p1, lanes 2-3 from %p2.
176  %i1 = load i32, ptr %g10, align 4
177  store i32 %i1, ptr %p, align 4
178  %i3 = load i32, ptr %g11, align 4
179  %arrayidx9 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 1
180  store i32 %i3, ptr %arrayidx9, align 4
181  %i5 = load i32, ptr %g12, align 4
182  %arrayidx16 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 2
183  store i32 %i5, ptr %arrayidx16, align 4
184  %i7 = load i32, ptr %g13, align 4
185  %arrayidx23 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 3
186  store i32 %i7, ptr %arrayidx23, align 4
  ; Lanes 4-5 from %p3, lanes 6-7 from %p4.
187  %i9 = load i32, ptr %g20, align 4
188  %arrayidx30 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 4
189  store i32 %i9, ptr %arrayidx30, align 4
190  %i11 = load i32, ptr %g21, align 4
191  %arrayidx37 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 5
192  store i32 %i11, ptr %arrayidx37, align 4
193  %i13 = load i32, ptr %g22, align 4
194  %arrayidx44 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 6
195  store i32 %i13, ptr %arrayidx44, align 4
196  %i15 = load i32, ptr %g23, align 4
197  %arrayidx51 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 7
198  store i32 %i15, ptr %arrayidx51, align 4
199  ret void
200}
201