xref: /llvm-project/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll (revision 2b00a73f62605fcaeaedd358ba8b55fad06571aa)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
3
4; Make sure that we rotate the graph to help avoid the shuffle to
5; the external vectorizable stores.
6;
7; SLP starts vectorizing from the operands of the `fcmp` in bb2, then crosses
8; into bb1, vectorizing all the way to the broadcast load at the top.
9; The stores in bb1 are external to this tree, but they are vectorizable and are
10; in reverse order.
; Expected result (see the autogenerated assertions below): both fadd/fmul
; chains become <2 x double> operations whose lanes are ordered
; <%mul2 chain, %mul1 chain> (constants <2.2, 1.1>), so the values are written
; to %A by one vector store with no shufflevector in front of it.
; The %ptr argument is not used by the body.
11define void @rotate_with_external_users(ptr %A, ptr %ptr) {
12; CHECK-LABEL: @rotate_with_external_users(
13; CHECK-NEXT:  bb1:
14; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
15; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
16; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
17; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[SHUFFLE]], <double 2.200000e+00, double 1.100000e+00>
18; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 2.200000e+00, double 1.100000e+00>
19; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[A:%.*]], align 8
20; CHECK-NEXT:    br label [[BB2:%.*]]
21; CHECK:       bb2:
22; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], <double 4.400000e+00, double 3.300000e+00>
23; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
24; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
25; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP6]], [[TMP5]]
26; CHECK-NEXT:    ret void
27;
28bb1:
  ; One scalar load feeds both chains; the expected output splats it into a
  ; <2 x double> (insertelement + shufflevector with a zero mask).
29  %ld = load double, ptr undef
30
  ; Two parallel fadd -> fmul chains, one per vector lane.
31  %add1 = fadd double %ld, 1.1
32  %add2 = fadd double %ld, 2.2
33
34  %mul1 = fmul double %add1, 1.1
35  %mul2 = fmul double %add2, 2.2
36
37  ; These are external vectorizable stores with operands in reverse order.
  ; %mul2 goes to %A[0] and %mul1 goes to %A[1].
38  %ptrA2 = getelementptr inbounds double, ptr %A, i64 1
39  store double %mul2, ptr %A
40  store double %mul1, ptr %ptrA2
41  br label %bb2
42
43bb2:
  ; The operands of this fcmp are the vectorization seed; %seed itself has no
  ; users.
44  %add3 = fadd double %mul1, 3.3
45  %add4 = fadd double %mul2, 4.4
46  %seed = fcmp ogt double %add3, %add4
47  ret void
48}
49
50; This checks that non-consecutive external users are skipped.
; Expected result (see the autogenerated assertions below): the fadd, fadd and
; fmul bundles and the four phis are vectorized as <4 x double> in the original
; lane order (constants <1.1, 2.2, 3.3, 4.4>), while the four stores stay
; scalar and receive their values through extractelement.
; The %ptr argument is not used by the body.
51define void @non_consecutive_external_users(ptr %A, ptr %ptr) {
52; CHECK-LABEL: @non_consecutive_external_users(
53; CHECK-NEXT:  bb1:
54; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
55; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[LD]], i32 0
56; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> zeroinitializer
57; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[SHUFFLE]], <double 1.100000e+00, double 2.200000e+00, double 3.300000e+00, double 4.400000e+00>
58; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x double> [[TMP1]], <double 1.100000e+00, double 2.200000e+00, double 3.300000e+00, double 4.400000e+00>
59; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP2]], <double 1.100000e+00, double 2.200000e+00, double 3.300000e+00, double 4.400000e+00>
60; CHECK-NEXT:    [[PTRA4:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 3
61; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 3
62; CHECK-NEXT:    store double [[TMP4]], ptr [[A]], align 8
63; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 2
64; CHECK-NEXT:    store double [[TMP5]], ptr [[A]], align 8
65; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 1
66; CHECK-NEXT:    store double [[TMP6]], ptr [[PTRA4]], align 8
67; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 0
68; CHECK-NEXT:    store double [[TMP7]], ptr [[PTRA4]], align 8
69; CHECK-NEXT:    br label [[SEED_LOOP:%.*]]
70; CHECK:       seed_loop:
71; CHECK-NEXT:    [[TMP8:%.*]] = phi <4 x double> [ [[TMP3]], [[BB1:%.*]] ], [ zeroinitializer, [[SEED_LOOP]] ]
72; CHECK-NEXT:    br label [[SEED_LOOP]]
73;
74bb1:
  ; One scalar load feeds all four chains.
75  %ld = load double, ptr undef
76
  ; Four parallel fadd -> fadd -> fmul chains, one per vector lane.
77  %add5 = fadd double %ld, 1.1
78  %add6 = fadd double %ld, 2.2
79  %add7 = fadd double %ld, 3.3
80  %add8 = fadd double %ld, 4.4
81
82  %add1 = fadd double %add5, 1.1
83  %add2 = fadd double %add6, 2.2
84  %add3 = fadd double %add7, 3.3
85  %add4 = fadd double %add8, 4.4
86
87  %mul1 = fmul double %add1, 1.1
88  %mul2 = fmul double %add2, 2.2
89  %mul3 = fmul double %add3, 3.3
90  %mul4 = fmul double %add4, 4.4
91
92  ; External non-consecutive stores.
  ; Only two addresses are written, %A and %A + 3, each of them twice, so the
  ; four stores do not cover a consecutive run of elements.
93  %ptrA4 = getelementptr inbounds double, ptr %A, i64 3
94  store double %mul4, ptr %A
95  store double %mul3, ptr %A
96  store double %mul2, ptr %ptrA4
97  store double %mul1, ptr %ptrA4
98  br label %seed_loop
99
100seed_loop:
  ; The expected output merges these four phis into a single <4 x double> phi.
  ; The block loops back to itself unconditionally.
101  %phi1 = phi double [ %mul1, %bb1 ], [ 0.0, %seed_loop ]
102  %phi2 = phi double [ %mul2, %bb1 ], [ 0.0, %seed_loop ]
103  %phi3 = phi double [ %mul3, %bb1 ], [ 0.0, %seed_loop ]
104  %phi4 = phi double [ %mul4, %bb1 ], [ 0.0, %seed_loop ]
105  br label %seed_loop
106}
107
108; We have to be careful when the tree contains add/sub patterns that could be
109; combined into a single addsub instruction. Reordering can block the pattern.
; Expected result (see the autogenerated assertions below): the lanes keep the
; original <fsub, fadd> order (constants <1.1, 1.2>), and the reversed stores
; are served by an extra shufflevector with mask <1, 0> placed in front of the
; single vector store to %A.
; The %ptr argument is not used by the body.
110define void @addsub_and_external_users(ptr %A, ptr %ptr) {
111; CHECK-LABEL: @addsub_and_external_users(
112; CHECK-NEXT:  bb1:
113; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
114; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
115; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
116; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], <double 1.100000e+00, double 1.200000e+00>
117; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP1]], <double 1.100000e+00, double 1.200000e+00>
118; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x i32> <i32 0, i32 3>
119; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], <double 2.100000e+00, double 2.200000e+00>
120; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], <double 3.100000e+00, double 3.200000e+00>
121; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
122; CHECK-NEXT:    store <2 x double> [[SHUFFLE1]], ptr [[A:%.*]], align 8
123; CHECK-NEXT:    br label [[BB2:%.*]]
124; CHECK:       bb2:
125; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], <double 4.100000e+00, double 4.200000e+00>
126; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
127; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
128; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]]
129; CHECK-NEXT:    ret void
130;
131bb1:
  ; One scalar load feeds both chains.
132  %ld = load double, ptr undef
133
  ; Lane 0 subtracts and lane 1 adds: this is the fsub/fadd pair the test is
  ; about.
134  %sub1 = fsub double %ld, 1.1
135  %add2 = fadd double %ld, 1.2
136
137  %div1 = fdiv double %sub1, 2.1
138  %div2 = fdiv double %add2, 2.2
139
140  %mul1 = fmul double %div1, 3.1
141  %mul2 = fmul double %div2, 3.2
142
143  ; These are external vectorizable stores with operands in reverse order.
  ; %mul2 goes to %A[0] and %mul1 goes to %A[1].
144  %ptrA1 = getelementptr inbounds double, ptr %A, i64 1
145  store double %mul2, ptr %A
146  store double %mul1, ptr %ptrA1
147  br label %bb2
148
149bb2:
  ; The fcmp operands continue the two chains; %seed itself has no users.
150  %addS1 = fadd double %mul1, 4.1
151  %addS2 = fadd double %mul2, 4.2
152  %seed = fcmp ogt double %addS1, %addS2
153  ret void
154}
155
156; This contains a sub/add bundle; reordering it will make it better.
; Expected result (see the autogenerated assertions below): the lanes are
; reversed to <%sub2 chain, %add1 chain> (constants <1.2, 1.1>), which puts the
; fsub in lane 0 and the fadd in lane 1, and the fmul result is stored to %A
; directly, with no shufflevector between the fmul and the store.
; The %ptr argument is not used by the body.
157define void @subadd_and_external_users(ptr %A, ptr %ptr) {
158; CHECK-LABEL: @subadd_and_external_users(
159; CHECK-NEXT:  bb1:
160; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
161; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
162; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
163; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], <double 1.200000e+00, double 1.100000e+00>
164; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP1]], <double 1.200000e+00, double 1.100000e+00>
165; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x i32> <i32 0, i32 3>
166; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], <double 2.200000e+00, double 2.100000e+00>
167; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], <double 3.200000e+00, double 3.100000e+00>
168; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[A:%.*]], align 8
169; CHECK-NEXT:    br label [[BB2:%.*]]
170; CHECK:       bb2:
171; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], <double 4.200000e+00, double 4.100000e+00>
172; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
173; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
174; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP9]], [[TMP8]]
175; CHECK-NEXT:    ret void
176;
177bb1:
  ; One scalar load feeds both chains.
178  %ld = load double, ptr undef
179
  ; In source order the first chain adds and the second subtracts.
180  %add1 = fadd double %ld, 1.1
181  %sub2 = fsub double %ld, 1.2
182
183  %div1 = fdiv double %add1, 2.1
184  %div2 = fdiv double %sub2, 2.2
185
186  %mul1 = fmul double %div1, 3.1
187  %mul2 = fmul double %div2, 3.2
188
189  ; These are external vectorizable stores with operands in reverse order.
  ; %mul2 goes to %A[0] and %mul1 goes to %A[1].
190  %ptrA1 = getelementptr inbounds double, ptr %A, i64 1
191  store double %mul2, ptr %A
192  store double %mul1, ptr %ptrA1
193  br label %bb2
194
195bb2:
  ; The fcmp operands continue the two chains; %seed itself has no users.
196  %addS1 = fadd double %mul1, 4.1
197  %addS2 = fadd double %mul2, 4.2
198  %seed = fcmp ogt double %addS1, %addS2
199  ret void
200}
201
; The first bundle mixes fsub and fadd in the order sub, add, add, sub, so the
; opcodes alternate between two kinds but do not form a strictly alternating
; sub/add sequence.
; Expected result (see the autogenerated assertions below): all lanes are
; reversed to <chain 4, chain 3, chain 2, chain 1> (constants
; <1.4, 1.3, 1.2, 1.1>), the reversed stores become one <4 x double> store fed
; directly by the fmul, and the four phis become one <4 x double> phi.
; The %ptr argument is not used by the body.
202define void @alt_but_not_addsub_and_external_users(ptr %A, ptr %ptr) {
203; CHECK-LABEL: @alt_but_not_addsub_and_external_users(
204; CHECK-NEXT:  bb1:
205; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
206; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[LD]], i32 0
207; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> zeroinitializer
208; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> [[SHUFFLE]], <double 1.400000e+00, double 1.300000e+00, double 1.200000e+00, double 1.100000e+00>
209; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x double> [[SHUFFLE]], <double 1.400000e+00, double 1.300000e+00, double 1.200000e+00, double 1.100000e+00>
210; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
211; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <4 x double> [[TMP3]], <double 2.400000e+00, double 2.300000e+00, double 2.200000e+00, double 2.100000e+00>
212; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x double> [[TMP4]], <double 3.400000e+00, double 3.300000e+00, double 3.200000e+00, double 3.100000e+00>
213; CHECK-NEXT:    store <4 x double> [[TMP5]], ptr [[A:%.*]], align 8
214; CHECK-NEXT:    br label [[BB2:%.*]]
215; CHECK:       bb2:
216; CHECK-NEXT:    [[TMP7:%.*]] = phi <4 x double> [ [[TMP5]], [[BB1:%.*]] ], [ <double 4.400000e+00, double 4.300000e+00, double 4.200000e+00, double 4.100000e+00>, [[BB2]] ]
217; CHECK-NEXT:    br label [[BB2]]
218;
219bb1:
  ; One scalar load feeds all four chains.
220  %ld = load double, ptr undef
221
  ; Opcode pattern across the four lanes: sub, add, add, sub.
222  %sub1 = fsub double %ld, 1.1
223  %add2 = fadd double %ld, 1.2
224  %add3 = fadd double %ld, 1.3
225  %sub4 = fsub double %ld, 1.4
226
227  %div1 = fdiv double %sub1, 2.1
228  %div2 = fdiv double %add2, 2.2
229  %div3 = fdiv double %add3, 2.3
230  %div4 = fdiv double %sub4, 2.4
231
232  %mul1 = fmul double %div1, 3.1
233  %mul2 = fmul double %div2, 3.2
234  %mul3 = fmul double %div3, 3.3
235  %mul4 = fmul double %div4, 3.4
236
237  ; These are external vectorizable stores with operands in reverse order.
  ; %mul4 goes to %A[0], %mul3 to %A[1], %mul2 to %A[2] and %mul1 to %A[3].
238  %ptrA3 = getelementptr inbounds double, ptr %A, i64 3
239  %ptrA2 = getelementptr inbounds double, ptr %A, i64 2
240  %ptrA1 = getelementptr inbounds double, ptr %A, i64 1
241  store double %mul4, ptr %A
242  store double %mul3, ptr %ptrA1
243  store double %mul2, ptr %ptrA2
244  store double %mul1, ptr %ptrA3
245  br label %bb2
246
247bb2:
  ; The expected output merges these four phis into a single <4 x double> phi.
  ; The block loops back to itself unconditionally.
248  %phi1 = phi double [ %mul1, %bb1 ], [ 4.1, %bb2 ]
249  %phi2 = phi double [ %mul2, %bb1 ], [ 4.2, %bb2 ]
250  %phi3 = phi double [ %mul3, %bb1 ], [ 4.3, %bb2 ]
251  %phi4 = phi double [ %mul4, %bb1 ], [ 4.4, %bb2 ]
252  br label %bb2
253}
254