xref: /llvm-project/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll (revision 38fffa630ee80163dc65e759392ad29798905679)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -passes=slp-vectorizer,verify -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s -check-prefix=ENABLED
3;
4; Without supernode operand reordering, this does not get fully vectorized.
5; S[0] = (A[0] + B[0]) + C[0]
6; S[1] = (B[1] + C[1]) + A[1]
; Both lanes sum one element each of A, B and C with two chained scalar fadds,
; but lane 0 starts from A+B while lane 1 starts from B+C.
; The autogenerated checks expect three <2 x double> loads, two vector fadds
; whose non-B operands are the blends <A[0], C[1]> and <C[0], A[1]>, and a
; single <2 x double> store.
7define void @test_supernode_add(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) {
8; ENABLED-LABEL: @test_supernode_add(
9; ENABLED-NEXT:  entry:
10; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
11; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
12; ENABLED-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8
13; ENABLED-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
14; ENABLED-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
15; ENABLED-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> <i32 0, i32 3>
16; ENABLED-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]]
17; ENABLED-NEXT:    store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8
18; ENABLED-NEXT:    ret void
19;
20entry:
  ; Addresses of element 1 of each array; element 0 is the base pointer itself.
21  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
22  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
23  %idxC1 = getelementptr inbounds double, ptr %Carray, i64 1
24  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1
25
  ; Scalar loads of elements 0 and 1 of A, B and C.
26  %A0 = load double, ptr %Aarray, align 8
27  %A1 = load double, ptr %idxA1, align 8
28
29  %B0 = load double, ptr %Barray, align 8
30  %B1 = load double, ptr %idxB1, align 8
31
32  %C0 = load double, ptr %Carray, align 8
33  %C1 = load double, ptr %idxC1, align 8
34
  ; First level: lane 0 pairs A with B, lane 1 pairs B with C.
  ; Second level: each lane adds the array it has not used yet (C0 / A1).
35  %addA0B0 = fadd fast double %A0, %B0
36  %addB1C1 = fadd fast double %B1, %C1
37  %add0 = fadd fast double %addA0B0, %C0
38  %add1 = fadd fast double %addB1C1, %A1
  ; Consecutive scalar stores to S[0] and S[1].
39  store double %add0, ptr %Sarray, align 8
40  store double %add1, ptr %idxS1, align 8
41  ret void
42}
43
44
45; Without supernode operand reordering, this does not get fully vectorized.
46; S[0] = (A[0] - B[0]) + C[0]
47; S[1] = (C[1] - B[1]) + A[1]
; Each lane is an fsub followed by an fadd. B is the subtrahend in both lanes,
; while A and C swap roles between lane 0 and lane 1.
; The autogenerated checks expect one vector fsub of the blend <A[0], C[1]>
; minus B, one vector fadd with the blend <C[0], A[1]>, and a single
; <2 x double> store.
48define void @test_supernode_addsub(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) {
49; ENABLED-LABEL: @test_supernode_addsub(
50; ENABLED-NEXT:  entry:
51; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
52; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
53; ENABLED-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8
54; ENABLED-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
55; ENABLED-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]]
56; ENABLED-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> <i32 0, i32 3>
57; ENABLED-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]]
58; ENABLED-NEXT:    store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8
59; ENABLED-NEXT:    ret void
60;
61entry:
  ; Addresses of element 1 of each array; element 0 is the base pointer itself.
62  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
63  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
64  %idxC1 = getelementptr inbounds double, ptr %Carray, i64 1
65  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1
66
  ; Scalar loads of elements 0 and 1 of A, B and C.
67  %A0 = load double, ptr %Aarray, align 8
68  %A1 = load double, ptr %idxA1, align 8
69
70  %B0 = load double, ptr %Barray, align 8
71  %B1 = load double, ptr %idxB1, align 8
72
73  %C0 = load double, ptr %Carray, align 8
74  %C1 = load double, ptr %idxC1, align 8
75
  ; First level: lane 0 computes A0 - B0, lane 1 computes C1 - B1.
  ; Second level: lane 0 adds C0, lane 1 adds A1.
76  %subA0B0 = fsub fast double %A0, %B0
77  %subC1B1 = fsub fast double %C1, %B1
78  %add0 = fadd fast double %subA0B0, %C0
79  %add1 = fadd fast double %subC1B1, %A1
  ; Consecutive scalar stores to S[0] and S[1].
80  store double %add0, ptr %Sarray, align 8
81  store double %add1, ptr %idxS1, align 8
82  ret void
83}
84
85; Without supernode operand reordering, this does not get fully vectorized.
86; This checks that the super-node works with alternate sequences.
87;
88; S[0] = (A[0] - B[0]) - C[0]
89; S[1] = (B[1] + C[1]) + A[1]
; Lane 0 uses fsub at both levels and lane 1 uses fadd at both levels, so the
; two lanes differ in opcode as well as in operand order.
; The autogenerated checks expect, at each level, a vector fsub and a vector
; fadd on the same operands, combined by a shufflevector that takes lane 0
; from the fsub and lane 1 from the fadd.
90define void @test_supernode_addsub_alt(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) {
91; ENABLED-LABEL: @test_supernode_addsub_alt(
92; ENABLED-NEXT:  entry:
93; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
94; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
95; ENABLED-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8
96; ENABLED-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
97; ENABLED-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]]
98; ENABLED-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP1]]
99; ENABLED-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 3>
100; ENABLED-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> <i32 0, i32 3>
101; ENABLED-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP7]]
102; ENABLED-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP7]]
103; ENABLED-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
104; ENABLED-NEXT:    store <2 x double> [[TMP10]], ptr [[SARRAY:%.*]], align 8
105; ENABLED-NEXT:    ret void
106;
107entry:
  ; Addresses of element 1 of each array; element 0 is the base pointer itself.
108  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
109  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
110  %idxC1 = getelementptr inbounds double, ptr %Carray, i64 1
111  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1
112
  ; Scalar loads of elements 0 and 1 of A, B and C.
113  %A0 = load double, ptr %Aarray, align 8
114  %A1 = load double, ptr %idxA1, align 8
115
116  %B0 = load double, ptr %Barray, align 8
117  %B1 = load double, ptr %idxB1, align 8
118
119  %C0 = load double, ptr %Carray, align 8
120  %C1 = load double, ptr %idxC1, align 8
121
  ; Lane 0: (A0 - B0) - C0, all fsub. Lane 1: (B1 + C1) + A1, all fadd.
122  %subA0B0 = fsub fast double %A0, %B0
123  %addB1C1 = fadd fast double %B1, %C1
124  %sub0 = fsub fast double %subA0B0, %C0
125  %add1 = fadd fast double %addB1C1, %A1
  ; Consecutive scalar stores to S[0] and S[1].
126  store double %sub0, ptr %Sarray, align 8
127  store double %add1, ptr %idxS1, align 8
128  ret void
129}
130
131; This checks that vectorizeTree() works correctly with the supernode
132; and does not generate uses before defs.
133; If all of the operands of the supernode are vectorizable, then the scheduler
134; will fix their position in the program. If not, then the scheduler may not
135; touch them, leading to uses before defs.
136;
137; A0 = ...
138; C = ...
139; t1 = A0 + C
140; B0 = ...
141; t2 = t1 + B0
142; A1 = ...
143; B1 = ...
144; t3 = A1 + B1
145; D = ...
146; t4 = t3 + D
147;
148;
149;  A0  C   A1  B1              A0  C    A1  D            A0:1  C,D
150;   \ /      \ /    Reorder      \ /      \ /    Bundles     \ /
151; t1 + B0  t3 + D   ------->   t1 + B0  t3 + B1  ------> t1:3 + B0:1
152;    |/       |/                  |/       |/                 |/
153; t2 +     t4 +                t2 +     t4 +             t2:4 +
154;
155; After reordering, 'D' conceptually becomes an operand of t3:
156; t3 = A1 + D
157; But D is defined *after* its use.
158;
; A and B contribute elements 0 and 1; C and D each contribute only the element
; at their base pointer. The loads and fadds are deliberately interleaved, with
; %D loaded after %t3 is computed.
; The autogenerated checks expect the scalar loads of C and D to come first,
; followed by vector loads of A and B, two insertelements into the B vector
; (C into lane 0, D into lane 1), two vector fadds and one <2 x double> store.
159define void @supernode_scheduling(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Darray, ptr %Sarray) {
160; ENABLED-LABEL: @supernode_scheduling(
161; ENABLED-NEXT:  entry:
162; ENABLED-NEXT:    [[C:%.*]] = load double, ptr [[CARRAY:%.*]], align 8
163; ENABLED-NEXT:    [[D:%.*]] = load double, ptr [[DARRAY:%.*]], align 8
164; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
165; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
166; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C]], i32 0
167; ENABLED-NEXT:    [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP2]]
168; ENABLED-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP1]], double [[D]], i32 1
169; ENABLED-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP4]]
170; ENABLED-NEXT:    store <2 x double> [[TMP5]], ptr [[SARRAY:%.*]], align 8
171; ENABLED-NEXT:    ret void
172;
173entry:
  ; Addresses of element 1 of A, B and S; C and D are only read at their base.
174  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
175  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
176  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1
177
178
  ; Lane 0 chain: t1 = A0 + C, then t2 = t1 + B0.
179  %A0 = load double, ptr %Aarray, align 8
180  %C = load double, ptr %Carray, align 8
181  %t1 = fadd fast double %A0, %C
182  %B0 = load double, ptr %Barray, align 8
183  %t2 = fadd fast double %t1, %B0
  ; Lane 1 chain: t3 = A1 + B1, then t4 = t3 + D. %D is defined after %t3.
184  %A1 = load double, ptr %idxA1, align 8
185  %B1 = load double, ptr %idxB1, align 8
186  %t3 = fadd fast double %A1, %B1
187  %D = load double, ptr %Darray, align 8
188  %t4 = fadd fast double %t3, %D
189
  ; Consecutive scalar stores to S[0] and S[1].
190  store double %t2, ptr %Sarray, align 8
191  store double %t4, ptr %idxS1, align 8
192  ret void
193}
194
195
196; The SLP scheduler has trouble moving instructions across blocks.
197; Even though we can build a SuperNode for this example, we should not because the scheduler
198; cannot handle the cross-block instruction motion that is required once the operands of the
199; SuperNode are reordered.
200;
201; entry:
202;  A0 = ...
203;  B1 = ...
204;  Tmp0 = A0 + 2.0
205;  Tmp1 = B1 + 2.0
206;
207; bb:
208;  A1 = ...
209;  B0 = ...
210;  S[0] = Tmp0 + B0
211;  S[1] = Tmp1 + A1
; The first-level fadds (each operand plus 2.0) live in the entry block, while
; the second-level fadds and both stores live in block %bb. Lane 0 uses A0 then
; B0; lane 1 uses B1 then A1.
; The autogenerated checks expect every load to stay scalar in its original
; block, with each block building its vector operand through a pair of
; insertelements before a vector fadd; the result is stored as <2 x double>.
212define void @supernode_scheduling_cross_block(ptr %Aarray, ptr %Barray, ptr %Sarray) {
213; ENABLED-LABEL: @supernode_scheduling_cross_block(
214; ENABLED-NEXT:  entry:
215; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1
216; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[BARRAY:%.*]], i64 1
217; ENABLED-NEXT:    [[A0:%.*]] = load double, ptr [[AARRAY]], align 8
218; ENABLED-NEXT:    [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
219; ENABLED-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
220; ENABLED-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[B1]], i32 1
221; ENABLED-NEXT:    [[TMP2:%.*]] = fadd fast <2 x double> [[TMP1]], splat (double 2.000000e+00)
222; ENABLED-NEXT:    br label [[BB:%.*]]
223; ENABLED:       bb:
224; ENABLED-NEXT:    [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
225; ENABLED-NEXT:    [[B0:%.*]] = load double, ptr [[BARRAY]], align 8
226; ENABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
227; ENABLED-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[A1]], i32 1
228; ENABLED-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]]
229; ENABLED-NEXT:    store <2 x double> [[TMP5]], ptr [[SARRAY:%.*]], align 8
230; ENABLED-NEXT:    ret void
231;
232entry:
  ; Addresses of element 1 of A, B and S.
233  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
234  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
235  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1
236
  ; First level, in the entry block: lane 0 starts from A[0], lane 1 from B[1].
237  %A0 = load double, ptr %Aarray, align 8
238  %B1 = load double, ptr %idxB1, align 8
239  %Tmp0 = fadd fast double %A0, 2.0
240  %Tmp1 = fadd fast double %B1, 2.0
241br label %bb
242
243bb:
  ; Second level, in a different block: lane 0 adds B[0], lane 1 adds A[1].
244  %A1 = load double, ptr %idxA1, align 8
245  %B0 = load double, ptr %Barray, align 8
246
247  %Sum0 = fadd fast double %Tmp0, %B0
248  %Sum1 = fadd fast double %Tmp1, %A1
249
  ; Consecutive scalar stores to S[0] and S[1].
250  store double %Sum0, ptr %Sarray, align 8
251  store double %Sum1, ptr %idxS1, align 8
252  ret void
253}
254