; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,AVX
;
; This file tests the look-ahead operand reordering heuristic.
;
;
; This checks that operand reordering reorders the operands of the adds by
; taking into consideration the instructions beyond their immediate
; predecessors.
;
; A[0] B[0] C[0] D[0]  C[1] D[1] A[1] B[1]
;     \  /   \  /          \  /   \  /
;       -     -              -     -
;        \   /                \   /
;          +                    +
;          |                    |
;         S[0]                 S[1]
;
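; As a rough scalar equivalent (illustrative C-like pseudocode; the names
; follow the diagram above):
;   S[0] = (A[0] - B[0]) + (C[0] - D[0])
;   S[1] = (C[1] - D[1]) + (A[1] - B[1])
; The second add lists its operands in swapped order, so pairing the two adds
; profitably requires looking through them to the feeding subtracts.
;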
define void @lookahead_basic(ptr %array) {
; CHECK-LABEL: @lookahead_basic(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[IDX4]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[IDX6]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr [[ARRAY]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %idx2 = getelementptr inbounds double, ptr %array, i64 2
  %idx3 = getelementptr inbounds double, ptr %array, i64 3
  %idx4 = getelementptr inbounds double, ptr %array, i64 4
  %idx5 = getelementptr inbounds double, ptr %array, i64 5
  %idx6 = getelementptr inbounds double, ptr %array, i64 6
  %idx7 = getelementptr inbounds double, ptr %array, i64 7

  %A_0 = load double, ptr %array, align 8
  %A_1 = load double, ptr %idx1, align 8
  %B_0 = load double, ptr %idx2, align 8
  %B_1 = load double, ptr %idx3, align 8
  %C_0 = load double, ptr %idx4, align 8
  %C_1 = load double, ptr %idx5, align 8
  %D_0 = load double, ptr %idx6, align 8
  %D_1 = load double, ptr %idx7, align 8

  %subAB_0 = fsub fast double %A_0, %B_0
  %subCD_0 = fsub fast double %C_0, %D_0

  %subAB_1 = fsub fast double %A_1, %B_1
  %subCD_1 = fsub fast double %C_1, %D_1

  %addABCD_0 = fadd fast double %subAB_0, %subCD_0
  %addCDAB_1 = fadd fast double %subCD_1, %subAB_1

  store double %addABCD_0, ptr %array, align 8
  store double %addCDAB_1, ptr %idx1, align 8
  ret void
}


; Check whether the look-ahead operand reordering heuristic will avoid
; bundling the alt opcodes. The vectorized code should have no shuffles.
;
; A[0] B[0] A[0] B[0]  A[1] B[1] A[1] B[1]
;     \  /   \  /          \  /   \  /
;       +     -              -     +
;        \   /                \   /
;          +                    +
;          |                    |
;         S[0]                 S[1]
;
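; As a rough scalar equivalent (illustrative):
;   S[0] = (A[0] + B[0]) + (A[0] - B[0])
;   S[1] = (A[1] - B[1]) + (A[1] + B[1])
; Pairing {add,add} and {sub,sub} across the two lanes avoids an alternating
; add/sub bundle, so no shuffle is needed.
;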
define void @lookahead_alt1(ptr %array) {
; CHECK-LABEL: @lookahead_alt1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 5
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP2]]
; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[ARRAY]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %idx2 = getelementptr inbounds double, ptr %array, i64 2
  %idx3 = getelementptr inbounds double, ptr %array, i64 3
  %idx4 = getelementptr inbounds double, ptr %array, i64 4
  %idx5 = getelementptr inbounds double, ptr %array, i64 5
  %idx6 = getelementptr inbounds double, ptr %array, i64 6
  %idx7 = getelementptr inbounds double, ptr %array, i64 7

  %A_0 = load double, ptr %array, align 8
  %A_1 = load double, ptr %idx1, align 8
  %B_0 = load double, ptr %idx2, align 8
  %B_1 = load double, ptr %idx3, align 8

  %addAB_0_L = fadd fast double %A_0, %B_0
  %subAB_0_R = fsub fast double %A_0, %B_0

  %subAB_1_L = fsub fast double %A_1, %B_1
  %addAB_1_R = fadd fast double %A_1, %B_1

  %addABCD_0 = fadd fast double %addAB_0_L, %subAB_0_R
  %addCDAB_1 = fadd fast double %subAB_1_L, %addAB_1_R

  store double %addABCD_0, ptr %array, align 8
  store double %addCDAB_1, ptr %idx1, align 8
  ret void
}


; This code should get vectorized all the way to the loads with shuffles for
; the alt opcodes.
;
; A[0] B[0] C[0] D[0]  C[1] D[1] A[1] B[1]
;     \  /   \  /          \  /   \  /
;       +     -              +     -
;        \   /                \   /
;          +                    +
;          |                    |
;         S[0]                 S[1]
;
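; As a rough scalar equivalent (illustrative):
;   S[0] = (A[0] + B[0]) + (C[0] - D[0])
;   S[1] = (C[1] + D[1]) + (A[1] - B[1])
; Each lane mixes an add with a sub, so the bundles must alternate opcodes and
; the vectorized code needs shuffles to blend the add and sub lanes.
;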
define void @lookahead_alt2(ptr %array) {
; CHECK-LABEL: @lookahead_alt2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[IDX4]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[IDX6]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]]
; CHECK-NEXT:    store <2 x double> [[TMP10]], ptr [[ARRAY]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %idx2 = getelementptr inbounds double, ptr %array, i64 2
  %idx3 = getelementptr inbounds double, ptr %array, i64 3
  %idx4 = getelementptr inbounds double, ptr %array, i64 4
  %idx5 = getelementptr inbounds double, ptr %array, i64 5
  %idx6 = getelementptr inbounds double, ptr %array, i64 6
  %idx7 = getelementptr inbounds double, ptr %array, i64 7

  %A_0 = load double, ptr %array, align 8
  %A_1 = load double, ptr %idx1, align 8
  %B_0 = load double, ptr %idx2, align 8
  %B_1 = load double, ptr %idx3, align 8
  %C_0 = load double, ptr %idx4, align 8
  %C_1 = load double, ptr %idx5, align 8
  %D_0 = load double, ptr %idx6, align 8
  %D_1 = load double, ptr %idx7, align 8

  %addAB_0 = fadd fast double %A_0, %B_0
  %subCD_0 = fsub fast double %C_0, %D_0

  %addCD_1 = fadd fast double %C_1, %D_1
  %subAB_1 = fsub fast double %A_1, %B_1

  %addABCD_0 = fadd fast double %addAB_0, %subCD_0
  %addCDAB_1 = fadd fast double %addCD_1, %subAB_1

  store double %addABCD_0, ptr %array, align 8
  store double %addCDAB_1, ptr %idx1, align 8
  ret void
}


;
; A[0] B[0] C[0] D[0]  A[1] B[2] A[2] B[1]
;     \  /   \  /       /  \  /   \  /
;       -     -        U     -     -
;        \   /                \   /
;          +                    +
;          |                    |
;         S[0]                 S[1]
;
; SLP should reorder the operands of the RHS add, taking into consideration
; the cost of external uses. Reordering is more profitable here because A[1]
; has an external use.
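;
; As a rough scalar equivalent (illustrative):
;   S[0] = (A[0] - B[0]) + (C[0] - D[0])
;   S[1] = (A[1] - B[2]) + (A[2] - B[1])
; Either operand order of the second add yields one pair of consecutive loads
; (A[0],A[1] or B[0],B[1]); the external use of A[1] breaks the tie, and the
; CHECK lines expect the loads from A to be vectorized.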

define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2) {
; CHECK-LABEL: @lookahead_external_uses(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 1
; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2
; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2
; CHECK-NEXT:    [[C0:%.*]] = load double, ptr [[C:%.*]], align 8
; CHECK-NEXT:    [[D0:%.*]] = load double, ptr [[D:%.*]], align 8
; CHECK-NEXT:    [[B2:%.*]] = load double, ptr [[IDXB2]], align 8
; CHECK-NEXT:    [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
; CHECK-NEXT:    [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP1]], double [[D0]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]]
; CHECK-NEXT:    store <2 x double> [[TMP8]], ptr [[S:%.*]], align 8
; CHECK-NEXT:    store double [[A1]], ptr [[EXT1:%.*]], align 8
; CHECK-NEXT:    ret void
;
entry:

  %IdxA1 = getelementptr inbounds double, ptr %A, i64 1
  %IdxB2 = getelementptr inbounds double, ptr %B, i64 2
  %IdxA2 = getelementptr inbounds double, ptr %A, i64 2
  %IdxB1 = getelementptr inbounds double, ptr %B, i64 1

  %A0 = load double, ptr %A, align 8
  %B0 = load double, ptr %B, align 8
  %C0 = load double, ptr %C, align 8
  %D0 = load double, ptr %D, align 8

  %A1 = load double, ptr %IdxA1, align 8
  %B2 = load double, ptr %IdxB2, align 8
  %A2 = load double, ptr %IdxA2, align 8
  %B1 = load double, ptr %IdxB1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC0D0 = fsub fast double %C0, %D0

  %subA1B2 = fsub fast double %A1, %B2
  %subA2B1 = fsub fast double %A2, %B1

  %add0 = fadd fast double %subA0B0, %subC0D0
  %add1 = fadd fast double %subA1B2, %subA2B1

  %IdxS1 = getelementptr inbounds double, ptr %S, i64 1

  store double %add0, ptr %S, align 8
  store double %add1, ptr %IdxS1, align 8

  ; External use
  store double %A1, ptr %Ext1, align 8
  ret void
}

; A[0] B[0] C[0] D[0]  A[1] B[2] A[2] B[1]
;     \  /   \  /       /  \  /   \  / \
;       -     -    U1,U2,U3  -     -  U4,U5
;        \   /                \   /
;          +                    +
;          |                    |
;         S[0]                 S[1]
;
;
; If we limit the users budget for the look-ahead heuristic to 2, then the
; look-ahead heuristic has no way of choosing B[1] (with 2 external users)
; over A[1] (with 3 external users).
; The result is that the operands of the Add are not reordered and the loads
; from A get vectorized instead of the loads from B.
;
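; The scalar pattern is the same as in @lookahead_external_uses; only the
; external-user counts differ (three stores of A[1], two of B[1]). With the
; budget capped at 2 the scores tie, so the CHECK lines expect the loads
; from A to be vectorized, as above.
;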
define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2, ptr %Ext3, ptr %Ext4, ptr %Ext5) {
; CHECK-LABEL: @lookahead_limit_users_budget(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 1
; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2
; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2
; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
; CHECK-NEXT:    [[C0:%.*]] = load double, ptr [[C:%.*]], align 8
; CHECK-NEXT:    [[D0:%.*]] = load double, ptr [[D:%.*]], align 8
; CHECK-NEXT:    [[B2:%.*]] = load double, ptr [[IDXB2]], align 8
; CHECK-NEXT:    [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
; CHECK-NEXT:    [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
; CHECK-NEXT:    [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP1]], double [[D0]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]]
; CHECK-NEXT:    store <2 x double> [[TMP8]], ptr [[S:%.*]], align 8
; CHECK-NEXT:    store double [[A1]], ptr [[EXT1:%.*]], align 8
; CHECK-NEXT:    store double [[A1]], ptr [[EXT2:%.*]], align 8
; CHECK-NEXT:    store double [[A1]], ptr [[EXT3:%.*]], align 8
; CHECK-NEXT:    store double [[B1]], ptr [[EXT4:%.*]], align 8
; CHECK-NEXT:    store double [[B1]], ptr [[EXT5:%.*]], align 8
; CHECK-NEXT:    ret void
;
entry:

  %IdxA1 = getelementptr inbounds double, ptr %A, i64 1
  %IdxB2 = getelementptr inbounds double, ptr %B, i64 2
  %IdxA2 = getelementptr inbounds double, ptr %A, i64 2
  %IdxB1 = getelementptr inbounds double, ptr %B, i64 1

  %A0 = load double, ptr %A, align 8
  %B0 = load double, ptr %B, align 8
  %C0 = load double, ptr %C, align 8
  %D0 = load double, ptr %D, align 8

  %A1 = load double, ptr %IdxA1, align 8
  %B2 = load double, ptr %IdxB2, align 8
  %A2 = load double, ptr %IdxA2, align 8
  %B1 = load double, ptr %IdxB1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC0D0 = fsub fast double %C0, %D0

  %subA1B2 = fsub fast double %A1, %B2
  %subA2B1 = fsub fast double %A2, %B1

  %add0 = fadd fast double %subA0B0, %subC0D0
  %add1 = fadd fast double %subA1B2, %subA2B1

  %IdxS1 = getelementptr inbounds double, ptr %S, i64 1

  store double %add0, ptr %S, align 8
  store double %add1, ptr %IdxS1, align 8

  ; External uses of A1
  store double %A1, ptr %Ext1, align 8
  store double %A1, ptr %Ext2, align 8
  store double %A1, ptr %Ext3, align 8

  ; External uses of B1
  store double %B1, ptr %Ext4, align 8
  store double %B1, ptr %Ext5, align 8

  ret void
}

; This checks that the lookahead code does not crash when instructions with the same opcodes have different numbers of operands (in this case the calls).
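; The two calls below share the call opcode but take a different number of
; arguments (one ptr operand vs. none), which the look-ahead operand matcher
; must tolerate rather than assert on.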

%Class = type { i8 }
declare double @_ZN1i2ayEv(ptr)
declare double @_ZN1i2axEv()

define void @lookahead_crash(ptr %A, ptr %S, ptr %Arg0) {
; CHECK-LABEL: @lookahead_crash(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
; CHECK-NEXT:    [[C0:%.*]] = call double @_ZN1i2ayEv(ptr [[ARG0:%.*]])
; CHECK-NEXT:    [[C1:%.*]] = call double @_ZN1i2axEv()
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[S:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %IdxA1 = getelementptr inbounds double, ptr %A, i64 1

  %A0 = load double, ptr %A, align 8
  %A1 = load double, ptr %IdxA1, align 8

  %C0 = call double @_ZN1i2ayEv(ptr %Arg0)
  %C1 = call double @_ZN1i2axEv()

  %add0 = fadd fast double %A0, %C0
  %add1 = fadd fast double %A1, %C1

  %IdxS1 = getelementptr inbounds double, ptr %S, i64 1
  store double %add0, ptr %S, align 8
  store double %add1, ptr %IdxS1, align 8
  ret void
}

; This checks that we choose to group consecutive extracts from the same vectors.
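; As a rough scalar equivalent (illustrative; V1 and V2 name the two loaded
; vectors):
;   store[0] = V1[0]*A[0] + V2[0]*A[1]
;   store[1] = V1[1]*A[0] + V2[1]*A[1]
; Grouping the lane-0/lane-1 extracts of each vector keeps V1 and V2 intact,
; so only broadcasts of the two scalar loads are needed, as the CHECK lines
; show.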
define void @ChecksExtractScores(ptr %storeArray, ptr %array, ptr %vecPtr1, ptr %vecPtr2) {
; CHECK-LABEL: @ChecksExtractScores(
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 1
; CHECK-NEXT:    [[LOADA0:%.*]] = load double, ptr [[ARRAY]], align 4
; CHECK-NEXT:    [[LOADA1:%.*]] = load double, ptr [[IDX1]], align 4
; CHECK-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; CHECK-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[STOREARRAY:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %loadA0 = load double, ptr %array, align 4
  %loadA1 = load double, ptr %idx1, align 4

  %loadVec = load <2 x double>, ptr %vecPtr1, align 4
  %extrA0 = extractelement <2 x double> %loadVec, i32 0
  %extrA1 = extractelement <2 x double> %loadVec, i32 1
  %loadVec2 = load <2 x double>, ptr %vecPtr2, align 4
  %extrB0 = extractelement <2 x double> %loadVec2, i32 0
  %extrB1 = extractelement <2 x double> %loadVec2, i32 1

  %mul0 = fmul double %extrA0, %loadA0
  %mul1 = fmul double %extrA1, %loadA0
  %mul3 = fmul double %extrB0, %loadA1
  %mul4 = fmul double %extrB1, %loadA1
  %add0 = fadd double %mul0, %mul3
  %add1 = fadd double %mul1, %mul4

  %sidx1 = getelementptr inbounds double, ptr %storeArray, i64 1
  store double %add0, ptr %storeArray, align 8
  store double %add1, ptr %sidx1, align 8
  ret void
}


define i1 @ExtractIdxNotConstantInt1(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; SSE-LABEL: @ExtractIdxNotConstantInt1(
; SSE-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
; SSE-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; SSE-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; SSE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0
; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1
; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; SSE-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; SSE-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; SSE-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; SSE-NEXT:    ret i1 [[CMP_I185]]
;
; AVX-LABEL: @ExtractIdxNotConstantInt1(
; AVX-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
; AVX-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; AVX-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; AVX-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; AVX-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; AVX-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; AVX-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; AVX-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; AVX-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; AVX-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; AVX-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; AVX-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 undef
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}


define i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; SSE-LABEL: @ExtractIdxNotConstantInt2(
; SSE-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
; SSE-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; SSE-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; SSE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0
; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1
; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; SSE-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; SSE-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; SSE-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; SSE-NEXT:    ret i1 [[CMP_I185]]
;
; AVX-LABEL: @ExtractIdxNotConstantInt2(
; AVX-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
; AVX-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; AVX-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; AVX-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; AVX-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; AVX-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; AVX-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; AVX-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; AVX-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; AVX-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; AVX-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; AVX-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 1
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}


define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; CHECK-LABEL: @foo(
; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> <i32 poison, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; CHECK-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 1
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}

; Same as @ChecksExtractScores, but the extractelement vector operands do not match.
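; Here each lane pair pulls lane 0 from one vector and lane 1 from another
; (%loadVec/%loadVec2 and %loadVec3/%loadVec4), so the extracts can only be
; grouped through blend shuffles, as the shufflevector instructions in the
; CHECK lines show.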
define void @ChecksExtractScores_different_vectors(ptr %storeArray, ptr %array, ptr %vecPtr1, ptr %vecPtr2, ptr %vecPtr3, ptr %vecPtr4) {
;
; SSE-LABEL: @ChecksExtractScores_different_vectors(
; SSE-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; SSE-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; SSE-NEXT:    [[LOADVEC3:%.*]] = load <2 x double>, ptr [[VECPTR3:%.*]], align 4
; SSE-NEXT:    [[LOADVEC4:%.*]] = load <2 x double>, ptr [[VECPTR4:%.*]], align 4
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY:%.*]], align 4
; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[LOADVEC2]], <2 x double> [[LOADVEC3]], <2 x i32> <i32 1, i32 2>
; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], [[TMP1]]
; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC4]], <2 x i32> <i32 0, i32 3>
; SSE-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], [[TMP1]]
; SSE-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP4]], [[TMP6]]
; SSE-NEXT:    store <2 x double> [[TMP7]], ptr [[STOREARRAY:%.*]], align 8
; SSE-NEXT:    ret void
;
; AVX-LABEL: @ChecksExtractScores_different_vectors(
; AVX-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 1
; AVX-NEXT:    [[LOADA0:%.*]] = load double, ptr [[ARRAY]], align 4
; AVX-NEXT:    [[LOADA1:%.*]] = load double, ptr [[IDX1]], align 4
; AVX-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; AVX-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; AVX-NEXT:    [[LOADVEC3:%.*]] = load <2 x double>, ptr [[VECPTR3:%.*]], align 4
; AVX-NEXT:    [[LOADVEC4:%.*]] = load <2 x double>, ptr [[VECPTR4:%.*]], align 4
; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC2]], <2 x i32> <i32 0, i32 3>
; AVX-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[LOADVEC3]], <2 x double> [[LOADVEC4]], <2 x i32> <i32 0, i32 3>
; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0
; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]]
; AVX-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]]
; AVX-NEXT:    store <2 x double> [[TMP9]], ptr [[STOREARRAY:%.*]], align 8
; AVX-NEXT:    ret void
;
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %loadA0 = load double, ptr %array, align 4
  %loadA1 = load double, ptr %idx1, align 4

  %loadVec = load <2 x double>, ptr %vecPtr1, align 4
  %loadVec2 = load <2 x double>, ptr %vecPtr2, align 4
  %extrA0 = extractelement <2 x double> %loadVec, i32 0
  %extrA1 = extractelement <2 x double> %loadVec2, i32 1
  %loadVec3 = load <2 x double>, ptr %vecPtr3, align 4
  %loadVec4 = load <2 x double>, ptr %vecPtr4, align 4
  %extrB0 = extractelement <2 x double> %loadVec3, i32 0
  %extrB1 = extractelement <2 x double> %loadVec4, i32 1

  %mul0 = fmul double %extrA0, %loadA0
  %mul1 = fmul double %extrA1, %loadA0
  %mul3 = fmul double %extrB0, %loadA1
  %mul4 = fmul double %extrB1, %loadA1
  %add0 = fadd double %mul0, %mul3
  %add1 = fadd double %mul1, %mul4

  %sidx1 = getelementptr inbounds double, ptr %storeArray, i64 1
  store double %add0, ptr %storeArray, align 8
  store double %add1, ptr %sidx1, align 8
  ret void
}

; This checks that we prefer splats rather than reverse load vectors + shuffles.
; 2-wide splat loads on x86 use a single instruction, so they are quite cheap.
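; As a rough scalar equivalent (illustrative; X loaded from %array1, Y from
; %array2):
;   add3 = (X[0]*Y[0] + X[0]*Y[1]) + (X[1]*Y[0] + X[1]*Y[1])
; Broadcasting Y[0] and Y[1] against the vector load of X matches the AVX
; CHECK lines; the SSE run instead keeps the vector load of Y plus a
; reversing shuffle.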
define double @splat_loads(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
; SSE-LABEL: @splat_loads(
; SSE-NEXT:  entry:
; SSE-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; SSE-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
; SSE-NEXT:    [[ADD3:%.*]] = fadd double [[TMP6]], [[TMP7]]
; SSE-NEXT:    ret double [[ADD3]]
;
; AVX-LABEL: @splat_loads(
; AVX-NEXT:  entry:
; AVX-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds double, ptr [[ARRAY2:%.*]], i64 1
; AVX-NEXT:    [[LD_2_0:%.*]] = load double, ptr [[ARRAY2]], align 8
; AVX-NEXT:    [[LD_2_1:%.*]] = load double, ptr [[GEP_2_1]], align 8
; AVX-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; AVX-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP0]], [[TMP5]]
; AVX-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; AVX-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
; AVX-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
; AVX-NEXT:    [[ADD3:%.*]] = fadd double [[TMP8]], [[TMP9]]
; AVX-NEXT:    ret double [[ADD3]]
;
entry:
  %gep_1_1 = getelementptr inbounds double, ptr %array1, i64 1
  %ld_1_0 = load double, ptr %array1, align 8
  %ld_1_1 = load double, ptr %gep_1_1, align 8

  %gep_2_1 = getelementptr inbounds double, ptr %array2, i64 1
  %ld_2_0 = load double, ptr %array2, align 8
  %ld_2_1 = load double, ptr %gep_2_1, align 8

  %mul1 = fmul double %ld_1_0, %ld_2_0
  %mul2 = fmul double %ld_1_1, %ld_2_0

  %mul3 = fmul double %ld_1_0, %ld_2_1
  %mul4 = fmul double %ld_1_1, %ld_2_1

  %add1 = fadd double %mul1, %mul3
  %add2 = fadd double %mul2, %mul4

  %add3 = fadd double %add1, %add2
  ret double %add3
}


; Same as splat_loads() but the splat load has internal uses in the SLP graph.
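; As a rough scalar form, the extra internal uses are (illustrative, Y as in
; @splat_loads):
;   sub1 = add1 - Y[0]
;   sub2 = add2 - Y[0]
; so the broadcast of %ld_2_0 also feeds the fsub bundle, as the reuse of the
; splat in the AVX CHECK lines shows.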
define double @splat_loads_with_internal_uses(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
; SSE-LABEL: @splat_loads_with_internal_uses(
; SSE-NEXT:  entry:
; SSE-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; SSE-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
; SSE-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
; SSE-NEXT:    [[RES:%.*]] = fadd double [[TMP8]], [[TMP9]]
; SSE-NEXT:    ret double [[RES]]
;
; AVX-LABEL: @splat_loads_with_internal_uses(
; AVX-NEXT:  entry:
; AVX-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds double, ptr [[ARRAY2:%.*]], i64 1
; AVX-NEXT:    [[LD_2_0:%.*]] = load double, ptr [[ARRAY2]], align 8
; AVX-NEXT:    [[LD_2_1:%.*]] = load double, ptr [[GEP_2_1]], align 8
; AVX-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; AVX-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP0]], [[TMP5]]
; AVX-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; AVX-NEXT:    [[TMP8:%.*]] = fsub <2 x double> [[TMP7]], [[TMP2]]
; AVX-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
; AVX-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
; AVX-NEXT:    [[RES:%.*]] = fadd double [[TMP9]], [[TMP10]]
; AVX-NEXT:    ret double [[RES]]
;
entry:
  %gep_1_1 = getelementptr inbounds double, ptr %array1, i64 1
  %ld_1_0 = load double, ptr %array1, align 8
  %ld_1_1 = load double, ptr %gep_1_1, align 8

  %gep_2_1 = getelementptr inbounds double, ptr %array2, i64 1
  %ld_2_0 = load double, ptr %array2, align 8
  %ld_2_1 = load double, ptr %gep_2_1, align 8

  %mul1 = fmul double %ld_1_0, %ld_2_0
  %mul2 = fmul double %ld_1_1, %ld_2_0

  %mul3 = fmul double %ld_1_0, %ld_2_1
  %mul4 = fmul double %ld_1_1, %ld_2_1

  %add1 = fadd double %mul1, %mul3
  %add2 = fadd double %mul2, %mul4

  ; One more user for the broadcast of %ld_2_0
  %sub1 = fsub double %add1, %ld_2_0
  %sub2 = fsub double %add2, %ld_2_0

  %res = fadd double %sub1, %sub2

  ret double %res
}