; xref: /llvm-project/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll (revision 178f47143a3b3c547df6d1f07e9707792f5d9fd4)
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX

declare void @use_i8(i8)
declare void @use_f32(float)

; Eliminating extract is profitable.

define i8 @ext0_ext0_add(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext0_ext0_add(
; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i8> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    ret i8 [[R]]
;
  %e0 = extractelement <16 x i8> %x, i32 0
  %e1 = extractelement <16 x i8> %y, i32 0
  %r = add i8 %e0, %e1
  ret i8 %r
}

; Eliminating extract is still profitable. Flags propagate.

define i8 @ext1_ext1_add_flags(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext1_ext1_add_flags(
; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw <16 x i8> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    ret i8 [[R]]
;
  %e0 = extractelement <16 x i8> %x, i32 1
  %e1 = extractelement <16 x i8> %y, i32 1
  %r = add nsw nuw i8 %e0, %e1
  ret i8 %r
}

; Negative test - eliminating extract is profitable, but vector shift is expensive.

define i8 @ext1_ext1_shl(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext1_ext1_shl(
; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
; CHECK-NEXT:    [[R:%.*]] = shl i8 [[E0]], [[E1]]
; CHECK-NEXT:    ret i8 [[R]]
;
  %e0 = extractelement <16 x i8> %x, i32 1
  %e1 = extractelement <16 x i8> %y, i32 1
  %r = shl i8 %e0, %e1
  ret i8 %r
}

; Negative test - eliminating extract is profitable, but vector multiply is expensive.

define i8 @ext13_ext13_mul(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext13_ext13_mul(
; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 13
; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 13
; CHECK-NEXT:    [[R:%.*]] = mul i8 [[E0]], [[E1]]
; CHECK-NEXT:    ret i8 [[R]]
;
  %e0 = extractelement <16 x i8> %x, i32 13
  %e1 = extractelement <16 x i8> %y, i32 13
  %r = mul i8 %e0, %e1
  ret i8 %r
}

; Negative test - cost is irrelevant because sdiv has potential UB.

define i8 @ext0_ext0_sdiv(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext0_ext0_sdiv(
; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
; CHECK-NEXT:    [[R:%.*]] = sdiv i8 [[E0]], [[E1]]
; CHECK-NEXT:    ret i8 [[R]]
;
  %e0 = extractelement <16 x i8> %x, i32 0
  %e1 = extractelement <16 x i8> %y, i32 0
  %r = sdiv i8 %e0, %e1
  ret i8 %r
}

; Extracts are free and vector op has same cost as scalar, but we
; speculatively transform to vector to create more optimization
; opportunities..

define double @ext0_ext0_fadd(<2 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext0_ext0_fadd(
; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; CHECK-NEXT:    ret double [[R]]
;
  %e0 = extractelement <2 x double> %x, i32 0
  %e1 = extractelement <2 x double> %y, i32 0
  %r = fadd double %e0, %e1
  ret double %r
}

; Eliminating extract is profitable. Flags propagate.

define double @ext1_ext1_fsub(<2 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext1_ext1_fsub(
; CHECK-NEXT:    [[TMP1:%.*]] = fsub fast <2 x double> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; CHECK-NEXT:    ret double [[R]]
;
  %e0 = extractelement <2 x double> %x, i32 1
  %e1 = extractelement <2 x double> %y, i32 1
  %r = fsub fast double %e0, %e1
  ret double %r
}

; Negative test - type mismatch.

define double @ext1_ext1_fadd_different_types(<2 x double> %x, <4 x double> %y) {
; CHECK-LABEL: @ext1_ext1_fadd_different_types(
; CHECK-NEXT:    [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x double> [[Y:%.*]], i32 1
; CHECK-NEXT:    [[R:%.*]] = fadd fast double [[E0]], [[E1]]
; CHECK-NEXT:    ret double [[R]]
;
  %e0 = extractelement <2 x double> %x, i32 1
  %e1 = extractelement <4 x double> %y, i32 1
  %r = fadd fast double %e0, %e1
  ret double %r
}

; Disguised same vector operand; scalar code is not cheaper (with default
; x86 target), so aggressively form vector binop.

define i32 @ext1_ext1_add_same_vec(<4 x i32> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec(
; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[X:%.*]], [[X]]
; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    ret i32 [[R]]
;
  %e0 = extractelement <4 x i32> %x, i32 1
  %e1 = extractelement <4 x i32> %x, i32 1
  %r = add i32 %e0, %e1
  ret i32 %r
}

; Functionally equivalent to above test; should transform as above.

define i32 @ext1_ext1_add_same_vec_cse(<4 x i32> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec_cse(
; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[X:%.*]], [[X]]
; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    ret i32 [[R]]
;
  %e0 = extractelement <4 x i32> %x, i32 1
  %r = add i32 %e0, %e0
  ret i32 %r
}

; Don't assert if extract indices have different types.

define i32 @ext1_ext1_add_same_vec_diff_idx_ty(<4 x i32> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec_diff_idx_ty(
; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[X:%.*]], [[X]]
; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    ret i32 [[R]]
;
  %e0 = extractelement <4 x i32> %x, i32 1
  %e1 = extractelement <4 x i32> %x, i64 1
  %r = add i32 %e0, %e1
  ret i32 %r
}

; Negative test - same vector operand; scalar code is cheaper than general case
;                 and vector code would be more expensive still.

define i8 @ext1_ext1_add_same_vec_extra_use0(<16 x i8> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec_extra_use0(
; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
; CHECK-NEXT:    call void @use_i8(i8 [[E0]])
; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[X]], i32 0
; CHECK-NEXT:    [[R:%.*]] = add i8 [[E0]], [[E1]]
; CHECK-NEXT:    ret i8 [[R]]
;
  %e0 = extractelement <16 x i8> %x, i32 0
  call void @use_i8(i8 %e0)
  %e1 = extractelement <16 x i8> %x, i32 0
  %r = add i8 %e0, %e1
  ret i8 %r
}

; Negative test - same vector operand; scalar code is cheaper than general case
;                 and vector code would be more expensive still.

define i8 @ext1_ext1_add_same_vec_extra_use1(<16 x i8> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec_extra_use1(
; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[X]], i32 0
; CHECK-NEXT:    call void @use_i8(i8 [[E1]])
; CHECK-NEXT:    [[R:%.*]] = add i8 [[E0]], [[E1]]
; CHECK-NEXT:    ret i8 [[R]]
;
  %e0 = extractelement <16 x i8> %x, i32 0
  %e1 = extractelement <16 x i8> %x, i32 0
  call void @use_i8(i8 %e1)
  %r = add i8 %e0, %e1
  ret i8 %r
}

; Negative test - same vector operand; scalar code is cheaper than general case
;                 and vector code would be more expensive still.

define i8 @ext1_ext1_add_same_vec_cse_extra_use(<16 x i8> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec_cse_extra_use(
; CHECK-NEXT:    [[E:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
; CHECK-NEXT:    call void @use_i8(i8 [[E]])
; CHECK-NEXT:    [[R:%.*]] = add i8 [[E]], [[E]]
; CHECK-NEXT:    ret i8 [[R]]
;
  %e = extractelement <16 x i8> %x, i32 0
  call void @use_i8(i8 %e)
  %r = add i8 %e, %e
  ret i8 %r
}

; Vector code costs the same as scalar, so aggressively form vector op.

define i8 @ext1_ext1_add_uses1(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext1_ext1_add_uses1(
; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
; CHECK-NEXT:    call void @use_i8(i8 [[E0]])
; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i8> [[X]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    ret i8 [[R]]
;
  %e0 = extractelement <16 x i8> %x, i32 0
  call void @use_i8(i8 %e0)
  %e1 = extractelement <16 x i8> %y, i32 0
  %r = add i8 %e0, %e1
  ret i8 %r
}

; Vector code costs the same as scalar, so aggressively form vector op.

define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext1_ext1_add_uses2(
; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
; CHECK-NEXT:    call void @use_i8(i8 [[E1]])
; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i8> [[X:%.*]], [[Y]]
; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    ret i8 [[R]]
;
  %e0 = extractelement <16 x i8> %x, i32 0
  %e1 = extractelement <16 x i8> %y, i32 0
  call void @use_i8(i8 %e1)
  %r = add i8 %e0, %e1
  ret i8 %r
}

define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext0_ext1_add(
; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP1:%.*]] = add nuw <16 x i8> [[X:%.*]], [[SHIFT]]
; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    ret i8 [[R]]
;
  %e0 = extractelement <16 x i8> %x, i32 0
  %e1 = extractelement <16 x i8> %y, i32 1
  %r = add nuw i8 %e0, %e1
  ret i8 %r
}

define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext5_ext0_add(
; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
; CHECK-NEXT:    ret i8 [[R]]
;
  %e0 = extractelement <16 x i8> %x, i32 5
  %e1 = extractelement <16 x i8> %y, i32 0
  %r = sub nsw i8 %e0, %e1
  ret i8 %r
}

define i8 @ext1_ext6_add(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext1_ext6_add(
; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> poison, <16 x i32> <i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i8> [[X:%.*]], [[SHIFT]]
; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    ret i8 [[R]]
;
  %e0 = extractelement <16 x i8> %x, i32 1
  %e1 = extractelement <16 x i8> %y, i32 6
  %r = and i8 %e0, %e1
  ret i8 %r
}

define float @ext1_ext0_fmul(<4 x float> %x) {
; CHECK-LABEL: @ext1_ext0_fmul(
; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[SHIFT]], [[X]]
; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
; CHECK-NEXT:    ret float [[R]]
;
  %e0 = extractelement <4 x float> %x, i32 1
  %e1 = extractelement <4 x float> %x, i32 0
  %r = fmul float %e0, %e1
  ret float %r
}

define float @ext0_ext3_fmul_extra_use1(<4 x float> %x) {
; CHECK-LABEL: @ext0_ext3_fmul_extra_use1(
; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
; CHECK-NEXT:    call void @use_f32(float [[E0]])
; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP1:%.*]] = fmul nnan <4 x float> [[X]], [[SHIFT]]
; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
; CHECK-NEXT:    ret float [[R]]
;
  %e0 = extractelement <4 x float> %x, i32 0
  call void @use_f32(float %e0)
  %e1 = extractelement <4 x float> %x, i32 3
  %r = fmul nnan float %e0, %e1
  ret float %r
}

define float @ext0_ext3_fmul_extra_use2(<4 x float> %x) {
; CHECK-LABEL: @ext0_ext3_fmul_extra_use2(
; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x float> [[X]], i32 3
; CHECK-NEXT:    call void @use_f32(float [[E1]])
; CHECK-NEXT:    [[R:%.*]] = fmul ninf nsz float [[E0]], [[E1]]
; CHECK-NEXT:    ret float [[R]]
;
  %e0 = extractelement <4 x float> %x, i32 0
  %e1 = extractelement <4 x float> %x, i32 3
  call void @use_f32(float %e1)
  %r = fmul ninf nsz float %e0, %e1
  ret float %r
}

define float @ext0_ext4_fmul_v8f32(<8 x float> %x) {
; SSE-LABEL: @ext0_ext4_fmul_v8f32(
; SSE-NEXT:    [[E0:%.*]] = extractelement <8 x float> [[X:%.*]], i32 0
; SSE-NEXT:    [[E1:%.*]] = extractelement <8 x float> [[X]], i32 4
; SSE-NEXT:    [[R:%.*]] = fadd float [[E0]], [[E1]]
; SSE-NEXT:    ret float [[R]]
;
; AVX-LABEL: @ext0_ext4_fmul_v8f32(
; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X:%.*]], <8 x float> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; AVX-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[X]], [[SHIFT]]
; AVX-NEXT:    [[R:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
; AVX-NEXT:    ret float [[R]]
;
  %e0 = extractelement <8 x float> %x, i32 0
  %e1 = extractelement <8 x float> %x, i32 4
  %r = fadd float %e0, %e1
  ret float %r
}

define float @ext7_ext4_fmul_v8f32(<8 x float> %x) {
; SSE-LABEL: @ext7_ext4_fmul_v8f32(
; SSE-NEXT:    [[E0:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
; SSE-NEXT:    [[E1:%.*]] = extractelement <8 x float> [[X]], i32 4
; SSE-NEXT:    [[R:%.*]] = fadd float [[E0]], [[E1]]
; SSE-NEXT:    ret float [[R]]
;
; AVX-LABEL: @ext7_ext4_fmul_v8f32(
; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X:%.*]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison, i32 poison, i32 poison>
; AVX-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[SHIFT]], [[X]]
; AVX-NEXT:    [[R:%.*]] = extractelement <8 x float> [[TMP1]], i64 4
; AVX-NEXT:    ret float [[R]]
;
  %e0 = extractelement <8 x float> %x, i32 7
  %e1 = extractelement <8 x float> %x, i32 4
  %r = fadd float %e0, %e1
  ret float %r
}

define float @ext0_ext8_fmul_v16f32(<16 x float> %x) {
; CHECK-LABEL: @ext0_ext8_fmul_v16f32(
; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x float> [[X:%.*]], i32 0
; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x float> [[X]], i32 8
; CHECK-NEXT:    [[R:%.*]] = fadd float [[E0]], [[E1]]
; CHECK-NEXT:    ret float [[R]]
;
  %e0 = extractelement <16 x float> %x, i32 0
  %e1 = extractelement <16 x float> %x, i32 8
  %r = fadd float %e0, %e1
  ret float %r
}

define float @ext14_ext15_fmul_v16f32(<16 x float> %x) {
; SSE-LABEL: @ext14_ext15_fmul_v16f32(
; SSE-NEXT:    [[E0:%.*]] = extractelement <16 x float> [[X:%.*]], i32 14
; SSE-NEXT:    [[E1:%.*]] = extractelement <16 x float> [[X]], i32 15
; SSE-NEXT:    [[R:%.*]] = fadd float [[E0]], [[E1]]
; SSE-NEXT:    ret float [[R]]
;
; AVX-LABEL: @ext14_ext15_fmul_v16f32(
; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x float> [[X:%.*]], <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 15, i32 poison>
; AVX-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[X]], [[SHIFT]]
; AVX-NEXT:    [[R:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
; AVX-NEXT:    ret float [[R]]
;
  %e0 = extractelement <16 x float> %x, i32 14
  %e1 = extractelement <16 x float> %x, i32 15
  %r = fadd float %e0, %e1
  ret float %r
}

define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @ins_bo_ext_ext(
; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
; CHECK-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
; CHECK-NEXT:    ret <4 x float> [[V3]]
;
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %a23 = fadd float %a2, %a3
  %v3 = insertelement <4 x float> %b, float %a23, i32 3
  ret <4 x float> %v3
}

; TODO: This is conservatively left to extract from the lower index value,
;       but it is likely that extracting from index 3 is the better option.

define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: @ins_bo_ext_ext_uses(
; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
; SSE-NEXT:    call void @use_f32(float [[A23]])
; SSE-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
; SSE-NEXT:    ret <4 x float> [[V3]]
;
; AVX-LABEL: @ins_bo_ext_ext_uses(
; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
; AVX-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
; AVX-NEXT:    call void @use_f32(float [[A23]])
; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
; AVX-NEXT:    ret <4 x float> [[V3]]
;
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %a23 = fadd float %a2, %a3
  call void @use_f32(float %a23)
  %v3 = insertelement <4 x float> %b, float %a23, i32 3
  ret <4 x float> %v3
}

define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @PR34724(
; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
; CHECK-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
; CHECK-NEXT:    [[V2:%.*]] = shufflevector <4 x float> [[V1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
; CHECK-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
; CHECK-NEXT:    ret <4 x float> [[V3]]
;
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3

  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3

  %a23 = fadd float %a2, %a3
  %b01 = fadd float %b0, %b1
  %b23 = fadd float %b2, %b3

  %v1 = insertelement <4 x float> undef, float %a23, i32 1
  %v2 = insertelement <4 x float> %v1, float %b01, i32 2
  %v3 = insertelement <4 x float> %v2, float %b23, i32 3
  ret <4 x float> %v3
}

define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: @ext_ext_or_reduction_v4i32(
; CHECK-NEXT:    [[Z:%.*]] = and <4 x i32> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP1:%.*]] = or <4 x i32> [[Z]], [[SHIFT]]
; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> [[TMP1]], [[SHIFT1]]
; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[SHIFT2]], [[TMP2]]
; CHECK-NEXT:    [[Z0123:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
; CHECK-NEXT:    ret i32 [[Z0123]]
;
  %z = and <4 x i32> %x, %y
  %z0 = extractelement <4 x i32> %z, i32 0
  %z1 = extractelement <4 x i32> %z, i32 1
  %z01 = or i32 %z0, %z1
  %z2 = extractelement <4 x i32> %z, i32 2
  %z012 = or i32 %z01, %z2
  %z3 = extractelement <4 x i32> %z, i32 3
  %z0123 = or i32 %z3, %z012
  ret i32 %z0123
}

define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
; CHECK-LABEL: @ext_ext_partial_add_reduction_v4i32(
; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[X]]
; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[SHIFT1]], [[TMP1]]
; CHECK-NEXT:    [[X210:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
; CHECK-NEXT:    ret i32 [[X210]]
;
  %x0 = extractelement <4 x i32> %x, i32 0
  %x1 = extractelement <4 x i32> %x, i32 1
  %x10 = add i32 %x1, %x0
  %x2 = extractelement <4 x i32> %x, i32 2
  %x210 = add i32 %x2, %x10
  ret i32 %x210
}

define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: @ext_ext_partial_add_reduction_and_extra_add_v4i32(
; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[Y]]
; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[SHIFT1]], [[TMP1]]
; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHIFT2]], [[TMP2]]
; CHECK-NEXT:    [[X2Y210:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
; CHECK-NEXT:    ret i32 [[X2Y210]]
;
  %y0 = extractelement <4 x i32> %y, i32 0
  %y1 = extractelement <4 x i32> %y, i32 1
  %y10 = add i32 %y1, %y0
  %y2 = extractelement <4 x i32> %y, i32 2
  %y210 = add i32 %y2, %y10
  %x2 = extractelement <4 x i32> %x, i32 2
  %x2y210 = add i32 %x2, %y210
  ret i32 %x2y210
}

define i32 @constant_fold_crash(<4 x i32> %x) {
; CHECK-LABEL: @constant_fold_crash(
; CHECK-NEXT:    [[A:%.*]] = extractelement <4 x i32> <i32 16, i32 17, i32 18, i32 19>, i32 1
; CHECK-NEXT:    [[B:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[B]]
; CHECK-NEXT:    ret i32 [[C]]
;
  %a = extractelement <4 x i32> <i32 16, i32 17, i32 18, i32 19>, i32 1
  %b = extractelement <4 x i32> %x, i32 0
  %c = add i32 %a, %b
  ret i32 %c
}

define float @constant_fold_crash_commute(<4 x float> %x) {
; CHECK-LABEL: @constant_fold_crash_commute(
; CHECK-NEXT:    [[A:%.*]] = extractelement <4 x float> <float 1.600000e+01, float 1.700000e+01, float 1.800000e+01, float 1.900000e+01>, i32 3
; CHECK-NEXT:    [[B:%.*]] = extractelement <4 x float> [[X:%.*]], i32 1
; CHECK-NEXT:    [[C:%.*]] = fadd float [[B]], [[A]]
; CHECK-NEXT:    ret float [[C]]
;
  %a = extractelement <4 x float> <float 16.0, float 17.0, float 18.0, float 19.0>, i32 3
  %b = extractelement <4 x float> %x, i32 1
  %c = fadd float %b, %a
  ret float %c
}