xref: /llvm-project/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll (revision 38fffa630ee80163dc65e759392ad29798905679)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
3; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
4
; External sinks. Passing an insertelement result to one of these gives that
; vector an extra use besides the compare (see the *_use / *_uses tests).
5declare void @use(<4 x i32>)
6declare void @usef(<4 x float>)
7
8; Eliminating an insert is profitable.
; Both compare operands insert a scalar into lane 0 of an undef <16 x i8>.
; Expected output: a scalar `icmp eq` of %x and %y, then one insertelement of
; that i1 into lane 0 of an undef <16 x i1>.
9
10define <16 x i1> @ins0_ins0_i8(i8 %x, i8 %y) {
11; CHECK-LABEL: @ins0_ins0_i8(
12; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp eq i8 [[X:%.*]], [[Y:%.*]]
13; CHECK-NEXT:    [[R:%.*]] = insertelement <16 x i1> undef, i1 [[R_SCALAR]], i64 0
14; CHECK-NEXT:    ret <16 x i1> [[R]]
15;
16  %i0 = insertelement <16 x i8> undef, i8 %x, i32 0
17  %i1 = insertelement <16 x i8> undef, i8 %y, i32 0
18  %r = icmp eq <16 x i8> %i0, %i1
19  ret <16 x i1> %r
20}
21
22; Eliminating an insert is still profitable. Mismatched index types are ok.
; Both inserts target lane 5, but one index is typed i8 and the other i32.
; Expected output: a scalar `icmp sgt` of %x and %y, inserted at lane 5 of an
; undef <8 x i1> (the output index is printed as i64).
23
24define <8 x i1> @ins5_ins5_i16(i16 %x, i16 %y) {
25; CHECK-LABEL: @ins5_ins5_i16(
26; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp sgt i16 [[X:%.*]], [[Y:%.*]]
27; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i1> undef, i1 [[R_SCALAR]], i64 5
28; CHECK-NEXT:    ret <8 x i1> [[R]]
29;
30  %i0 = insertelement <8 x i16> undef, i16 %x, i8 5
31  %i1 = insertelement <8 x i16> undef, i16 %y, i32 5
32  %r = icmp sgt <8 x i16> %i0, %i1
33  ret <8 x i1> %r
34}
35
36; The new vector constant is calculated by constant folding.
; The base vectors are zeroinitializer and <i64 1, i64 -1>. Comparing them
; lane by lane with `sle` (0 <= 1, 0 <= -1) gives <i1 true, i1 false>, which
; is the base the scalar compare result is inserted into at lane 1.
37
38define <2 x i1> @ins1_ins1_i64(i64 %x, i64 %y) {
39; CHECK-LABEL: @ins1_ins1_i64(
40; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp sle i64 [[X:%.*]], [[Y:%.*]]
41; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i1> <i1 true, i1 false>, i1 [[R_SCALAR]], i64 1
42; CHECK-NEXT:    ret <2 x i1> [[R]]
43;
44  %i0 = insertelement <2 x i64> zeroinitializer, i64 %x, i64 1
45  %i1 = insertelement <2 x i64> <i64 1, i64 -1>, i64 %y, i32 1
46  %r = icmp sle <2 x i64> %i0, %i1
47  ret <2 x i1> %r
48}
49
50; The inserts are free, but it's still better to scalarize.
; Floating-point variant: the vector fcmp carries the nnan and ninf flags and
; the expected scalar fcmp carries the same flags. The scalar result is
; inserted at lane 0 of a splat (i1 true) base.
51
52define <2 x i1> @ins0_ins0_f64(double %x, double %y) {
53; CHECK-LABEL: @ins0_ins0_f64(
54; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp nnan ninf uge double [[X:%.*]], [[Y:%.*]]
55; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i1> splat (i1 true), i1 [[R_SCALAR]], i64 0
56; CHECK-NEXT:    ret <2 x i1> [[R]]
57;
58  %i0 = insertelement <2 x double> undef, double %x, i32 0
59  %i1 = insertelement <2 x double> undef, double %y, i32 0
60  %r = fcmp nnan ninf uge <2 x double> %i0, %i1
61  ret <2 x i1> %r
62}
63
64; Negative test - mismatched indexes (but could fold this).
; %x is inserted at lane 1 while %y is inserted at lane 0. The expected output
; keeps both inserts and the vector icmp exactly as written.
65
66define <16 x i1> @ins1_ins0_i8(i8 %x, i8 %y) {
67; CHECK-LABEL: @ins1_ins0_i8(
68; CHECK-NEXT:    [[I0:%.*]] = insertelement <16 x i8> undef, i8 [[X:%.*]], i32 1
69; CHECK-NEXT:    [[I1:%.*]] = insertelement <16 x i8> undef, i8 [[Y:%.*]], i32 0
70; CHECK-NEXT:    [[R:%.*]] = icmp sle <16 x i8> [[I0]], [[I1]]
71; CHECK-NEXT:    ret <16 x i1> [[R]]
72;
73  %i0 = insertelement <16 x i8> undef, i8 %x, i32 1
74  %i1 = insertelement <16 x i8> undef, i8 %y, i32 0
75  %r = icmp sle <16 x i8> %i0, %i1
76  ret <16 x i1> %r
77}
78
79; Base vector does not have to be undef.
; Operand 0 inserts into zeroinitializer while operand 1 inserts into undef.
; Expected output: a scalar `icmp ne` of %x and %y inserted at lane 0 of an
; undef <4 x i1>.
80
81define <4 x i1> @ins0_ins0_i32(i32 %x, i32 %y) {
82; CHECK-LABEL: @ins0_ins0_i32(
83; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]]
84; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> undef, i1 [[R_SCALAR]], i64 0
85; CHECK-NEXT:    ret <4 x i1> [[R]]
86;
87  %i0 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0
88  %i1 = insertelement <4 x i32> undef, i32 %y, i32 0
89  %r = icmp ne <4 x i32> %i0, %i1
90  ret <4 x i1> %r
91}
92
93; Extra use is accounted for in cost calculation.
; %i0 is also passed to @use, so that insert stays in the output together with
; the call. The compare is still expected to become a scalar `icmp ugt`
; followed by a single insertelement; the insert of %y disappears.
94
95define <4 x i1> @ins0_ins0_i32_use(i32 %x, i32 %y) {
96; CHECK-LABEL: @ins0_ins0_i32_use(
97; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
98; CHECK-NEXT:    call void @use(<4 x i32> [[I0]])
99; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp ugt i32 [[X]], [[Y:%.*]]
100; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> undef, i1 [[R_SCALAR]], i64 0
101; CHECK-NEXT:    ret <4 x i1> [[R]]
102;
103  %i0 = insertelement <4 x i32> undef, i32 %x, i32 0
104  call void @use(<4 x i32> %i0)
105  %i1 = insertelement <4 x i32> undef, i32 %y, i32 0
106  %r = icmp ugt <4 x i32> %i0, %i1
107  ret <4 x i1> %r
108}
109
110; Extra use is accounted for in cost calculation.
; Floating-point variant where the second operand (%i1) has the extra use via
; @usef. The expected output keeps %i1 and its call, drops the insert of %x,
; and inserts a scalar `fcmp ogt` result at lane 1 of a zeroinitializer base.
111
112define <4 x i1> @ins1_ins1_f32_use(float %x, float %y) {
113; CHECK-LABEL: @ins1_ins1_f32_use(
114; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 1
115; CHECK-NEXT:    call void @usef(<4 x float> [[I1]])
116; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp ogt float [[X:%.*]], [[Y]]
117; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> zeroinitializer, i1 [[R_SCALAR]], i64 1
118; CHECK-NEXT:    ret <4 x i1> [[R]]
119;
120  %i0 = insertelement <4 x float> undef, float %x, i32 1
121  %i1 = insertelement <4 x float> undef, float %y, i32 1
122  call void @usef(<4 x float> %i1)
123  %r = fcmp ogt <4 x float> %i0, %i1
124  ret <4 x i1> %r
125}
126
127; If the scalar cmp is not cheaper than the vector cmp, extra uses can prevent the transform.
; Negative test: both inserts are passed to @usef, so neither insert can go
; away. The expected output is the input unchanged, with the vector
; `fcmp oeq` kept.
128
129define <4 x i1> @ins2_ins2_f32_uses(float %x, float %y) {
130; CHECK-LABEL: @ins2_ins2_f32_uses(
131; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 2
132; CHECK-NEXT:    call void @usef(<4 x float> [[I0]])
133; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 2
134; CHECK-NEXT:    call void @usef(<4 x float> [[I1]])
135; CHECK-NEXT:    [[R:%.*]] = fcmp oeq <4 x float> [[I0]], [[I1]]
136; CHECK-NEXT:    ret <4 x i1> [[R]]
137;
138  %i0 = insertelement <4 x float> undef, float %x, i32 2
139  call void @usef(<4 x float> %i0)
140  %i1 = insertelement <4 x float> undef, float %y, i32 2
141  call void @usef(<4 x float> %i1)
142  %r = fcmp oeq <4 x float> %i0, %i1
143  ret <4 x i1> %r
144}
145
; Constant vector as operand 1: lane 0 is 42 and lane 1 is undef; %x is
; inserted at lane 0. Expected output: a scalar `icmp ne` of %x against 42,
; inserted at lane 0 of an undef <2 x i1>.
146define <2 x i1> @constant_op1_i64(i64 %x) {
147; CHECK-LABEL: @constant_op1_i64(
148; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp ne i64 [[X:%.*]], 42
149; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i1> undef, i1 [[R_SCALAR]], i64 0
150; CHECK-NEXT:    ret <2 x i1> [[R]]
151;
152  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
153  %r = icmp ne <2 x i64> %ins, <i64 42, i64 undef>
154  ret <2 x i1> %r
155}
156
; Constant vector as operand 1 with no undef lanes (<i64 42, i64 -42>); %x is
; inserted at lane 0. Expected output: a scalar `icmp sge` of %x against the
; lane-0 constant 42, inserted at lane 0 of a splat (i1 true) base.
157define <2 x i1> @constant_op1_i64_not_undef_lane(i64 %x) {
158; CHECK-LABEL: @constant_op1_i64_not_undef_lane(
159; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp sge i64 [[X:%.*]], 42
160; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i1> splat (i1 true), i1 [[R_SCALAR]], i64 0
161; CHECK-NEXT:    ret <2 x i1> [[R]]
162;
163  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
164  %r = icmp sge <2 x i64> %ins, <i64 42, i64 -42>
165  ret <2 x i1> %r
166}
167
167; negative test - load prevents the transform
; The inserted scalar is produced by a load instead of being a function
; argument. The expected output keeps the load, the insert and the vector
; `icmp eq` against <i64 42, i64 -42>.
168
169define <2 x i1> @constant_op1_i64_load(ptr %p) {
170; CHECK-LABEL: @constant_op1_i64_load(
171; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
172; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 0
173; CHECK-NEXT:    [[R:%.*]] = icmp eq <2 x i64> [[INS]], <i64 42, i64 -42>
174; CHECK-NEXT:    ret <2 x i1> [[R]]
175;
176  %ld = load i64, ptr %p
177  %ins = insertelement <2 x i64> undef, i64 %ld, i32 0
178  %r = icmp eq <2 x i64> %ins, <i64 42, i64 -42>
179  ret <2 x i1> %r
180}
182
; Constant vector as operand 0: only lane 1 is defined (-42), matching the
; lane %x is inserted at. Expected output: a scalar `icmp ult` with the
; constant kept on the left-hand side (-42, %x), inserted at lane 1 of a
; zeroinitializer base.
183define <4 x i1> @constant_op0_i32(i32 %x) {
184; CHECK-LABEL: @constant_op0_i32(
185; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp ult i32 -42, [[X:%.*]]
186; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> zeroinitializer, i1 [[R_SCALAR]], i64 1
187; CHECK-NEXT:    ret <4 x i1> [[R]]
188;
189  %ins = insertelement <4 x i32> undef, i32 %x, i32 1
190  %r = icmp ult <4 x i32> <i32 undef, i32 -42, i32 undef, i32 undef>, %ins
191  ret <4 x i1> %r
192}
193
; Constant vector as operand 0 with every lane defined (<1, 42, 42, -42>); %x
; is inserted at lane 1. Expected output: a scalar `icmp ule` of the lane-1
; constant 42 against %x, inserted at lane 1 of a splat (i1 true) base.
194define <4 x i1> @constant_op0_i32_not_undef_lane(i32 %x) {
195; CHECK-LABEL: @constant_op0_i32_not_undef_lane(
196; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp ule i32 42, [[X:%.*]]
197; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> splat (i1 true), i1 [[R_SCALAR]], i64 1
198; CHECK-NEXT:    ret <4 x i1> [[R]]
199;
200  %ins = insertelement <4 x i32> undef, i32 %x, i32 1
201  %r = icmp ule <4 x i32> <i32 1, i32 42, i32 42, i32 -42>, %ins
202  ret <4 x i1> %r
203}
204
; Floating-point constant as operand 0 (<42.0, undef>); %x is inserted at
; lane 0. Expected output: a scalar `fcmp fast olt` of 42.0 against %x (the
; `fast` flag is kept), inserted at lane 0 of a zeroinitializer base.
205define <2 x i1> @constant_op0_f64(double %x) {
206; CHECK-LABEL: @constant_op0_f64(
207; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp fast olt double 4.200000e+01, [[X:%.*]]
208; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i1> zeroinitializer, i1 [[R_SCALAR]], i64 0
209; CHECK-NEXT:    ret <2 x i1> [[R]]
210;
211  %ins = insertelement <2 x double> undef, double %x, i32 0
212  %r = fcmp fast olt <2 x double> <double 42.0, double undef>, %ins
213  ret <2 x i1> %r
214}
215
; Floating-point constant as operand 0 with both lanes defined
; (<42.0, -42.0>); %x is inserted at lane 1. Expected output: a scalar
; `fcmp nnan ueq` of the lane-1 constant -42.0 against %x (the nnan flag is
; kept), inserted at lane 1 of a splat (i1 true) base.
216define <2 x i1> @constant_op0_f64_not_undef_lane(double %x) {
217; CHECK-LABEL: @constant_op0_f64_not_undef_lane(
218; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp nnan ueq double -4.200000e+01, [[X:%.*]]
219; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i1> splat (i1 true), i1 [[R_SCALAR]], i64 1
220; CHECK-NEXT:    ret <2 x i1> [[R]]
221;
222  %ins = insertelement <2 x double> undef, double %x, i32 1
223  %r = fcmp nnan ueq <2 x double> <double 42.0, double -42.0>, %ins
224  ret <2 x i1> %r
225}
226
; Floating-point constant as operand 1 (<undef, 42.0>); %x is inserted at
; lane 1. Expected output: a scalar `fcmp one` of %x against 42.0, inserted at
; lane 1 of a zeroinitializer base.
227define <2 x i1> @constant_op1_f64(double %x) {
228; CHECK-LABEL: @constant_op1_f64(
229; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp one double [[X:%.*]], 4.200000e+01
230; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i1> zeroinitializer, i1 [[R_SCALAR]], i64 1
231; CHECK-NEXT:    ret <2 x i1> [[R]]
232;
233  %ins = insertelement <2 x double> undef, double %x, i32 1
234  %r = fcmp one <2 x double> %ins, <double undef, double 42.0>
235  ret <2 x i1> %r
236}
237
; Floating-point constant as operand 1 with every lane defined
; (<42.0, -42.0, 0.0, 1.0>); %x is inserted at lane 0. Expected output: a
; scalar `fcmp uge` of %x against the lane-0 constant 42.0, inserted at lane 0
; of a splat (i1 true) base.
238define <4 x i1> @constant_op1_f32_not_undef_lane(float %x) {
239; CHECK-LABEL: @constant_op1_f32_not_undef_lane(
240; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp uge float [[X:%.*]], 4.200000e+01
241; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> splat (i1 true), i1 [[R_SCALAR]], i64 0
242; CHECK-NEXT:    ret <4 x i1> [[R]]
243;
244  %ins = insertelement <4 x float> undef, float %x, i32 0
245  %r = fcmp uge <4 x float> %ins, <float 42.0, float -42.0, float 0.0, float 1.0>
246  ret <4 x i1> %r
247}
248
249; negative test - select prevents the transform
; The <4 x i1> result of the icmp is the condition of a vector select between
; %x and %y. The expected output keeps both inserts, the vector `icmp eq` and
; the select as written.
250
251define <4 x float> @vec_select_use1(<4 x float> %x, <4 x float> %y, i32 %a, i32 %b) {
252; CHECK-LABEL: @vec_select_use1(
253; CHECK-NEXT:    [[VECA:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i8 0
254; CHECK-NEXT:    [[VECB:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i8 0
255; CHECK-NEXT:    [[COND:%.*]] = icmp eq <4 x i32> [[VECA]], [[VECB]]
256; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND]], <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]]
257; CHECK-NEXT:    ret <4 x float> [[R]]
258;
259  %veca = insertelement <4 x i32> undef, i32 %a, i8 0
260  %vecb = insertelement <4 x i32> undef, i32 %b, i8 0
261  %cond = icmp eq <4 x i32> %veca, %vecb
262  %r = select <4 x i1> %cond, <4 x float> %x, <4 x float> %y
263  ret <4 x float> %r
264}
265
266; negative test - select prevents the transform
; Single-insert variant: the inserted float is compared `oeq` against a
; zeroinitializer constant and the result is the condition of a vector select.
; The expected output keeps the insert, the vector fcmp and the select.
267
268define <4 x float> @vec_select_use2(<4 x float> %x, <4 x float> %y, float %a) {
269; CHECK-LABEL: @vec_select_use2(
270; CHECK-NEXT:    [[VECA:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i8 0
271; CHECK-NEXT:    [[COND:%.*]] = fcmp oeq <4 x float> [[VECA]], zeroinitializer
272; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND]], <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]]
273; CHECK-NEXT:    ret <4 x float> [[R]]
274;
275  %veca = insertelement <4 x float> undef, float %a, i8 0
276  %cond = fcmp oeq <4 x float> %veca, zeroinitializer
277  %r = select <4 x i1> %cond, <4 x float> %x, <4 x float> %y
278  ret <4 x float> %r
279}
280
; Pointer element type: %t1 is inserted at lane 0 of an undef <4 x ptr> and
; compared `ne` against zeroinitializer. Expected output: a scalar `icmp ne`
; of %t1 against null, inserted at lane 0 of an undef <4 x i1>.
280define <4 x i1> @vector_of_pointers(ptr %t1) {
281; CHECK-LABEL: @vector_of_pointers(
282; CHECK-NEXT:    [[T6_SCALAR:%.*]] = icmp ne ptr [[T1:%.*]], null
283; CHECK-NEXT:    [[T6:%.*]] = insertelement <4 x i1> undef, i1 [[T6_SCALAR]], i64 0
284; CHECK-NEXT:    ret <4 x i1> [[T6]]
285;
286  %t5 = insertelement <4 x ptr> undef, ptr %t1, i32 0
287  %t6 = icmp ne <4 x ptr> %t5, zeroinitializer
288  ret <4 x i1> %t6
289}
291