; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
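; Both RUN lines share the CHECK prefix; the separate SSE and AVX prefixes
; exist for the one test below (ins3_ins3_fdiv) where the target cost model
; changes the outcome.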

declare void @use(<4 x i32>)
declare void @usef(<4 x float>)

; Eliminating an insert is profitable.
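; A sketch of the rewrite being tested (value names are illustrative): two
; single-lane inserts feeding one vector binop become one scalar binop
; feeding a single insert, e.g.:
;   %i0 = insertelement <16 x i8> undef, i8 %x, i32 0
;   %i1 = insertelement <16 x i8> undef, i8 %y, i32 0
;   %r  = add <16 x i8> %i0, %i1
; -->
;   %r.scalar = add i8 %x, %y
;   %r = insertelement <16 x i8> undef, i8 %r.scalar, i64 0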

define <16 x i8> @ins0_ins0_add(i8 %x, i8 %y) {
; CHECK-LABEL: @ins0_ins0_add(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = add i8 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <16 x i8> undef, i8 [[R_SCALAR]], i64 0
; CHECK-NEXT:    ret <16 x i8> [[R]]
;
  %i0 = insertelement <16 x i8> undef, i8 %x, i32 0
  %i1 = insertelement <16 x i8> undef, i8 %y, i32 0
  %r = add <16 x i8> %i0, %i1
  ret <16 x i8> %r
}

; Eliminating an insert is still profitable. Flags propagate. Mismatched index operand types are ok.
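; (Observation from the checks below: the nsw/nuw flags on the vector sub
; carry over to the scalar sub, and the i8 and i32 index operands both
; denote lane 5, so the type mismatch does not block the fold.)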

define <8 x i16> @ins0_ins0_sub_flags(i16 %x, i16 %y) {
; CHECK-LABEL: @ins0_ins0_sub_flags(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = sub nuw nsw i16 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[R_SCALAR]], i64 5
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %i0 = insertelement <8 x i16> undef, i16 %x, i8 5
  %i1 = insertelement <8 x i16> undef, i16 %y, i32 5
  %r = sub nsw nuw <8 x i16> %i0, %i1
  ret <8 x i16> %r
}

; The new vector constant is calculated by constant folding.
; This is conservatively created as zero rather than undef for 'undef ^ undef'.
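; ('undef ^ undef' may fold to undef, but folding it to zero is a safe
; refinement, since zero is one of the values undef could take; hence the
; zeroinitializer base vector below is correct, just conservative.)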

define <2 x i64> @ins1_ins1_xor(i64 %x, i64 %y) {
; CHECK-LABEL: @ins1_ins1_xor(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = xor i64 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[R_SCALAR]], i64 1
; CHECK-NEXT:    ret <2 x i64> [[R]]
;
  %i0 = insertelement <2 x i64> undef, i64 %x, i64 1
  %i1 = insertelement <2 x i64> undef, i64 %y, i32 1
  %r = xor <2 x i64> %i0, %i1
  ret <2 x i64> %r
}

define <2 x i64> @ins1_ins1_iterate(i64 %w, i64 %x, i64 %y, i64 %z) {
; CHECK-LABEL: @ins1_ins1_iterate(
; CHECK-NEXT:    [[S0_SCALAR:%.*]] = sub i64 [[W:%.*]], [[X:%.*]]
; CHECK-NEXT:    [[S1_SCALAR:%.*]] = or i64 [[S0_SCALAR]], [[Y:%.*]]
; CHECK-NEXT:    [[S2_SCALAR:%.*]] = shl i64 [[Z:%.*]], [[S1_SCALAR]]
; CHECK-NEXT:    [[S2:%.*]] = insertelement <2 x i64> poison, i64 [[S2_SCALAR]], i64 1
; CHECK-NEXT:    ret <2 x i64> [[S2]]
;
  %i0 = insertelement <2 x i64> undef, i64 %w, i64 1
  %i1 = insertelement <2 x i64> undef, i64 %x, i32 1
  %s0 = sub <2 x i64> %i0, %i1
  %i2 = insertelement <2 x i64> undef, i64 %y, i32 1
  %s1 = or <2 x i64> %s0, %i2
  %i3 = insertelement <2 x i64> undef, i64 %z, i32 1
  %s2 = shl <2 x i64> %i3, %s1
  ret <2 x i64> %s2
}

; The inserts are free, but it's still better to scalarize.
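; (Interpretation, not pass output: even when lane-0 FP inserts are free, as
; they are here, scalarizing still nets out, because two inserts plus a
; vector fadd become one scalar fadd plus one insert.)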

define <2 x double> @ins0_ins0_fadd(double %x, double %y) {
; CHECK-LABEL: @ins0_ins0_fadd(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = fadd reassoc nsz double [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x double> undef, double [[R_SCALAR]], i64 0
; CHECK-NEXT:    ret <2 x double> [[R]]
;
  %i0 = insertelement <2 x double> undef, double %x, i32 0
  %i1 = insertelement <2 x double> undef, double %y, i32 0
  %r = fadd reassoc nsz <2 x double> %i0, %i1
  ret <2 x double> %r
}

; Negative test - mismatched indexes (but could fold this).

define <16 x i8> @ins1_ins0_add(i8 %x, i8 %y) {
; CHECK-LABEL: @ins1_ins0_add(
; CHECK-NEXT:    [[I0:%.*]] = insertelement <16 x i8> undef, i8 [[X:%.*]], i32 1
; CHECK-NEXT:    [[I1:%.*]] = insertelement <16 x i8> undef, i8 [[Y:%.*]], i32 0
; CHECK-NEXT:    [[R:%.*]] = add <16 x i8> [[I0]], [[I1]]
; CHECK-NEXT:    ret <16 x i8> [[R]]
;
  %i0 = insertelement <16 x i8> undef, i8 %x, i32 1
  %i1 = insertelement <16 x i8> undef, i8 %y, i32 0
  %r = add <16 x i8> %i0, %i1
  ret <16 x i8> %r
}

; Base vector does not have to be undef.

define <4 x i32> @ins0_ins0_mul(i32 %x, i32 %y) {
; CHECK-LABEL: @ins0_ins0_mul(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[R_SCALAR]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %i0 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0
  %i1 = insertelement <4 x i32> undef, i32 %y, i32 0
  %r = mul <4 x i32> %i0, %i1
  ret <4 x i32> %r
}

; It is safe to scalarize any binop (no extra UB/poison danger).
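; (Reasoning, not pass output: the scalar sdiv consumes exactly the operands
; that lane 1 of the vector sdiv would have consumed, and the other lane is
; constant-folded at compile time, so the transform cannot introduce UB that
; the vector op did not already have.)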

define <2 x i64> @ins1_ins1_sdiv(i64 %x, i64 %y) {
; CHECK-LABEL: @ins1_ins1_sdiv(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = sdiv i64 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> <i64 -6, i64 0>, i64 [[R_SCALAR]], i64 1
; CHECK-NEXT:    ret <2 x i64> [[R]]
;
  %i0 = insertelement <2 x i64> <i64 42, i64 -42>, i64 %x, i64 1
  %i1 = insertelement <2 x i64> <i64 -7, i64 128>, i64 %y, i32 1
  %r = sdiv <2 x i64> %i0, %i1
  ret <2 x i64> %r
}

; Constant folding deals with undef per element - the entire value does not become undef.
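; (Per-lane folding in the checks below: lane 0 folds '42 udiv 7' to 6,
; while lane 1 ('undef udiv undef') becomes poison; lane 1 is then
; overwritten by the scalar result anyway.)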

define <2 x i64> @ins1_ins1_udiv(i64 %x, i64 %y) {
; CHECK-LABEL: @ins1_ins1_udiv(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = udiv i64 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> <i64 6, i64 poison>, i64 [[R_SCALAR]], i64 1
; CHECK-NEXT:    ret <2 x i64> [[R]]
;
  %i0 = insertelement <2 x i64> <i64 42, i64 undef>, i64 %x, i32 1
  %i1 = insertelement <2 x i64> <i64 7, i64 undef>, i64 %y, i32 1
  %r = udiv <2 x i64> %i0, %i1
  ret <2 x i64> %r
}

; This could be simplified further: without the transform, the divisor has an
; undef element (lane 0), which makes the urem immediate UB; the transform
; hides that UB behind constant folding.

define <2 x i64> @ins1_ins1_urem(i64 %x, i64 %y) {
; CHECK-LABEL: @ins1_ins1_urem(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = urem i64 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[R_SCALAR]], i64 1
; CHECK-NEXT:    ret <2 x i64> [[R]]
;
  %i0 = insertelement <2 x i64> <i64 42, i64 undef>, i64 %x, i64 1
  %i1 = insertelement <2 x i64> <i64 undef, i64 128>, i64 %y, i32 1
  %r = urem <2 x i64> %i0, %i1
  ret <2 x i64> %r
}

; Extra use is accounted for in cost calculation.
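; (Here %i0 has a second user, the call, so it must stay; the transform is
; evidently still profitable because %i1 disappears entirely and its lane
; value feeds the scalar xor directly.)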

define <4 x i32> @ins0_ins0_xor(i32 %x, i32 %y) {
; CHECK-LABEL: @ins0_ins0_xor(
; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
; CHECK-NEXT:    call void @use(<4 x i32> [[I0]])
; CHECK-NEXT:    [[R_SCALAR:%.*]] = xor i32 [[X]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[R_SCALAR]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %i0 = insertelement <4 x i32> undef, i32 %x, i32 0
  call void @use(<4 x i32> %i0)
  %i1 = insertelement <4 x i32> undef, i32 %y, i32 0
  %r = xor <4 x i32> %i0, %i1
  ret <4 x i32> %r
}

; Extra use is accounted for in cost calculation.

define <4 x float> @ins1_ins1_fmul(float %x, float %y) {
; CHECK-LABEL: @ins1_ins1_fmul(
; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 1
; CHECK-NEXT:    call void @usef(<4 x float> [[I1]])
; CHECK-NEXT:    [[R_SCALAR:%.*]] = fmul float [[X:%.*]], [[Y]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[R_SCALAR]], i64 1
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %i0 = insertelement <4 x float> undef, float %x, i32 1
  %i1 = insertelement <4 x float> undef, float %y, i32 1
  call void @usef(<4 x float> %i1)
  %r = fmul <4 x float> %i0, %i1
  ret <4 x float> %r
}

; If the scalar binop is not cheaper than the vector binop, extra uses can prevent the transform.
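; (Interpretation: scalar and vector fsub cost about the same, and with both
; inserts kept alive by the calls the transform would only add instructions,
; so it is correctly rejected below.)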

define <4 x float> @ins2_ins2_fsub(float %x, float %y) {
; CHECK-LABEL: @ins2_ins2_fsub(
; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 2
; CHECK-NEXT:    call void @usef(<4 x float> [[I0]])
; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 2
; CHECK-NEXT:    call void @usef(<4 x float> [[I1]])
; CHECK-NEXT:    [[R:%.*]] = fsub <4 x float> [[I0]], [[I1]]
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %i0 = insertelement <4 x float> undef, float %x, i32 2
  call void @usef(<4 x float> %i0)
  %i1 = insertelement <4 x float> undef, float %y, i32 2
  call void @usef(<4 x float> %i1)
  %r = fsub <4 x float> %i0, %i1
  ret <4 x float> %r
}

; It may be worth scalarizing an expensive binop even if both inserts have extra uses.
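; (This is the test that splits the SSE and AVX check prefixes: under the
; SSE2 cost model the scalar fdiv plus insert beats the vector fdiv, so the
; transform fires; under AVX2 it evidently does not, so the vector fdiv
; remains.)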

define <4 x float> @ins3_ins3_fdiv(float %x, float %y) {
; SSE-LABEL: @ins3_ins3_fdiv(
; SSE-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 3
; SSE-NEXT:    call void @usef(<4 x float> [[I0]])
; SSE-NEXT:    [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 3
; SSE-NEXT:    call void @usef(<4 x float> [[I1]])
; SSE-NEXT:    [[R_SCALAR:%.*]] = fdiv float [[X]], [[Y]]
; SSE-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[R_SCALAR]], i64 3
; SSE-NEXT:    ret <4 x float> [[R]]
;
; AVX-LABEL: @ins3_ins3_fdiv(
; AVX-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 3
; AVX-NEXT:    call void @usef(<4 x float> [[I0]])
; AVX-NEXT:    [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 3
; AVX-NEXT:    call void @usef(<4 x float> [[I1]])
; AVX-NEXT:    [[R:%.*]] = fdiv <4 x float> [[I0]], [[I1]]
; AVX-NEXT:    ret <4 x float> [[R]]
;
  %i0 = insertelement <4 x float> undef, float %x, i32 3
  call void @usef(<4 x float> %i0)
  %i1 = insertelement <4 x float> undef, float %y, i32 3
  call void @usef(<4 x float> %i1)
  %r = fdiv <4 x float> %i0, %i1
  ret <4 x float> %r
}