; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX

; fold (mul x, 0) -> 0
define <4 x i32> @combine_vec_mul_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; fold (mul x, 1) -> x
define <4 x i32> @combine_vec_mul_one(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_one:
; SSE:       # %bb.0:
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_one:
; AVX:       # %bb.0:
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (mul x, -1) -> 0-x
define <4 x i32> @combine_vec_mul_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; fold (mul x, (1 << c)) -> x << c
define <4 x i32> @combine_vec_mul_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_pow2a:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %1
}

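; With the non-uniform power-of-two multiplier <1,2,4,16>, AVX2 can use a per-element
; variable shift (vpsllvd by <0,1,2,4>); SSE4.1 has no per-element variable shift, so
; the multiply is kept as a pmulld by the original constant.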
define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_pow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_pow2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 1, i32 2, i32 4, i32 16>
  ret <4 x i32> %1
}

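; Same fold for v4i64: AVX2 lowers to a single vpsllvq, while the SSE version works on
; each 128-bit half with paddq/psllq and recombines the lanes with pblendw.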
define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_pow2c:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    paddq %xmm0, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psllq $4, %xmm2
; SSE-NEXT:    psllq $2, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_pow2c:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = mul <4 x i64> %x, <i64 1, i64 2, i64 4, i64 16>
  ret <4 x i64> %1
}

; fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
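; e.g. the uniform -2 case below becomes x+x (the shift) negated by a psubd from zero.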
define <4 x i32> @combine_vec_mul_negpow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_negpow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_negpow2a:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 -2, i32 -2, i32 -2, i32 -2>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_mul_negpow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_negpow2b:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_negpow2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 -1, i32 -2, i32 -4, i32 -16>
  ret <4 x i32> %1
}

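; For v4i64 with the mixed constants <-1,-2,-4,-16> neither target forms a negated
; shift; both fall back to the generic 64-bit multiply expansion built from pmuludq,
; psllq and paddq.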
define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_negpow2c:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxbd {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmovsxbq {{.*#+}} xmm5 = [18446744073709551615,18446744073709551614]
; SSE-NEXT:    pmuludq %xmm5, %xmm4
; SSE-NEXT:    paddq %xmm3, %xmm4
; SSE-NEXT:    psllq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm5, %xmm0
; SSE-NEXT:    paddq %xmm4, %xmm0
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmovsxbq {{.*#+}} xmm4 = [18446744073709551612,18446744073709551600]
; SSE-NEXT:    pmuludq %xmm4, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm4, %xmm1
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_negpow2c:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [18446744073709551615,18446744073709551614,18446744073709551612,18446744073709551600]
; AVX-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
; AVX-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vpsllq $32, %ymm1, %ymm1
; AVX-NEXT:    vpmuludq %ymm3, %ymm0, %ymm0
; AVX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = mul <4 x i64> %x, <i64 -1, i64 -2, i64 -4, i64 -16>
  ret <4 x i64> %1
}

; (mul (shl X, c1), c2) -> (mul X, c2 << c1)
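; With c1 = <1,2,8,16> and c2 = <1,3,5,7> below, the folded multiplier is
; c2 << c1 = <2,12,1280,458752>, i.e. a single constant multiply.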
define <4 x i32> @combine_vec_mul_shl_const(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_shl_const:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %1, <i32 1, i32 3, i32 5, i32 7>
  ret <4 x i32> %2
}

; (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one use.
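; In both one-use tests below the multiply by %y happens first and the shift is
; reapplied afterwards (as a power-of-two constant pmulld on SSE, vpsllvd on AVX2).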
define <4 x i32> @combine_vec_mul_shl_oneuse0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_oneuse0:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld %xmm1, %xmm0
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_oneuse0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %1, %y
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_mul_shl_oneuse1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_oneuse1:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld %xmm1, %xmm0
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_oneuse1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %y, %1
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_mul_shl_multiuse0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_multiuse0:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    pmulld %xmm0, %xmm1
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_multiuse0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %1, %y
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_mul_shl_multiuse1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_multiuse1:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    pmulld %xmm0, %xmm1
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_shl_multiuse1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %y, %1
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
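; With c1 = <1,2,8,16> and c2 = <4,6,2,0> below, c1*c2 = <4,12,16,0>, so the result is
; one constant multiply plus one constant add.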

define <4 x i32> @combine_vec_mul_add(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_add:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_mul_add:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
  %2 = mul <4 x i32> %1, <i32 4, i32 6, i32 2, i32 0>
  ret <4 x i32> %2
}

; fold Y = sra (X, size(X)-1); mul (or (Y, 1), X) -> (abs X)
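; The sign splat or'ed with 1 is +1 for non-negative and -1 for negative elements, so
; the product is |x|. v16i8 maps straight to pabsb; there is no 64-bit pabs in
; SSE4.1/AVX2, so the v2i64 case is lowered with pxor/psubq and a blend.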

define <16 x i8> @combine_mul_to_abs_v16i8(<16 x i8> %x) {
; SSE-LABEL: combine_mul_to_abs_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pabsb %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_to_abs_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpabsb %xmm0, %xmm0
; AVX-NEXT:    retq
  %s = ashr <16 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %o = or <16 x i8> %s, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %m = mul <16 x i8> %o, %x
  ret <16 x i8> %m
}

define <2 x i64> @combine_mul_to_abs_v2i64(<2 x i64> %x) {
; SSE-LABEL: combine_mul_to_abs_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psubq %xmm0, %xmm1
; SSE-NEXT:    blendvpd %xmm0, %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_to_abs_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %s = ashr <2 x i64> %x, <i64 63, i64 63>
  %o = or <2 x i64> %s, <i64 1, i64 1>
  %m = mul <2 x i64> %x, %o
  ret <2 x i64> %m
}

; 'Quadratic Reciprocity' - and(mul(x,x),2) -> 0
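; Squares are congruent to 0 or 1 mod 4, so bit 1 of x*x is always clear and the
; 'and' with 2 folds to zero.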

define i64 @combine_mul_self_knownbits(i64 %x) {
; SSE-LABEL: combine_mul_self_knownbits:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_self_knownbits:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    retq
  %1 = mul i64 %x, %x
  %2 = and i64 %1, 2
  ret i64 %2
}

define <4 x i32> @combine_mul_self_knownbits_vector(<4 x i32> %x) {
; SSE-LABEL: combine_mul_self_knownbits_vector:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_self_knownbits_vector:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, %x
  %2 = and <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

; mul(x,x) - bit[1] is always 0, but if the other bits are demanded the source must not be undef
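; Here the mask -3 demands bits other than bit 1, so the multiply has to stay; since
; bit 1 is already known to be zero, only the 'and' is dropped.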

define i64 @combine_mul_self_demandedbits(i64 %x) {
; SSE-LABEL: combine_mul_self_demandedbits:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    imulq %rdi, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_self_demandedbits:
; AVX:       # %bb.0:
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    imulq %rdi, %rax
; AVX-NEXT:    retq
  %1 = mul i64 %x, %x
  %2 = and i64 %1, -3
  ret i64 %2
}

define <4 x i32> @combine_mul_self_demandedbits_vector(<4 x i32> %x) {
; SSE-LABEL: combine_mul_self_demandedbits_vector:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_self_demandedbits_vector:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = freeze <4 x i32> %x
  %2 = mul <4 x i32> %1, %1
  %3 = and <4 x i32> %2, <i32 -3, i32 -3, i32 -3, i32 -3>
  ret <4 x i32> %3
}

; PR59217 - Reuse umul_lohi/smul_lohi node
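; mulq/imulq produce both halves of the 128-bit product in RDX:RAX, so the separate
; low multiply and the high part of the widened multiply can share one instruction.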

define i64 @combine_mul_umul_lohi_i64(i64 %a, i64 %b) {
; SSE-LABEL: combine_mul_umul_lohi_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    mulq %rsi
; SSE-NEXT:    xorq %rdx, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_umul_lohi_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    xorq %rdx, %rax
; AVX-NEXT:    retq
  %a128 = zext i64 %a to i128
  %b128 = zext i64 %b to i128
  %m128 = mul nuw i128 %a128, %b128
  %hi128 = lshr i128 %m128, 64
  %hi = trunc i128 %hi128 to i64
  %lo = mul i64 %a, %b
  %r = xor i64 %lo, %hi
  ret i64 %r
}

define i64 @combine_mul_smul_lohi_commute_i64(i64 %a, i64 %b) {
; SSE-LABEL: combine_mul_smul_lohi_commute_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    imulq %rsi
; SSE-NEXT:    xorq %rdx, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_smul_lohi_commute_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    xorq %rdx, %rax
; AVX-NEXT:    retq
  %a128 = sext i64 %a to i128
  %b128 = sext i64 %b to i128
  %m128 = mul nsw i128 %a128, %b128
  %hi128 = lshr i128 %m128, 64
  %hi = trunc i128 %hi128 to i64
  %lo = mul i64 %b, %a
  %r = xor i64 %lo, %hi
  ret i64 %r
}

define i64 @combine_mul_umul_lohi_const_i64(i64 %h) {
; SSE-LABEL: combine_mul_umul_lohi_const_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
; SSE-NEXT:    mulq %rcx
; SSE-NEXT:    xorq %rdx, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_umul_lohi_const_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
; AVX-NEXT:    mulq %rcx
; AVX-NEXT:    xorq %rdx, %rax
; AVX-NEXT:    retq
  %h128 = zext i64 %h to i128
  %m128 = mul nuw i128 %h128, 14181476777654086739
  %hi128 = lshr i128 %m128, 64
  %hi = trunc i128 %hi128 to i64
  %lo = mul i64 %h, 14181476777654086739
  %r = xor i64 %lo, %hi
  ret i64 %r
}

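; The signed-by-constant case is not matched to a single imulq; the high half is
; rebuilt from an unsigned mulq plus a sign correction (sarq, imulq, addq).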
define i64 @combine_mul_smul_lohi_const_i64(i64 %h) {
; SSE-LABEL: combine_mul_smul_lohi_const_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    sarq $63, %rcx
; SSE-NEXT:    movabsq $-4265267296055464877, %rsi # imm = 0xC4CEB9FE1A85EC53
; SSE-NEXT:    mulq %rsi
; SSE-NEXT:    imulq %rsi, %rcx
; SSE-NEXT:    addq %rdx, %rcx
; SSE-NEXT:    xorq %rcx, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_mul_smul_lohi_const_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    sarq $63, %rcx
; AVX-NEXT:    movabsq $-4265267296055464877, %rsi # imm = 0xC4CEB9FE1A85EC53
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    imulq %rsi, %rcx
; AVX-NEXT:    addq %rdx, %rcx
; AVX-NEXT:    xorq %rcx, %rax
; AVX-NEXT:    retq
  %h128 = sext i64 %h to i128
  %m128 = mul nsw i128 %h128, 14181476777654086739
  %hi128 = lshr i128 %m128, 64
  %hi = trunc i128 %hi128 to i64
  %lo = mul i64 %h, 14181476777654086739
  %r = xor i64 %lo, %hi
  ret i64 %r
}

; This would infinite loop because DAGCombiner wants to turn this into a shift,
; but x86 lowering wants to avoid non-uniform vector shift amounts.

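; The SSE lowering splits the v16i8 multiply into two pmaddubsw halves merged with
; psllw/pand/por; AVX2 widens to v16i16, multiplies with vpmullw and packs back down
; with vpackuswb.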
define <16 x i8> @PR35579(<16 x i8> %x) {
; SSE-LABEL: PR35579:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSE-NEXT:    psllw $8, %xmm1
; SSE-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,2,0,4,0,2,0,8,0,2,0,4,0,2,0]
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR35579:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,1,4,1,2,1,8,1,2,1,4,1,2,1]
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %r = mul <16 x i8> %x, <i8 0, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1, i8 8, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1>
  ret <16 x i8> %r
}

; OSS Fuzz: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=15429
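; The multiply by <1,2,4,8> is turned into shifts (paddq/psllq on SSE, vpsllvq on
; AVX2) and lane 0 is then overwritten with the inserted INT64_MAX constant
; (pinsrq on SSE, vpblendd on AVX2).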
define <4 x i64> @fuzz15429(<4 x i64> %InVec) {
; SSE-LABEL: fuzz15429:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psllq $3, %xmm2
; SSE-NEXT:    psllq $2, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    paddq %xmm0, %xmm0
; SSE-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; SSE-NEXT:    pinsrq $0, %rax, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: fuzz15429:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; AVX-NEXT:    vmovq %rax, %xmm1
; AVX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX-NEXT:    retq
  %mul = mul <4 x i64> %InVec, <i64 1, i64 2, i64 4, i64 8>
  %I = insertelement <4 x i64> %mul, i64 9223372036854775807, i64 0
  ret <4 x i64> %I
}
