; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 -code-model=small | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 -code-model=medium | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 -code-model=large | FileCheck %s --check-prefix=X64-LARGE

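; @mmx_zero feeds an all-zero MMX constant (bitcast double 0.0) into a chain
; of MMX intrinsics. The checks below cover how that zero is materialized per
; code model: folded as a constant-pool memory operand for the small/medium
; models (X86/X64), and rematerialized with pxor under the large code model
; (X64-LARGE).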
define double @mmx_zero(double, double, double, double) nounwind {
; X86-LABEL: mmx_zero:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    movq 8(%ebp), %mm0
; X86-NEXT:    movq 16(%ebp), %mm5
; X86-NEXT:    movq %mm5, (%esp) # 8-byte Spill
; X86-NEXT:    movq %mm0, %mm3
; X86-NEXT:    paddd %mm5, %mm3
; X86-NEXT:    pxor %mm1, %mm1
; X86-NEXT:    movq %mm3, %mm6
; X86-NEXT:    pmuludq %mm1, %mm6
; X86-NEXT:    movq 24(%ebp), %mm4
; X86-NEXT:    movq %mm6, %mm2
; X86-NEXT:    paddd %mm4, %mm2
; X86-NEXT:    paddw %mm2, %mm0
; X86-NEXT:    movq %mm5, %mm1
; X86-NEXT:    paddw %mm0, %mm1
; X86-NEXT:    movq 32(%ebp), %mm5
; X86-NEXT:    movq %mm1, %mm7
; X86-NEXT:    pmuludq %mm5, %mm7
; X86-NEXT:    paddw %mm4, %mm7
; X86-NEXT:    paddw %mm7, %mm5
; X86-NEXT:    paddw %mm5, %mm2
; X86-NEXT:    paddw %mm2, %mm0
; X86-NEXT:    paddw %mm6, %mm0
; X86-NEXT:    pmuludq %mm3, %mm0
; X86-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}, %mm0
; X86-NEXT:    paddw %mm1, %mm0
; X86-NEXT:    pmuludq %mm7, %mm0
; X86-NEXT:    pmuludq (%esp), %mm0 # 8-byte Folded Reload
; X86-NEXT:    paddw %mm5, %mm0
; X86-NEXT:    paddw %mm2, %mm0
; X86-NEXT:    movq2dq %mm0, %xmm0
; X86-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    fldl {{[0-9]+}}(%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: mmx_zero:
; X64:       # %bb.0:
; X64-NEXT:    movdq2q %xmm0, %mm0
; X64-NEXT:    movdq2q %xmm1, %mm5
; X64-NEXT:    movq %mm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT:    movq %mm0, %mm3
; X64-NEXT:    paddd %mm5, %mm3
; X64-NEXT:    pxor %mm1, %mm1
; X64-NEXT:    movq %mm3, %mm6
; X64-NEXT:    pmuludq %mm1, %mm6
; X64-NEXT:    movdq2q %xmm2, %mm4
; X64-NEXT:    movq %mm6, %mm2
; X64-NEXT:    paddd %mm4, %mm2
; X64-NEXT:    paddw %mm2, %mm0
; X64-NEXT:    movq %mm5, %mm1
; X64-NEXT:    paddw %mm0, %mm1
; X64-NEXT:    movdq2q %xmm3, %mm5
; X64-NEXT:    movq %mm1, %mm7
; X64-NEXT:    pmuludq %mm5, %mm7
; X64-NEXT:    paddw %mm4, %mm7
; X64-NEXT:    paddw %mm7, %mm5
; X64-NEXT:    paddw %mm5, %mm2
; X64-NEXT:    paddw %mm2, %mm0
; X64-NEXT:    paddw %mm6, %mm0
; X64-NEXT:    pmuludq %mm3, %mm0
; X64-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %mm0
; X64-NEXT:    paddw %mm1, %mm0
; X64-NEXT:    pmuludq %mm7, %mm0
; X64-NEXT:    pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
; X64-NEXT:    paddw %mm5, %mm0
; X64-NEXT:    paddw %mm2, %mm0
; X64-NEXT:    movq2dq %mm0, %xmm0
; X64-NEXT:    retq
;
; X64-LARGE-LABEL: mmx_zero:
; X64-LARGE:       # %bb.0:
; X64-LARGE-NEXT:    movdq2q %xmm0, %mm0
; X64-LARGE-NEXT:    movdq2q %xmm1, %mm5
; X64-LARGE-NEXT:    movq %mm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-LARGE-NEXT:    movq %mm0, %mm3
; X64-LARGE-NEXT:    paddd %mm5, %mm3
; X64-LARGE-NEXT:    pxor %mm1, %mm1
; X64-LARGE-NEXT:    movq %mm3, %mm6
; X64-LARGE-NEXT:    pmuludq %mm1, %mm6
; X64-LARGE-NEXT:    movdq2q %xmm2, %mm4
; X64-LARGE-NEXT:    movq %mm6, %mm2
; X64-LARGE-NEXT:    paddd %mm4, %mm2
; X64-LARGE-NEXT:    paddw %mm2, %mm0
; X64-LARGE-NEXT:    movq %mm5, %mm1
; X64-LARGE-NEXT:    paddw %mm0, %mm1
; X64-LARGE-NEXT:    movdq2q %xmm3, %mm5
; X64-LARGE-NEXT:    movq %mm1, %mm7
; X64-LARGE-NEXT:    pmuludq %mm5, %mm7
; X64-LARGE-NEXT:    paddw %mm4, %mm7
; X64-LARGE-NEXT:    paddw %mm7, %mm5
; X64-LARGE-NEXT:    paddw %mm5, %mm2
; X64-LARGE-NEXT:    paddw %mm2, %mm0
; X64-LARGE-NEXT:    paddw %mm6, %mm0
; X64-LARGE-NEXT:    pmuludq %mm3, %mm0
; X64-LARGE-NEXT:    pxor %mm3, %mm3
; X64-LARGE-NEXT:    paddw %mm3, %mm0
; X64-LARGE-NEXT:    paddw %mm1, %mm0
; X64-LARGE-NEXT:    pmuludq %mm7, %mm0
; X64-LARGE-NEXT:    pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
; X64-LARGE-NEXT:    paddw %mm5, %mm0
; X64-LARGE-NEXT:    paddw %mm2, %mm0
; X64-LARGE-NEXT:    movq2dq %mm0, %xmm0
; X64-LARGE-NEXT:    retq
  %5 = bitcast double %0 to <1 x i64>
  %6 = bitcast double %1 to <1 x i64>
  %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %6)
  %8 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %7, <1 x i64> bitcast (double 0.000000e+00 to <1 x i64>))
  %9 = bitcast double %2 to <1 x i64>
  %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %8, <1 x i64> %9)
  %11 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %5, <1 x i64> %10)
  %12 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %6, <1 x i64> %11)
  %13 = bitcast double %3 to <1 x i64>
  %14 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %12, <1 x i64> %13)
  %15 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %14, <1 x i64> %9)
  %16 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %15, <1 x i64> %13)
  %17 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %16, <1 x i64> %10)
  %18 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %17, <1 x i64> %11)
  %19 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %18, <1 x i64> %8)
  %20 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %19, <1 x i64> %7)
  %21 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %20, <1 x i64> bitcast (double 0.000000e+00 to <1 x i64>))
  %22 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %21, <1 x i64> %12)
  %23 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %22, <1 x i64> %15)
  %24 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %23, <1 x i64> %6)
  %25 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %24, <1 x i64> %16)
  %26 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %25, <1 x i64> %17)
  %27 = bitcast <1 x i64> %26 to double
  ret double %27
}

declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>)