; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64

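; @llvm.arithmetic.fence establishes a boundary that fast-math optimizations
; such as reassociation may not cross.
;
; f1: with no fence, the fast-math fadds let (a + a) + (a + a) fold into a
; single multiply by 4.0.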
define double @f1(double %a) {
; X86-LABEL: f1:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT:    movsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    retl
;
; X64-LABEL: f1:
; X64:       # %bb.0:
; X64-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT:    retq
  %1 = fadd fast double %a, %a
  %2 = fadd fast double %a, %a
  %3 = fadd fast double %1, %2
  ret double %3
}

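; f2: the fence on %1 blocks the fold to a multiply: a + a is computed once,
; a fenced copy is made (#ARITH_FENCE), and the two halves are added.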
define double @f2(double %a) {
; X86-LABEL: f2:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    addsd %xmm0, %xmm0
; X86-NEXT:    movapd %xmm0, %xmm1
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    addsd %xmm0, %xmm1
; X86-NEXT:    movsd %xmm1, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    retl
;
; X64-LABEL: f2:
; X64:       # %bb.0:
; X64-NEXT:    addsd %xmm0, %xmm0
; X64-NEXT:    movapd %xmm0, %xmm1
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    addsd %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = fadd fast double %a, %a
  %t = call double @llvm.arithmetic.fence.f64(double %1)
  %2 = fadd fast double %a, %a
  %3 = fadd fast double %t, %2
  ret double %3
}

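; f3: vector form of f1; the unfenced <2 x float> sums fold into a single
; mulps.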
define <2 x float> @f3(<2 x float> %a) {
; X86-LABEL: f3:
; X86:       # %bb.0:
; X86-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: f3:
; X64:       # %bb.0:
; X64-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT:    retq
  %1 = fadd fast <2 x float> %a, %a
  %2 = fadd fast <2 x float> %a, %a
  %3 = fadd fast <2 x float> %1, %2
  ret <2 x float> %3
}

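; f4: vector form of f2; the fence keeps the two <2 x float> additions
; separate instead of folding them into a multiply.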
define <2 x float> @f4(<2 x float> %a) {
; X86-LABEL: f4:
; X86:       # %bb.0:
; X86-NEXT:    addps %xmm0, %xmm0
; X86-NEXT:    movaps %xmm0, %xmm1
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    addps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: f4:
; X64:       # %bb.0:
; X64-NEXT:    addps %xmm0, %xmm0
; X64-NEXT:    movaps %xmm0, %xmm1
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    addps %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = fadd fast <2 x float> %a, %a
  %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1)
  %2 = fadd fast <2 x float> %a, %a
  %3 = fadd fast <2 x float> %t, %2
  ret <2 x float> %3
}

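; f5: <8 x float> is split across two XMM registers; without a fence both
; halves still fold into multiplies by 4.0.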
define <8 x float> @f5(<8 x float> %a) {
; X86-LABEL: f5:
; X86:       # %bb.0:
; X86-NEXT:    movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0]
; X86-NEXT:    mulps %xmm2, %xmm0
; X86-NEXT:    mulps %xmm2, %xmm1
; X86-NEXT:    retl
;
; X64-LABEL: f5:
; X64:       # %bb.0:
; X64-NEXT:    movaps {{.*#+}} xmm2 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0]
; X64-NEXT:    mulps %xmm2, %xmm0
; X64-NEXT:    mulps %xmm2, %xmm1
; X64-NEXT:    retq
  %1 = fadd fast <8 x float> %a, %a
  %2 = fadd fast <8 x float> %a, %a
  %3 = fadd fast <8 x float> %1, %2
  ret <8 x float> %3
}

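; f6: fenced <8 x float>; each of the two XMM halves gets its own fenced
; copy (#ARITH_FENCE) and its own final add.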
define <8 x float> @f6(<8 x float> %a) {
; X86-LABEL: f6:
; X86:       # %bb.0:
; X86-NEXT:    addps %xmm0, %xmm0
; X86-NEXT:    addps %xmm1, %xmm1
; X86-NEXT:    movaps %xmm1, %xmm2
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    movaps %xmm0, %xmm3
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    addps %xmm3, %xmm0
; X86-NEXT:    addps %xmm2, %xmm1
; X86-NEXT:    retl
;
; X64-LABEL: f6:
; X64:       # %bb.0:
; X64-NEXT:    addps %xmm0, %xmm0
; X64-NEXT:    addps %xmm1, %xmm1
; X64-NEXT:    movaps %xmm1, %xmm2
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    movaps %xmm0, %xmm3
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    addps %xmm3, %xmm0
; X64-NEXT:    addps %xmm2, %xmm1
; X64-NEXT:    retq
  %1 = fadd fast <8 x float> %a, %a
  %t = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> %1)
  %2 = fadd fast <8 x float> %a, %a
  %3 = fadd fast <8 x float> %t, %2
  ret <8 x float> %3
}

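; f7: a fence on a scalar half lowers to a bare ARITH_FENCE marker on the
; register; on X86 the argument is first loaded from the stack.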
define half @f7(half %a) nounwind {
; X86-LABEL: f7:
; X86:       # %bb.0:
; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    retl
;
; X64-LABEL: f7:
; X64:       # %bb.0:
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    retq
  %b = call half @llvm.arithmetic.fence.f16(half %a)
  ret half %b
}

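; f8: bfloat is fenced in a GPR: the value is moved out with movzwl/pextrw,
; fenced, and put back with pinsrw.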
define bfloat @f8(bfloat %a) nounwind {
; X86-LABEL: f8:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    pinsrw $0, %eax, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: f8:
; X64:       # %bb.0:
; X64-NEXT:    pextrw $0, %xmm0, %eax
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    pinsrw $0, %eax, %xmm0
; X64-NEXT:    retq
  %b = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
  ret bfloat %b
}

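; f9: the two half elements are split apart with psrld, fenced individually,
; and repacked with punpcklwd.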
define <2 x half> @f9(<2 x half> %a) nounwind {
; X86-LABEL: f9:
; X86:       # %bb.0:
; X86-NEXT:    movdqa %xmm0, %xmm1
; X86-NEXT:    psrld $16, %xmm1
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    retl
;
; X64-LABEL: f9:
; X64:       # %bb.0:
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    psrld $16, %xmm1
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    retq
  %b = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> %a)
  ret <2 x half> %b
}

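; f10: <3 x bfloat> is scalarized: each element is extracted to a GPR,
; fenced, and the vector is rebuilt with pinsrw and punpck shuffles.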
define <3 x bfloat> @f10(<3 x bfloat> %a) nounwind {
; X86-LABEL: f10:
; X86:       # %bb.0:
; X86-NEXT:    pextrw $0, %xmm0, %eax
; X86-NEXT:    movdqa %xmm0, %xmm1
; X86-NEXT:    psrld $16, %xmm1
; X86-NEXT:    pextrw $0, %xmm1, %ecx
; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NEXT:    pextrw $0, %xmm0, %edx
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    pinsrw $0, %eax, %xmm0
; X86-NEXT:    pinsrw $0, %ecx, %xmm1
; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    pinsrw $0, %edx, %xmm1
; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT:    retl
;
; X64-LABEL: f10:
; X64:       # %bb.0:
; X64-NEXT:    pextrw $0, %xmm0, %eax
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    psrld $16, %xmm1
; X64-NEXT:    pextrw $0, %xmm1, %ecx
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-NEXT:    pextrw $0, %xmm0, %edx
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    pinsrw $0, %eax, %xmm0
; X64-NEXT:    pinsrw $0, %ecx, %xmm1
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    pinsrw $0, %edx, %xmm1
; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %b = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat> %a)
  ret <3 x bfloat> %b
}

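; f11: <4 x bfloat> gets the same element-by-element treatment, with four
; ARITH_FENCE markers.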
define <4 x bfloat> @f11(<4 x bfloat> %a) nounwind {
; X86-LABEL: f11:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movdqa %xmm0, %xmm1
; X86-NEXT:    psrlq $48, %xmm1
; X86-NEXT:    pextrw $0, %xmm1, %eax
; X86-NEXT:    movdqa %xmm0, %xmm1
; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; X86-NEXT:    pextrw $0, %xmm1, %edx
; X86-NEXT:    pextrw $0, %xmm0, %ecx
; X86-NEXT:    psrld $16, %xmm0
; X86-NEXT:    pextrw $0, %xmm0, %esi
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    #ARITH_FENCE
; X86-NEXT:    pinsrw $0, %eax, %xmm0
; X86-NEXT:    pinsrw $0, %edx, %xmm1
; X86-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT:    pinsrw $0, %ecx, %xmm0
; X86-NEXT:    pinsrw $0, %esi, %xmm2
; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: f11:
; X64:       # %bb.0:
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    psrlq $48, %xmm1
; X64-NEXT:    pextrw $0, %xmm1, %eax
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; X64-NEXT:    pextrw $0, %xmm1, %ecx
; X64-NEXT:    pextrw $0, %xmm0, %edx
; X64-NEXT:    psrld $16, %xmm0
; X64-NEXT:    pextrw $0, %xmm0, %esi
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    #ARITH_FENCE
; X64-NEXT:    pinsrw $0, %eax, %xmm0
; X64-NEXT:    pinsrw $0, %ecx, %xmm1
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    pinsrw $0, %edx, %xmm0
; X64-NEXT:    pinsrw $0, %esi, %xmm2
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %b = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> %a)
  ret <4 x bfloat> %b
}

declare half @llvm.arithmetic.fence.f16(half)
declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
declare <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half>)
declare <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat>)
declare <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat>)
declare float @llvm.arithmetic.fence.f32(float)
declare double @llvm.arithmetic.fence.f64(double)
declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>)
declare <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float>)