; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-freebsd -mattr=+avx | FileCheck %s --check-prefixes=AVX1
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=AVX512FP16

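; These functions exercise the lowering of the llvm.round.* intrinsics at
; several x86 feature levels. Where the checks below expand the intrinsic
; inline, the sequence computes trunc(x + copysign(c, x)) with c the largest
; value below 0.5 for the element type: the sign of x is extracted with a
; -0.0 mask, merged into c, added to x, and the sum is truncated with
; round{ss,sd,ps,pd}/vrndscale* using immediate 11 (round toward zero,
; suppress precision exceptions).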
define half @round_f16(half %h) {
; SSE2-LABEL: round_f16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    .cfi_def_cfa_offset 16
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    .cfi_def_cfa_offset 8
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_f16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pushq %rax
; SSE41-NEXT:    .cfi_def_cfa_offset 16
; SSE41-NEXT:    callq __extendhfsf2@PLT
; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT:    andps %xmm0, %xmm1
; SSE41-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    roundss $11, %xmm1, %xmm0
; SSE41-NEXT:    callq __truncsfhf2@PLT
; SSE41-NEXT:    popq %rax
; SSE41-NEXT:    .cfi_def_cfa_offset 8
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_f16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    pushq %rax
; AVX1-NEXT:    .cfi_def_cfa_offset 16
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX1-NEXT:    vorps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    popq %rax
; AVX1-NEXT:    .cfi_def_cfa_offset 8
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: round_f16:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpextrw $0, %xmm0, %eax
; AVX512F-NEXT:    movzwl %ax, %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512FP16-LABEL: round_f16:
; AVX512FP16:       ## %bb.0: ## %entry
; AVX512FP16-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX512FP16-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1]
; AVX512FP16-NEXT:    vpternlogq $248, %xmm1, %xmm0, %xmm2
; AVX512FP16-NEXT:    vaddsh %xmm2, %xmm0, %xmm0
; AVX512FP16-NEXT:    vrndscalesh $11, %xmm0, %xmm0, %xmm0
; AVX512FP16-NEXT:    retq
entry:
  %a = call half @llvm.round.f16(half %h)
  ret half %a
}

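; Scalar f32/f64: without SSE4.1 the intrinsic stays a tail call to the libm
; roundf/round functions; SSE4.1 and later expand it inline as described above.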
define float @round_f32(float %x) {
; SSE2-LABEL: round_f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    jmp roundf@PLT # TAILCALL
;
; SSE41-LABEL: round_f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT:    andps %xmm0, %xmm1
; SSE41-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    addss %xmm0, %xmm1
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    roundss $11, %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX1-NEXT:    vorps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: round_f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512FP16-LABEL: round_f32:
; AVX512FP16:       ## %bb.0:
; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512FP16-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
; AVX512FP16-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512FP16-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; AVX512FP16-NEXT:    retq
  %a = call float @llvm.round.f32(float %x)
  ret float %a
}

define double @round_f64(double %x) {
; SSE2-LABEL: round_f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    jmp round@PLT # TAILCALL
;
; SSE41-LABEL: round_f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT:    andpd %xmm0, %xmm1
; SSE41-NEXT:    orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    addsd %xmm0, %xmm1
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    roundsd $11, %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; AVX1-NEXT:    # xmm2 = mem[0,0]
; AVX1-NEXT:    vorpd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: round_f64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
; AVX512F-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512FP16-LABEL: round_f64:
; AVX512FP16:       ## %bb.0:
; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512FP16-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
; AVX512FP16-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512FP16-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; AVX512FP16-NEXT:    retq
  %a = call double @llvm.round.f64(double %x)
  ret double %a
}

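; Vector cases: SSE2 has no packed rounding, so each element is rounded via a
; separate roundf/round libcall and the results are repacked with shuffles;
; SSE4.1/AVX/AVX-512 use the packed and/or/add sequence followed by
; roundps/roundpd.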
define <4 x float> @round_v4f32(<4 x float> %x) {
; SSE2-LABEL: round_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 64
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 8
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT:    andps %xmm0, %xmm1
; SSE41-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    addps %xmm0, %xmm1
; SSE41-NEXT:    roundps $11, %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vroundps $11, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: round_v4f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
; AVX512F-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vroundps $11, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512FP16-LABEL: round_v4f32:
; AVX512FP16:       ## %bb.0:
; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512FP16-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
; AVX512FP16-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512FP16-NEXT:    vroundps $11, %xmm0, %xmm0
; AVX512FP16-NEXT:    retq
  %a = call <4 x float> @llvm.round.v4f32(<4 x float> %x)
  ret <4 x float> %a
}

define <2 x double> @round_v2f64(<2 x double> %x) {
; SSE2-LABEL: round_v2f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subq $40, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 48
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    callq round@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq round@PLT
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    addq $40, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 8
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v2f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT:    andpd %xmm0, %xmm1
; SSE41-NEXT:    orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    addpd %xmm0, %xmm1
; SSE41-NEXT:    roundpd $11, %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v2f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vroundpd $11, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: round_v2f64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
; AVX512F-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vroundpd $11, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512FP16-LABEL: round_v2f64:
; AVX512FP16:       ## %bb.0:
; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512FP16-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
; AVX512FP16-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512FP16-NEXT:    vroundpd $11, %xmm0, %xmm0
; AVX512FP16-NEXT:    retq
  %a = call <2 x double> @llvm.round.v2f64(<2 x double> %x)
  ret <2 x double> %a
}

define <8 x float> @round_v8f32(<8 x float> %x) {
; SSE2-LABEL: round_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subq $72, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 80
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    addq $72, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 8
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT:    movaps %xmm0, %xmm3
; SSE41-NEXT:    andps %xmm2, %xmm3
; SSE41-NEXT:    movaps {{.*#+}} xmm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; SSE41-NEXT:    orps %xmm4, %xmm3
; SSE41-NEXT:    addps %xmm0, %xmm3
; SSE41-NEXT:    roundps $11, %xmm3, %xmm0
; SSE41-NEXT:    andps %xmm1, %xmm2
; SSE41-NEXT:    orps %xmm4, %xmm2
; SSE41-NEXT:    addps %xmm1, %xmm2
; SSE41-NEXT:    roundps $11, %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: round_v8f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512FP16-LABEL: round_v8f32:
; AVX512FP16:       ## %bb.0:
; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512FP16-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
; AVX512FP16-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512FP16-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX512FP16-NEXT:    retq
  %a = call <8 x float> @llvm.round.v8f32(<8 x float> %x)
  ret <8 x float> %a
}

define <4 x double> @round_v4f64(<4 x double> %x) {
; SSE2-LABEL: round_v4f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 64
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    callq round@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq round@PLT
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq round@PLT
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq round@PLT
; SSE2-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 8
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v4f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT:    movapd %xmm0, %xmm3
; SSE41-NEXT:    andpd %xmm2, %xmm3
; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [4.9999999999999994E-1,4.9999999999999994E-1]
; SSE41-NEXT:    orpd %xmm4, %xmm3
; SSE41-NEXT:    addpd %xmm0, %xmm3
; SSE41-NEXT:    roundpd $11, %xmm3, %xmm0
; SSE41-NEXT:    andpd %xmm1, %xmm2
; SSE41-NEXT:    orpd %xmm4, %xmm2
; SSE41-NEXT:    addpd %xmm1, %xmm2
; SSE41-NEXT:    roundpd $11, %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v4f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX1-NEXT:    vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: round_v4f64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1
; AVX512F-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512FP16-LABEL: round_v4f64:
; AVX512FP16:       ## %bb.0:
; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512FP16-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1
; AVX512FP16-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX512FP16-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX512FP16-NEXT:    retq
  %a = call <4 x double> @llvm.round.v4f64(<4 x double> %x)
  ret <4 x double> %a
}

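; 512-bit cases: AVX1 operates on two 256-bit halves, while the AVX-512 runs
; use a single zmm register and vrndscaleps/vrndscalepd.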
define <16 x float> @round_v16f32(<16 x float> %x) {
; SSE2-LABEL: round_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subq $104, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 112
; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq roundf@PLT
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm3 = xmm3[0],mem[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    movaps (%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT:    addq $104, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 8
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT:    movaps %xmm0, %xmm5
; SSE41-NEXT:    andps %xmm4, %xmm5
; SSE41-NEXT:    movaps {{.*#+}} xmm6 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; SSE41-NEXT:    orps %xmm6, %xmm5
; SSE41-NEXT:    addps %xmm0, %xmm5
; SSE41-NEXT:    roundps $11, %xmm5, %xmm0
; SSE41-NEXT:    movaps %xmm1, %xmm5
; SSE41-NEXT:    andps %xmm4, %xmm5
; SSE41-NEXT:    orps %xmm6, %xmm5
; SSE41-NEXT:    addps %xmm1, %xmm5
; SSE41-NEXT:    roundps $11, %xmm5, %xmm1
; SSE41-NEXT:    movaps %xmm2, %xmm5
; SSE41-NEXT:    andps %xmm4, %xmm5
; SSE41-NEXT:    orps %xmm6, %xmm5
; SSE41-NEXT:    addps %xmm2, %xmm5
; SSE41-NEXT:    roundps $11, %xmm5, %xmm2
; SSE41-NEXT:    andps %xmm3, %xmm4
; SSE41-NEXT:    orps %xmm6, %xmm4
; SSE41-NEXT:    addps %xmm3, %xmm4
; SSE41-NEXT:    roundps $11, %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX1-NEXT:    vorps %ymm4, %ymm3, %ymm3
; AVX1-NEXT:    vaddps %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    vroundps $11, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm2
; AVX1-NEXT:    vorps %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vroundps $11, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: round_v16f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
; AVX512F-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vrndscaleps $11, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512FP16-LABEL: round_v16f32:
; AVX512FP16:       ## %bb.0:
; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512FP16-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
; AVX512FP16-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512FP16-NEXT:    vrndscaleps $11, %zmm0, %zmm0
; AVX512FP16-NEXT:    retq
  %a = call <16 x float> @llvm.round.v16f32(<16 x float> %x)
  ret <16 x float> %a
}

define <8 x double> @round_v8f64(<8 x double> %x) {
; SSE2-LABEL: round_v8f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subq $88, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 96
; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    callq round@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq round@PLT
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq round@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq round@PLT
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq round@PLT
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq round@PLT
; SSE2-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq round@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq round@PLT
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    movaps (%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT:    addq $88, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 8
; SSE2-NEXT:    retq
;
; SSE41-LABEL: round_v8f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT:    movapd %xmm0, %xmm5
; SSE41-NEXT:    andpd %xmm4, %xmm5
; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [4.9999999999999994E-1,4.9999999999999994E-1]
; SSE41-NEXT:    orpd %xmm6, %xmm5
; SSE41-NEXT:    addpd %xmm0, %xmm5
; SSE41-NEXT:    roundpd $11, %xmm5, %xmm0
; SSE41-NEXT:    movapd %xmm1, %xmm5
; SSE41-NEXT:    andpd %xmm4, %xmm5
; SSE41-NEXT:    orpd %xmm6, %xmm5
; SSE41-NEXT:    addpd %xmm1, %xmm5
; SSE41-NEXT:    roundpd $11, %xmm5, %xmm1
; SSE41-NEXT:    movapd %xmm2, %xmm5
; SSE41-NEXT:    andpd %xmm4, %xmm5
; SSE41-NEXT:    orpd %xmm6, %xmm5
; SSE41-NEXT:    addpd %xmm2, %xmm5
; SSE41-NEXT:    roundpd $11, %xmm5, %xmm2
; SSE41-NEXT:    andpd %xmm3, %xmm4
; SSE41-NEXT:    orpd %xmm6, %xmm4
; SSE41-NEXT:    addpd %xmm3, %xmm4
; SSE41-NEXT:    roundpd $11, %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: round_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX1-NEXT:    vandpd %ymm2, %ymm0, %ymm3
; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX1-NEXT:    vorpd %ymm4, %ymm3, %ymm3
; AVX1-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    vroundpd $11, %ymm0, %ymm0
; AVX1-NEXT:    vandpd %ymm2, %ymm1, %ymm2
; AVX1-NEXT:    vorpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vroundpd $11, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: round_v8f64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
; AVX512F-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vrndscalepd $11, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512FP16-LABEL: round_v8f64:
; AVX512FP16:       ## %bb.0:
; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512FP16-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
; AVX512FP16-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512FP16-NEXT:    vrndscalepd $11, %zmm0, %zmm0
; AVX512FP16-NEXT:    retq
  %a = call <8 x double> @llvm.round.v8f64(<8 x double> %x)
  ret <8 x double> %a
}

declare half @llvm.round.f16(half)
declare float @llvm.round.f32(float)
declare double @llvm.round.f64(double)
declare <4 x float> @llvm.round.v4f32(<4 x float>)
declare <2 x double> @llvm.round.v2f64(<2 x double>)
declare <8 x float> @llvm.round.v8f32(<8 x float>)
declare <4 x double> @llvm.round.v4f64(<4 x double>)
declare <16 x float> @llvm.round.v16f32(<16 x float>)
declare <8 x double> @llvm.round.v8f64(<8 x double>)