; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512FP16

define half @roundeven_f16(half %h) {
; SSE2-LABEL: roundeven_f16:
; SSE2:       ## %bb.0: ## %entry
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    .cfi_def_cfa_offset 16
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %edi
; SSE2-NEXT:    callq ___extendhfsf2
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    callq ___truncsfhf2
; SSE2-NEXT:    ## kill: def $ax killed $ax def $eax
; SSE2-NEXT:    pinsrw $0, %eax, %xmm0
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_f16:
; SSE41:       ## %bb.0: ## %entry
; SSE41-NEXT:    pushq %rax
; SSE41-NEXT:    .cfi_def_cfa_offset 16
; SSE41-NEXT:    pextrw $0, %xmm0, %eax
; SSE41-NEXT:    movzwl %ax, %edi
; SSE41-NEXT:    callq ___extendhfsf2
; SSE41-NEXT:    roundss $8, %xmm0, %xmm0
; SSE41-NEXT:    callq ___truncsfhf2
; SSE41-NEXT:    ## kill: def $ax killed $ax def $eax
; SSE41-NEXT:    pinsrw $0, %eax, %xmm0
; SSE41-NEXT:    popq %rax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: roundeven_f16:
; AVX1:       ## %bb.0: ## %entry
; AVX1-NEXT:    pushq %rax
; AVX1-NEXT:    .cfi_def_cfa_offset 16
; AVX1-NEXT:    vpextrw $0, %xmm0, %eax
; AVX1-NEXT:    movzwl %ax, %edi
; AVX1-NEXT:    callq ___extendhfsf2
; AVX1-NEXT:    vroundss $8, %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    callq ___truncsfhf2
; AVX1-NEXT:    ## kill: def $ax killed $ax def $eax
; AVX1-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX1-NEXT:    popq %rax
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: roundeven_f16:
; AVX512F:       ## %bb.0: ## %entry
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    vroundss $8, %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512FP16-LABEL: roundeven_f16:
; AVX512FP16:       ## %bb.0: ## %entry
; AVX512FP16-NEXT:    vrndscalesh $8, %xmm0, %xmm0, %xmm0
; AVX512FP16-NEXT:    retq
entry:
  %a = call half @llvm.roundeven.f16(half %h)
  ret half %a
}

define float @roundeven_f32(float %x) {
; SSE2-LABEL: roundeven_f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    jmp _roundevenf ## TAILCALL
;
; SSE41-LABEL: roundeven_f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $8, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $8, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call float @llvm.roundeven.f32(float %x)
  ret float %a
}

define double @roundeven_f64(double %x) {
; SSE2-LABEL: roundeven_f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    jmp _roundeven ## TAILCALL
;
; SSE41-LABEL: roundeven_f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $8, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $8, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call double @llvm.roundeven.f64(double %x)
  ret double %a
}

define <4 x float> @roundeven_v4f32(<4 x float> %x) {
; SSE2-LABEL: roundeven_v4f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 64
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $8, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $8, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x)
  ret <4 x float> %a
}

define <2 x double> @roundeven_v2f64(<2 x double> %x) {
; SSE2-LABEL: roundeven_v2f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $40, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 48
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    addq $40, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $8, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $8, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x)
  ret <2 x double> %a
}

define <8 x float> @roundeven_v8f32(<8 x float> %x) {
; SSE2-LABEL: roundeven_v8f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $72, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 80
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    addq $72, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $8, %xmm0, %xmm0
; SSE41-NEXT:    roundps $8, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $8, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %x)
  ret <8 x float> %a
}

define <4 x double> @roundeven_v4f64(<4 x double> %x) {
; SSE2-LABEL: roundeven_v4f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 64
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $8, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $8, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $8, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %x)
  ret <4 x double> %a
}

define <16 x float> @roundeven_v16f32(<16 x float> %x) {
; SSE2-LABEL: roundeven_v16f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $104, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 112
; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm3 = xmm3[0],mem[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
; SSE2-NEXT:    addq $104, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $8, %xmm0, %xmm0
; SSE41-NEXT:    roundps $8, %xmm1, %xmm1
; SSE41-NEXT:    roundps $8, %xmm2, %xmm2
; SSE41-NEXT:    roundps $8, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: roundeven_v16f32:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vroundps $8, %ymm0, %ymm0
; AVX1-NEXT:    vroundps $8, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: roundeven_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $8, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %a = call <16 x float> @llvm.roundeven.v16f32(<16 x float> %x)
  ret <16 x float> %a
}

define <8 x double> @roundeven_v8f64(<8 x double> %x) {
; SSE2-LABEL: roundeven_v8f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $88, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 96
; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
; SSE2-NEXT:    addq $88, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $8, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $8, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $8, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $8, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: roundeven_v8f64:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vroundpd $8, %ymm0, %ymm0
; AVX1-NEXT:    vroundpd $8, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: roundeven_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $8, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %a = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %x)
  ret <8 x double> %a
}

declare half @llvm.roundeven.f16(half)
declare float @llvm.roundeven.f32(float)
declare double @llvm.roundeven.f64(double)
declare <4 x float> @llvm.roundeven.v4f32(<4 x float>)
declare <2 x double> @llvm.roundeven.v2f64(<2 x double>)
declare <8 x float> @llvm.roundeven.v8f32(<8 x float>)
declare <4 x double> @llvm.roundeven.v4f64(<4 x double>)
declare <16 x float> @llvm.roundeven.v16f32(<16 x float>)
declare <8 x double> @llvm.roundeven.v8f64(<8 x double>)