; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX1
; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX1

declare i32 @llvm.fptoui.sat.i32.f32(float)
declare i64 @llvm.fptosi.sat.i64.f64(double)

define float @trunc_unsigned_f32(float %x) #0 {
; SSE2-LABEL: trunc_unsigned_f32:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttss2si %xmm0, %rax
; SSE2-NEXT: movl %eax, %eax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_unsigned_f32:
; SSE41: # %bb.0:
; SSE41-NEXT: roundss $11, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; X64-AVX1-LABEL: trunc_unsigned_f32:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
; X86-AVX1-LABEL: trunc_unsigned_f32:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %eax
; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
; X86-AVX1-NEXT: flds (%esp)
; X86-AVX1-NEXT: popl %eax
; X86-AVX1-NEXT: retl
  %i = fptoui float %x to i32
  %r = uitofp i32 %i to float
  ret float %r
}

define double @trunc_unsigned_f64(double %x) #0 {
; SSE2-LABEL: trunc_unsigned_f64:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttsd2si %xmm0, %rax
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: sarq $63, %rcx
; SSE2-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: cvttsd2si %xmm0, %rdx
; SSE2-NEXT: andq %rcx, %rdx
; SSE2-NEXT: orq %rax, %rdx
; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE2-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE2-NEXT: addsd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_unsigned_f64:
; SSE41: # %bb.0:
; SSE41-NEXT: roundsd $11, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; X64-AVX1-LABEL: trunc_unsigned_f64:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
; X86-AVX1-LABEL: trunc_unsigned_f64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %ebp
; X86-AVX1-NEXT: movl %esp, %ebp
; X86-AVX1-NEXT: andl $-8, %esp
; X86-AVX1-NEXT: subl $8, %esp
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX1-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovsd %xmm0, (%esp)
; X86-AVX1-NEXT: fldl (%esp)
; X86-AVX1-NEXT: movl %ebp, %esp
; X86-AVX1-NEXT: popl %ebp
; X86-AVX1-NEXT: retl
  %i = fptoui double %x to i64
  %r = uitofp i64 %i to double
  ret double %r
}

define <4 x float> @trunc_unsigned_v4f32(<4 x float> %x) #0 {
; SSE2-LABEL: trunc_unsigned_v4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttps2dq %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_unsigned_v4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: roundps $11, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_unsigned_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vroundps $11, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %i = fptoui <4 x float> %x to <4 x i32>
  %r = uitofp <4 x i32> %i to <4 x float>
  ret <4 x float> %r
}

define <2 x double> @trunc_unsigned_v2f64(<2 x double> %x) #0 {
; SSE2-LABEL: trunc_unsigned_v2f64:
; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0]
; SSE2-NEXT: movapd %xmm0, %xmm1
; SSE2-NEXT: subsd %xmm2, %xmm1
; SSE2-NEXT: cvttsd2si %xmm1, %rax
; SSE2-NEXT: cvttsd2si %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rdx
; SSE2-NEXT: sarq $63, %rdx
; SSE2-NEXT: andq %rax, %rdx
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: cvttsd2si %xmm0, %rax
; SSE2-NEXT: subsd %xmm2, %xmm0
; SSE2-NEXT: cvttsd2si %xmm0, %rcx
; SSE2-NEXT: movq %rax, %rdx
; SSE2-NEXT: sarq $63, %rdx
; SSE2-NEXT: andq %rcx, %rdx
; SSE2-NEXT: orq %rax, %rdx
; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967295,4294967295]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: psrlq $32, %xmm1
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: addpd %xmm0, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_unsigned_v2f64:
; SSE41: # %bb.0:
; SSE41-NEXT: roundpd $11, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_unsigned_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vroundpd $11, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %i = fptoui <2 x double> %x to <2 x i64>
  %r = uitofp <2 x i64> %i to <2 x double>
  ret <2 x double> %r
}

define <4 x double> @trunc_unsigned_v4f64(<4 x double> %x) #0 {
; SSE2-LABEL: trunc_unsigned_v4f64:
; SSE2: # %bb.0:
; SSE2-NEXT: movapd %xmm1, %xmm2
; SSE2-NEXT: movsd {{.*#+}} xmm3 = [9.2233720368547758E+18,0.0E+0]
; SSE2-NEXT: subsd %xmm3, %xmm1
; SSE2-NEXT: cvttsd2si %xmm1, %rax
; SSE2-NEXT: cvttsd2si %xmm2, %rcx
; SSE2-NEXT: movq %rcx, %rdx
; SSE2-NEXT: sarq $63, %rdx
; SSE2-NEXT: andq %rax, %rdx
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE2-NEXT: cvttsd2si %xmm2, %rax
; SSE2-NEXT: subsd %xmm3, %xmm2
; SSE2-NEXT: cvttsd2si %xmm2, %rcx
; SSE2-NEXT: movq %rax, %rdx
; SSE2-NEXT: sarq $63, %rdx
; SSE2-NEXT: andq %rcx, %rdx
; SSE2-NEXT: orq %rax, %rdx
; SSE2-NEXT: movq %rdx, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: movapd %xmm0, %xmm2
; SSE2-NEXT: subsd %xmm3, %xmm2
; SSE2-NEXT: cvttsd2si %xmm2, %rax
; SSE2-NEXT: cvttsd2si %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rdx
; SSE2-NEXT: sarq $63, %rdx
; SSE2-NEXT: andq %rax, %rdx
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: movq %rdx, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: cvttsd2si %xmm0, %rax
; SSE2-NEXT: subsd %xmm3, %xmm0
; SSE2-NEXT: cvttsd2si %xmm0, %rcx
; SSE2-NEXT: movq %rax, %rdx
; SSE2-NEXT: sarq $63, %rdx
; SSE2-NEXT: andq %rcx, %rdx
; SSE2-NEXT: orq %rax, %rdx
; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967295,4294967295]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: psrlq $32, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
; SSE2-NEXT: por %xmm5, %xmm2
; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
; SSE2-NEXT: subpd %xmm6, %xmm2
; SSE2-NEXT: addpd %xmm3, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: psrlq $32, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: subpd %xmm6, %xmm1
; SSE2-NEXT: addpd %xmm0, %xmm1
; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_unsigned_v4f64:
; SSE41: # %bb.0:
; SSE41-NEXT: roundpd $11, %xmm0, %xmm0
; SSE41-NEXT: roundpd $11, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_unsigned_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vroundpd $11, %ymm0, %ymm0
; AVX-NEXT: ret{{[l|q]}}
  %i = fptoui <4 x double> %x to <4 x i64>
  %r = uitofp <4 x i64> %i to <4 x double>
  ret <4 x double> %r
}

define float @trunc_signed_f32_no_fast_math(float %x) {
; SSE-LABEL: trunc_signed_f32_no_fast_math:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; X64-AVX1-LABEL: trunc_signed_f32_no_fast_math:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; X64-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
; X86-AVX1-LABEL: trunc_signed_f32_no_fast_math:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %eax
; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; X86-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
; X86-AVX1-NEXT: flds (%esp)
; X86-AVX1-NEXT: popl %eax
; X86-AVX1-NEXT: .cfi_def_cfa_offset 4
; X86-AVX1-NEXT: retl
  %i = fptosi float %x to i32
  %r = sitofp i32 %i to float
  ret float %r
}

; Without -0.0, it is ok to use roundss if it is available.
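;
; Illustrative example (not part of the checked output): for an input of -0.5,
;   %i = fptosi float %x to i32    ; %i is 0
;   %r = sitofp i32 %i to float    ; %r is +0.0
; while the rounding instructions with immediate 11 (round toward zero) return
; -0.0. The "no-signed-zeros-fp-math" attribute (#0) says that difference does
; not matter, which is why the nsz tests below can use those instructions.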

define float @trunc_signed_f32_nsz(float %x) #0 {
; SSE2-LABEL: trunc_signed_f32_nsz:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_signed_f32_nsz:
; SSE41: # %bb.0:
; SSE41-NEXT: roundss $11, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; X64-AVX1-LABEL: trunc_signed_f32_nsz:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
; X86-AVX1-LABEL: trunc_signed_f32_nsz:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %eax
; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
; X86-AVX1-NEXT: flds (%esp)
; X86-AVX1-NEXT: popl %eax
; X86-AVX1-NEXT: retl
  %i = fptosi float %x to i32
  %r = sitofp i32 %i to float
  ret float %r
}

define double @trunc_signed32_f64_no_fast_math(double %x) {
; SSE-LABEL: trunc_signed32_f64_no_fast_math:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; X64-AVX1-LABEL: trunc_signed32_f64_no_fast_math:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0
; X64-AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
; X86-AVX1-LABEL: trunc_signed32_f64_no_fast_math:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %ebp
; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
; X86-AVX1-NEXT: .cfi_offset %ebp, -8
; X86-AVX1-NEXT: movl %esp, %ebp
; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp
; X86-AVX1-NEXT: andl $-8, %esp
; X86-AVX1-NEXT: subl $8, %esp
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0
; X86-AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
; X86-AVX1-NEXT: vmovlps %xmm0, (%esp)
; X86-AVX1-NEXT: fldl (%esp)
; X86-AVX1-NEXT: movl %ebp, %esp
; X86-AVX1-NEXT: popl %ebp
; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4
; X86-AVX1-NEXT: retl
  %i = fptosi double %x to i32
  %r = sitofp i32 %i to double
  ret double %r
}

define double @trunc_signed32_f64_nsz(double %x) #0 {
; SSE2-LABEL: trunc_signed32_f64_nsz:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_signed32_f64_nsz:
; SSE41: # %bb.0:
; SSE41-NEXT: roundsd $11, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; X64-AVX1-LABEL: trunc_signed32_f64_nsz:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
; X86-AVX1-LABEL: trunc_signed32_f64_nsz:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %ebp
; X86-AVX1-NEXT: movl %esp, %ebp
; X86-AVX1-NEXT: andl $-8, %esp
; X86-AVX1-NEXT: subl $8, %esp
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX1-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovsd %xmm0, (%esp)
; X86-AVX1-NEXT: fldl (%esp)
; X86-AVX1-NEXT: movl %ebp, %esp
; X86-AVX1-NEXT: popl %ebp
; X86-AVX1-NEXT: retl
  %i = fptosi double %x to i32
  %r = sitofp i32 %i to double
  ret double %r
}

define double @trunc_f32_signed32_f64_no_fast_math(float %x) {
; SSE-LABEL: trunc_f32_signed32_f64_no_fast_math:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; X64-AVX1-LABEL: trunc_f32_signed32_f64_no_fast_math:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; X64-AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
; X86-AVX1-LABEL: trunc_f32_signed32_f64_no_fast_math:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %ebp
; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
; X86-AVX1-NEXT: .cfi_offset %ebp, -8
; X86-AVX1-NEXT: movl %esp, %ebp
; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp
; X86-AVX1-NEXT: andl $-8, %esp
; X86-AVX1-NEXT: subl $8, %esp
; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; X86-AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
; X86-AVX1-NEXT: vmovlps %xmm0, (%esp)
; X86-AVX1-NEXT: fldl (%esp)
; X86-AVX1-NEXT: movl %ebp, %esp
; X86-AVX1-NEXT: popl %ebp
; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4
; X86-AVX1-NEXT: retl
  %i = fptosi float %x to i32
  %r = sitofp i32 %i to double
  ret double %r
}

define double @trunc_f32_signed32_f64_nsz(float %x) #0 {
; SSE-LABEL: trunc_f32_signed32_f64_nsz:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; X64-AVX1-LABEL: trunc_f32_signed32_f64_nsz:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; X64-AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
; X86-AVX1-LABEL: trunc_f32_signed32_f64_nsz:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %ebp
; X86-AVX1-NEXT: movl %esp, %ebp
; X86-AVX1-NEXT: andl $-8, %esp
; X86-AVX1-NEXT: subl $8, %esp
; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; X86-AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
; X86-AVX1-NEXT: vmovlps %xmm0, (%esp)
; X86-AVX1-NEXT: fldl (%esp)
; X86-AVX1-NEXT: movl %ebp, %esp
; X86-AVX1-NEXT: popl %ebp
; X86-AVX1-NEXT: retl
  %i = fptosi float %x to i32
  %r = sitofp i32 %i to double
  ret double %r
}

define float @trunc_f64_signed32_f32_no_fast_math(double %x) {
; SSE-LABEL: trunc_f64_signed32_f32_no_fast_math:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; X64-AVX1-LABEL: trunc_f64_signed32_f32_no_fast_math:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0
; X64-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
; X86-AVX1-LABEL: trunc_f64_signed32_f32_no_fast_math:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %eax
; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0
; X86-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
; X86-AVX1-NEXT: flds (%esp)
; X86-AVX1-NEXT: popl %eax
; X86-AVX1-NEXT: .cfi_def_cfa_offset 4
; X86-AVX1-NEXT: retl
  %i = fptosi double %x to i32
  %r = sitofp i32 %i to float
  ret float %r
}

define float @trunc_f64_signed32_f32_nsz(double %x) #0 {
; SSE-LABEL: trunc_f64_signed32_f32_nsz:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; X64-AVX1-LABEL: trunc_f64_signed32_f32_nsz:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0
; X64-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
; X86-AVX1-LABEL: trunc_f64_signed32_f32_nsz:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %eax
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0
; X86-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
; X86-AVX1-NEXT: flds (%esp)
; X86-AVX1-NEXT: popl %eax
; X86-AVX1-NEXT: retl
  %i = fptosi double %x to i32
  %r = sitofp i32 %i to float
  ret float %r
}

define double @trunc_signed_f64_no_fast_math(double %x) {
; SSE-LABEL: trunc_signed_f64_no_fast_math:
; SSE: # %bb.0:
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sd %rax, %xmm0
; SSE-NEXT: retq
;
; X64-AVX1-LABEL: trunc_signed_f64_no_fast_math:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vcvttsd2si %xmm0, %rax
; X64-AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm0
; X64-AVX1-NEXT: retq
;
; X86-AVX1-LABEL: trunc_signed_f64_no_fast_math:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %ebp
; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
; X86-AVX1-NEXT: .cfi_offset %ebp, -8
; X86-AVX1-NEXT: movl %esp, %ebp
; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp
; X86-AVX1-NEXT: andl $-8, %esp
; X86-AVX1-NEXT: subl $24, %esp
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX1-NEXT: vmovsd %xmm0, (%esp)
; X86-AVX1-NEXT: fldl (%esp)
; X86-AVX1-NEXT: fisttpll (%esp)
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX1-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: fildll {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: fstpl {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: fldl {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: movl %ebp, %esp
; X86-AVX1-NEXT: popl %ebp
; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4
; X86-AVX1-NEXT: retl
  %i = fptosi double %x to i64
  %r = sitofp i64 %i to double
  ret double %r
}

define double @trunc_signed_f64_nsz(double %x) #0 {
; SSE2-LABEL: trunc_signed_f64_nsz:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttsd2si %xmm0, %rax
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2sd %rax, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_signed_f64_nsz:
; SSE41: # %bb.0:
; SSE41-NEXT: roundsd $11, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; X64-AVX1-LABEL: trunc_signed_f64_nsz:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
; X86-AVX1-LABEL: trunc_signed_f64_nsz:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %ebp
; X86-AVX1-NEXT: movl %esp, %ebp
; X86-AVX1-NEXT: andl $-8, %esp
; X86-AVX1-NEXT: subl $8, %esp
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX1-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovsd %xmm0, (%esp)
; X86-AVX1-NEXT: fldl (%esp)
; X86-AVX1-NEXT: movl %ebp, %esp
; X86-AVX1-NEXT: popl %ebp
; X86-AVX1-NEXT: retl
  %i = fptosi double %x to i64
  %r = sitofp i64 %i to double
  ret double %r
}

define <4 x float> @trunc_signed_v4f32_nsz(<4 x float> %x) #0 {
; SSE2-LABEL: trunc_signed_v4f32_nsz:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_signed_v4f32_nsz:
; SSE41: # %bb.0:
; SSE41-NEXT: roundps $11, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_signed_v4f32_nsz:
; AVX: # %bb.0:
; AVX-NEXT: vroundps $11, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %i = fptosi <4 x float> %x to <4 x i32>
  %r = sitofp <4 x i32> %i to <4 x float>
  ret <4 x float> %r
}

define <2 x double> @trunc_signed_v2f64_nsz(<2 x double> %x) #0 {
; SSE2-LABEL: trunc_signed_v2f64_nsz:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttsd2si %xmm0, %rax
; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: cvttsd2si %xmm0, %rcx
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2sd %rax, %xmm0
; SSE2-NEXT: cvtsi2sd %rcx, %xmm1
; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_signed_v2f64_nsz:
; SSE41: # %bb.0:
; SSE41-NEXT: roundpd $11, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_signed_v2f64_nsz:
; AVX: # %bb.0:
; AVX-NEXT: vroundpd $11, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %i = fptosi <2 x double> %x to <2 x i64>
  %r = sitofp <2 x i64> %i to <2 x double>
  ret <2 x double> %r
}

define <4 x double> @trunc_signed_v4f64_nsz(<4 x double> %x) #0 {
; SSE2-LABEL: trunc_signed_v4f64_nsz:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttsd2si %xmm1, %rax
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT: cvttsd2si %xmm1, %rcx
; SSE2-NEXT: cvttsd2si %xmm0, %rdx
; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: cvttsd2si %xmm0, %rsi
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2sd %rdx, %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2sd %rsi, %xmm1
; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2sd %rax, %xmm1
; SSE2-NEXT: cvtsi2sd %rcx, %xmm2
; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_signed_v4f64_nsz:
; SSE41: # %bb.0:
; SSE41-NEXT: roundpd $11, %xmm0, %xmm0
; SSE41-NEXT: roundpd $11, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_signed_v4f64_nsz:
; AVX: # %bb.0:
; AVX-NEXT: vroundpd $11, %ymm0, %ymm0
; AVX-NEXT: ret{{[l|q]}}
  %i = fptosi <4 x double> %x to <4 x i64>
  %r = sitofp <4 x i64> %i to <4 x double>
  ret <4 x double> %r
}

; The FTRUNC ("round**" x86 asm) fold relies on UB in the case of overflow.
; This used to be guarded with an attribute check. That allowed existing
; code to continue working based on its assumptions that float->int
; overflow had saturating behavior.
;
; Now, we expect a front-end to use IR intrinsics if it wants to avoid this
; transform.
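;
; For example, a front-end that wants saturating semantics on overflow emits
; the saturating conversion intrinsic (as the two tests below do):
;   %i = call i32 @llvm.fptoui.sat.i32.f32(float %x)
; rather than the plain conversion:
;   %i = fptoui float %x to i32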

define float @trunc_unsigned_f32_disable_via_intrinsic(float %x) #0 {
; SSE-LABEL: trunc_unsigned_f32_disable_via_intrinsic:
; SSE: # %bb.0:
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: xorl %ecx, %ecx
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: ucomiss %xmm1, %xmm0
; SSE-NEXT: cmovael %eax, %ecx
; SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: movl $-1, %eax
; SSE-NEXT: cmovbel %ecx, %eax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ss %rax, %xmm0
; SSE-NEXT: retq
;
; X64-AVX1-LABEL: trunc_unsigned_f32_disable_via_intrinsic:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vcvttss2si %xmm0, %rax
; X64-AVX1-NEXT: xorl %ecx, %ecx
; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX1-NEXT: vucomiss %xmm1, %xmm0
; X64-AVX1-NEXT: cmovael %eax, %ecx
; X64-AVX1-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-AVX1-NEXT: movl $-1, %eax
; X64-AVX1-NEXT: cmovbel %ecx, %eax
; X64-AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
; X64-AVX1-NEXT: retq
;
; X86-AVX1-LABEL: trunc_unsigned_f32_disable_via_intrinsic:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %eax
; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: vcvttss2si %xmm0, %eax
; X86-AVX1-NEXT: movl %eax, %ecx
; X86-AVX1-NEXT: sarl $31, %ecx
; X86-AVX1-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
; X86-AVX1-NEXT: vcvttss2si %xmm1, %edx
; X86-AVX1-NEXT: andl %ecx, %edx
; X86-AVX1-NEXT: orl %eax, %edx
; X86-AVX1-NEXT: xorl %eax, %eax
; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT: vucomiss %xmm1, %xmm0
; X86-AVX1-NEXT: cmovael %edx, %eax
; X86-AVX1-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-AVX1-NEXT: movl $-1, %ecx
; X86-AVX1-NEXT: cmovbel %eax, %ecx
; X86-AVX1-NEXT: vmovd %ecx, %xmm0
; X86-AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
; X86-AVX1-NEXT: flds (%esp)
; X86-AVX1-NEXT: popl %eax
; X86-AVX1-NEXT: retl
  %i = call i32 @llvm.fptoui.sat.i32.f32(float %x)
  %r = uitofp i32 %i to float
  ret float %r
}

define double @trunc_signed_f64_disable_via_intrinsic(double %x) #0 {
; SSE-LABEL: trunc_signed_f64_disable_via_intrinsic:
; SSE: # %bb.0:
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
; SSE-NEXT: cmovbeq %rax, %rcx
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: ucomisd %xmm0, %xmm0
; SSE-NEXT: cmovnpq %rcx, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sd %rax, %xmm0
; SSE-NEXT: retq
;
; X64-AVX1-LABEL: trunc_signed_f64_disable_via_intrinsic:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vcvttsd2si %xmm0, %rax
; X64-AVX1-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-AVX1-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
; X64-AVX1-NEXT: cmovbeq %rax, %rcx
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: vucomisd %xmm0, %xmm0
; X64-AVX1-NEXT: cmovnpq %rcx, %rax
; X64-AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm0
; X64-AVX1-NEXT: retq
;
; X86-AVX1-LABEL: trunc_signed_f64_disable_via_intrinsic:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %ebp
; X86-AVX1-NEXT: movl %esp, %ebp
; X86-AVX1-NEXT: pushl %esi
; X86-AVX1-NEXT: andl $-8, %esp
; X86-AVX1-NEXT: subl $32, %esp
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX1-NEXT: vmovsd %xmm0, (%esp)
; X86-AVX1-NEXT: fldl (%esp)
; X86-AVX1-NEXT: fisttpll (%esp)
; X86-AVX1-NEXT: xorl %eax, %eax
; X86-AVX1-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-AVX1-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
; X86-AVX1-NEXT: movl $0, %edx
; X86-AVX1-NEXT: jb .LBB19_2
; X86-AVX1-NEXT: # %bb.1:
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl (%esp), %edx
; X86-AVX1-NEXT: .LBB19_2:
; X86-AVX1-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-AVX1-NEXT: movl $-1, %esi
; X86-AVX1-NEXT: cmovbel %edx, %esi
; X86-AVX1-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF
; X86-AVX1-NEXT: cmovbel %ecx, %edx
; X86-AVX1-NEXT: vucomisd %xmm0, %xmm0
; X86-AVX1-NEXT: cmovpl %eax, %edx
; X86-AVX1-NEXT: cmovpl %eax, %esi
; X86-AVX1-NEXT: vmovd %esi, %xmm0
; X86-AVX1-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: fildll {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: fstpl {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: fldl {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: leal -4(%ebp), %esp
; X86-AVX1-NEXT: popl %esi
; X86-AVX1-NEXT: popl %ebp
; X86-AVX1-NEXT: retl
  %i = call i64 @llvm.fptosi.sat.i64.f64(double %x)
  %r = sitofp i64 %i to double
  ret double %r
}

attributes #0 = { nounwind "no-signed-zeros-fp-math"="true" }