; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512

declare double @__sqrt_finite(double)
declare float @__sqrtf_finite(float)
declare x86_fp80 @__sqrtl_finite(x86_fp80)
declare float @llvm.sqrt.f32(float)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
declare double @llvm.sqrt.f64(double)
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)

declare float @llvm.fabs.f32(float)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare double @llvm.fabs.f64(double)

define double @finite_f64_no_estimate(double %d) #0 {
; SSE-LABEL: finite_f64_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f64_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

; No estimates for doubles.

define double @finite_f64_estimate(double %d) #1 {
; SSE-LABEL: finite_f64_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f64_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

define float @finite_f32_no_estimate(float %f) #0 {
; SSE-LABEL: finite_f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_ieee(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_ieee:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_ieee_ninf(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_ieee_ninf:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %f) #2
  ret float %call
}

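; The "daz" tests use "denormal-fp-math"="ieee,preserve-sign" (attribute #4),
; i.e. denormal inputs may be treated as (sign-preserved) zero.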
define float @finite_f32_estimate_daz(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_daz:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_daz_ninf(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_daz_ninf:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %f) #2
  ret float %call
}

define x86_fp80 @finite_f80_no_estimate(x86_fp80 %ld) #0 {
; CHECK-LABEL: finite_f80_no_estimate:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fsqrt
; CHECK-NEXT:    retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; Don't die on the impossible.

define x86_fp80 @finite_f80_estimate_but_no(x86_fp80 %ld) #1 {
; CHECK-LABEL: finite_f80_estimate_but_no:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fsqrt
; CHECK-NEXT:    retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; PR34994 - https://bugs.llvm.org/show_bug.cgi?id=34994
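; The ninf vector expansions below guard denormal inputs: the result is
; masked to +0.0 when |x| is below the smallest normal float (1.17549435E-38),
; because x * rsqrt(x) would produce NaN for x == +0.0 and a flushed denormal
; input behaves like zero.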

define float @sqrtf_check_denorms(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtf_check_denorms:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %x) #2
  ret float %call
}

define float @sqrtf_check_denorms_ninf(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtf_check_denorms_ninf:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %x) #2
  ret float %call
}

define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_v4f32_check_denorms:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

define <4 x float> @sqrt_v4f32_check_denorms_ieee_ninf(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms_ieee_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT:    cmpleps %xmm0, %xmm1
; SSE-NEXT:    andps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sqrt_v4f32_check_denorms_ieee_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: sqrt_v4f32_check_denorms_ieee_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

define <4 x float> @sqrt_v4f32_check_denorms_dynamic_ninf(<4 x float> %x) #6 {
; SSE-LABEL: sqrt_v4f32_check_denorms_dynamic_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT:    cmpleps %xmm0, %xmm1
; SSE-NEXT:    andps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sqrt_v4f32_check_denorms_dynamic_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: sqrt_v4f32_check_denorms_dynamic_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm1
; SSE-NEXT:    movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

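; The estimate expansions below refine the hardware rsqrt estimate with one
; Newton-Raphson step:
;   est1 = est0 * (1.5 - 0.5 * x * est0 * est0)
; which is emitted with the -0.5 and -3.0 constants as:
;   est1 = (est0 * -0.5) * (x * est0 * est0 - 3.0)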
define float @f32_estimate(float %x) #1 {
; SSE-LABEL: f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

define float @f32_estimate2(float %x) #5 {
; SSE-LABEL: f32_estimate2:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: f32_estimate2:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = tail call fast float @llvm.sqrt.f32(float %x)
  ret float %sqrt
}

define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v4f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %xmm0, %xmm0
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmulps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

define <4 x float> @v4f32_estimate2(<4 x float> %x) #5 {
; SSE-LABEL: v4f32_estimate2:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm0, %xmm2
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT:    cmpleps %xmm0, %xmm1
; SSE-NEXT:    andps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_estimate2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_estimate2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  ret <4 x float> %sqrt
}

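; Under SSE, the 256-bit and 512-bit tests below are legalized into multiple
; 128-bit operations; on AVX1, the 512-bit test is split into 256-bit halves.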
define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm1, %xmm2
; SSE-NEXT:    sqrtps %xmm0, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    divps %xmm3, %xmm0
; SSE-NEXT:    divps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %ymm0, %ymm0
; AVX-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT:    addps %xmm4, %xmm0
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    rsqrtps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps %xmm4, %xmm1
; SSE-NEXT:    mulps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: v8f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %ymm0, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v8f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %ymm0, %ymm1
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm1 * ymm0) + ymm2
; AVX512-NEXT:    vmulps %ymm3, %ymm1, %ymm0
; AVX512-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; SSE-LABEL: v16f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm3, %xmm4
; SSE-NEXT:    sqrtps %xmm2, %xmm5
; SSE-NEXT:    sqrtps %xmm1, %xmm2
; SSE-NEXT:    sqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    divps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    divps %xmm5, %xmm2
; SSE-NEXT:    divps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: v16f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %ymm1, %ymm1
; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v16f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %zmm0, %zmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm6
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT:    addps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    rsqrtps %xmm1, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    addps %xmm5, %xmm1
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    rsqrtps %xmm2, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    addps %xmm5, %xmm2
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    rsqrtps %xmm3, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm4
; SSE-NEXT:    mulps %xmm6, %xmm3
; SSE-NEXT:    mulps %xmm6, %xmm3
; SSE-NEXT:    addps %xmm5, %xmm3
; SSE-NEXT:    mulps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: v16f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %ymm0, %ymm2
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm4
; AVX1-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vrsqrtps %ymm1, %ymm5
; AVX1-NEXT:    vmulps %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    vmulps %ymm3, %ymm5, %ymm3
; AVX1-NEXT:    vmulps %ymm5, %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm5, %ymm1, %ymm1
; AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm3, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v16f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrt14ps %zmm0, %zmm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem
; AVX512-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512-NEXT:    vmulps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)
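; This holds because fabs(y) * sqrt(z) = sqrt(y*y) * sqrt(z) = sqrt(y*y*z),
; so the quotient becomes x * (1 / sqrt(y*y*z)).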

define float @div_sqrt_fabs_f32(float %x, float %y, float %z) {
; SSE-LABEL: div_sqrt_fabs_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    rsqrtss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call fast float @llvm.sqrt.f32(float %z)
  %a = call fast float @llvm.fabs.f32(float %y)
  %m = fmul fast float %s, %a
  %d = fdiv fast float %x, %m
  ret float %d
}

; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)
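; Same transform for the vector case; note that it fires with only the
; 'reassoc', 'arcp', and 'contract' flags rather than full 'fast'.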

define <4 x float> @div_sqrt_fabs_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulps %xmm1, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    rsqrtps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulps %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul contract reassoc <4 x float> %a, %s
  %d = fdiv contract reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; This has 'arcp' but does not have 'reassoc' FMF.
; We allow converting the sqrt to an estimate, but
; do not pull the divisor into the estimate.
; x / (fabs(y) * sqrt(z)) --> x * rsqrt(z) / fabs(y)
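; Folding fabs(y) into the radicand would form (y*y)*z, and that
; reassociation is not justified by 'arcp' alone.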

define <4 x float> @div_sqrt_fabs_v4f32_fmf(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32_fmf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    divps %xmm1, %xmm3
; SSE-NEXT:    mulps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vdivps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm2, %xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm4, %xmm3, %xmm4
; AVX512-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm4, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vdivps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul <4 x float> %a, %s
  %d = fdiv arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; No estimates for f64, so do not convert fabs into an fmul.

define double @div_sqrt_fabs_f64(double %x, double %y, double %z) {
; SSE-LABEL: div_sqrt_fabs_f64:
; SSE:       # %bb.0:
; SSE-NEXT:    andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    sqrtsd %xmm2, %xmm2
; SSE-NEXT:    mulsd %xmm2, %xmm1
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: div_sqrt_fabs_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vsqrtsd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vmulsd %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %s = call fast double @llvm.sqrt.f64(double %z)
  %a = call fast double @llvm.fabs.f64(double %y)
  %m = fmul fast double %s, %a
  %d = fdiv fast double %x, %m
  ret double %d
}

; This is a special case for the general pattern above -
; if the sqrt operand is the same as the other mul op,
; then fabs may be omitted.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)
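; sqrt(y) is NaN for y < 0, so under these fast-math assumptions y behaves
; like fabs(y) and the general pattern applies with z = y.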

define float @div_sqrt_f32(float %x, float %y) {
; SSE-LABEL: div_sqrt_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    rsqrtss %xmm2, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call fast float @llvm.sqrt.f32(float %y)
  %m = fmul fast float %s, %y
  %d = fdiv fast float %x, %m
  ret float %d
}

; This is a special case for the general pattern above -
; if the sqrt operand is the same as the other mul op,
; then fabs may be omitted.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)

define <4 x float> @div_sqrt_v4f32(<4 x float> %x, <4 x float> %y) {
; SSE-LABEL: div_sqrt_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    rsqrtps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulps %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %y)
  %m = fmul contract reassoc <4 x float> %y, %s
  %d = fdiv contract reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}

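; x / sqrt(x) --> sqrt(x), so the divide in the following tests folds away,
; leaving a single square root.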
define double @sqrt_fdiv_common_operand(double %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  %r = fdiv fast double %x, %sqrt
  ret double %r
}

define <2 x double> @sqrt_fdiv_common_operand_vec(<2 x double> %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtpd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %r = fdiv arcp nsz reassoc <2 x double> %x, %sqrt
  ret <2 x double> %r
}

define double @sqrt_fdiv_common_operand_extra_use(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_extra_use:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_extra_use:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd %xmm0, (%rdi)
; AVX-NEXT:    retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  store double %sqrt, ptr %p
  %r = fdiv fast double %x, %sqrt
  ret double %r
}

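; Both x / sqrt(x) and a reciprocal of the same sqrt are computed below; the
; first folds to sqrt(x) as above and the remaining fdiv reuses it, so only
; one sqrt and one divide are emitted.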
define double @sqrt_simplify_before_recip(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movsd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovsd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %rsqrt = fdiv fast double 1.0, %sqrt
  %sqrt_fast = fdiv fast double %x, %sqrt
  store double %rsqrt, ptr %p, align 8
  ret double %sqrt_fast
}

define <2 x double> @sqrt_simplify_before_recip_vec(<2 x double> %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtpd %xmm0, %xmm0
; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; SSE-NEXT:    divpd %xmm0, %xmm1
; SSE-NEXT:    movupd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; AVX-NEXT:    # xmm1 = mem[0,0]
; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovupd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %rsqrt = fdiv fast <2 x double> <double 1.0, double 1.0>, %sqrt
  %sqrt_fast = fdiv fast <2 x double> %x, %sqrt
  store <2 x double> %rsqrt, ptr %p, align 8
  ret <2 x double> %sqrt_fast
}

define double @sqrt_simplify_before_recip_order(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_order:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movsd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip_order:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovsd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %sqrt_fast = fdiv fast double %x, %sqrt
  %rsqrt = fdiv fast double 42.0, %sqrt
  store double %rsqrt, ptr %p, align 8
  ret double %sqrt_fast
}

attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
attributes #2 = { nounwind readnone }
attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" }
attributes #4 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
attributes #5 = { "unsafe-fp-math"="true" "reciprocal-estimates"="all:0" }
attributes #6 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" }