; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefixes=AVX,FMA-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefixes=AVX,BDVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefixes=AVX,HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefixes=AVX,HASWELL-NO-FMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX,AVX512,KNL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=AVX,AVX512,SKX

; If the target's divss/divps instructions are substantially
; slower than rcpss/rcpps with a Newton-Raphson refinement,
; we should generate the estimate sequence.

; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 )
; for details about the accuracy, speed, and implementation
; differences of x86 reciprocal estimates.
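
; A reminder of the math behind the sequences checked below: given an
; estimate x0 ~= 1/d from rcpss/rcpps, one Newton-Raphson step computes
;   x1 = x0 * (2 - d * x0) = x0 + x0 * (1 - d * x0)
; and each step roughly doubles the number of correct bits. The second
; form is the one that appears below, since it maps directly onto an
; fmsub/fnmadd pair on targets that have FMA.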

define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT: divss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: f32_no_estimate:
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %div = fdiv fast float 1.0, %x
  ret float %div
}

define float @f32_one_step(float %x) #1 {
; SSE-LABEL: f32_one_step:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm2
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT: subss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm2, %xmm1
; SSE-NEXT: addss %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_one_step:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_one_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: f32_one_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - mem
; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: f32_one_step:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: f32_one_step:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0
; SANDY-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: f32_one_step:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: f32_one_step:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: f32_one_step:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; AVX512-NEXT: retq
  %div = fdiv fast float 1.0, %x
  ret float %div
}

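; For a general quotient a/b, the step refines the product instead of the
; reciprocal: q0 = a * x0, then q1 = q0 + x0 * (a - q0 * b). That is the
; vfmsub/vfnmadd pair in the FMA-based checks below.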
define float @f32_one_step_variables(float %x, float %y) #1 {
; SSE-LABEL: f32_one_step_variables:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm1, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: mulss %xmm2, %xmm3
; SSE-NEXT: mulss %xmm3, %xmm1
; SSE-NEXT: subss %xmm1, %xmm0
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: addss %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_one_step_variables:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2
; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3
; AVX-RECIP-NEXT: vmulss %xmm3, %xmm1, %xmm1
; AVX-RECIP-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_one_step_variables:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2
; FMA-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3
; FMA-RECIP-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: f32_one_step_variables:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2
; BDVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3
; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm1 * xmm3) - xmm0
; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: f32_one_step_variables:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2
; BTVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3
; BTVER2-NEXT: vmulss %xmm3, %xmm1, %xmm1
; BTVER2-NEXT: vsubss %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vmulss %xmm0, %xmm2, %xmm0
; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: f32_one_step_variables:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm1, %xmm1, %xmm2
; SANDY-NEXT: vmulss %xmm2, %xmm0, %xmm3
; SANDY-NEXT: vmulss %xmm3, %xmm1, %xmm1
; SANDY-NEXT: vsubss %xmm1, %xmm0, %xmm0
; SANDY-NEXT: vmulss %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: f32_one_step_variables:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm2
; HASWELL-NEXT: vmulss %xmm2, %xmm0, %xmm3
; HASWELL-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: f32_one_step_variables:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm1, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm0, %xmm3
; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT: vsubss %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: f32_one_step_variables:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpss %xmm1, %xmm1, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm3
; AVX512-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; AVX512-NEXT: retq
  %div = fdiv fast float %x, %y
  ret float %div
}

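; A second refinement step (requested via "divf:2" in attribute #2 below)
; simply repeats the correction, roughly squaring the relative error again.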
define float @f32_two_step(float %x) #2 {
; SSE-LABEL: f32_two_step:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: mulss %xmm2, %xmm3
; SSE-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: subss %xmm3, %xmm4
; SSE-NEXT: mulss %xmm2, %xmm4
; SSE-NEXT: addss %xmm2, %xmm4
; SSE-NEXT: mulss %xmm4, %xmm0
; SSE-NEXT: subss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm4, %xmm1
; SSE-NEXT: addss %xmm4, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_two_step:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_two_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; BDVER2-NEXT: vfmsubss {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2
; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1
; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: f32_two_step:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2
; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2
; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2
; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: f32_two_step:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2
; SANDY-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2
; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0
; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: f32_two_step:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3
; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: f32_two_step:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: f32_two_step:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX512-NEXT: vmovaps %xmm1, %xmm3
; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
; AVX512-NEXT: retq
  %div = fdiv fast float 1.0, %x
  ret float %div
}

define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: divps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v4f32_no_estimate:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <4 x float> %div
}

define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: subps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: addps %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_one_step:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_one_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v4f32_one_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - mem
; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v4f32_one_step:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0
; SANDY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v4f32_one_step:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; KNL-LABEL: v4f32_one_step:
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1
; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; KNL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; KNL-NEXT: retq
;
; SKX-LABEL: v4f32_one_step:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %xmm0, %xmm1
; SKX-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; SKX-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; SKX-NEXT: retq
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <4 x float> %div
}

define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1 {
; SSE-LABEL: v4f32_one_step_variables:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm1, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: mulps %xmm2, %xmm3
; SSE-NEXT: mulps %xmm3, %xmm1
; SSE-NEXT: subps %xmm1, %xmm0
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: addps %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_one_step_variables:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm1, %xmm2
; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3
; AVX-RECIP-NEXT: vmulps %xmm3, %xmm1, %xmm1
; AVX-RECIP-NEXT: vsubps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_one_step_variables:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm1, %xmm2
; FMA-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3
; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v4f32_one_step_variables:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm1, %xmm2
; BDVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm1 * xmm3) - xmm0
; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step_variables:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vrcpps %xmm1, %xmm2
; BTVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3
; BTVER2-NEXT: vmulps %xmm3, %xmm1, %xmm1
; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vmulps %xmm0, %xmm2, %xmm0
; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v4f32_one_step_variables:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm1, %xmm2
; SANDY-NEXT: vmulps %xmm2, %xmm0, %xmm3
; SANDY-NEXT: vmulps %xmm3, %xmm1, %xmm1
; SANDY-NEXT: vsubps %xmm1, %xmm0, %xmm0
; SANDY-NEXT: vmulps %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v4f32_one_step_variables:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm1, %xmm2
; HASWELL-NEXT: vmulps %xmm2, %xmm0, %xmm3
; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step_variables:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm0, %xmm3
; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT: vsubps %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v4f32_one_step_variables:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpps %xmm1, %xmm2
; AVX512-NEXT: vmulps %xmm2, %xmm0, %xmm3
; AVX512-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; AVX512-NEXT: retq
  %div = fdiv fast <4 x float> %x, %y
  ret <4 x float> %div
}

define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; SSE-LABEL: v4f32_two_step:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: mulps %xmm2, %xmm3
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: subps %xmm3, %xmm4
; SSE-NEXT: mulps %xmm2, %xmm4
; SSE-NEXT: addps %xmm2, %xmm4
; SSE-NEXT: mulps %xmm4, %xmm0
; SSE-NEXT: subps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm4, %xmm1
; SSE-NEXT: addps %xmm4, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_two_step:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_two_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v4f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
; BDVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v4f32_two_step:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2
; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2
; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2
; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v4f32_two_step:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2
; SANDY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0
; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v4f32_two_step:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3
; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v4f32_two_step:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpps %xmm0, %xmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vmovaps %xmm1, %xmm3
; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
; AVX512-NEXT: vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; AVX512-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
; AVX512-NEXT: retq
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <4 x float> %div
}

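; The remaining types are wider than a legal SSE register, so the SSE
; checks show the same sequence repeated on 128-bit halves.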
define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: divps %xmm0, %xmm3
; SSE-NEXT: divps %xmm1, %xmm2
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: v8f32_no_estimate:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <8 x float> %div
}

define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_one_step:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm4
; SSE-NEXT: mulps %xmm4, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: subps %xmm0, %xmm3
; SSE-NEXT: mulps %xmm4, %xmm3
; SSE-NEXT: addps %xmm4, %xmm3
; SSE-NEXT: rcpps %xmm1, %xmm0
; SSE-NEXT: mulps %xmm0, %xmm1
; SSE-NEXT: subps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm0, %xmm2
; SSE-NEXT: addps %xmm0, %xmm2
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v8f32_one_step:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_one_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v8f32_one_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - mem
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v8f32_one_step:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v8f32_one_step:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v8f32_one_step:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT: retq
;
; KNL-LABEL: v8f32_one_step:
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %ymm0, %ymm1
; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; KNL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: v8f32_one_step:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %ymm0, %ymm1
; SKX-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem
; SKX-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
; SKX-NEXT: retq
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <8 x float> %div
}

define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SSE-LABEL: v8f32_two_step:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: rcpps %xmm0, %xmm3
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: mulps %xmm3, %xmm4
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm1, %xmm5
; SSE-NEXT: subps %xmm4, %xmm5
; SSE-NEXT: mulps %xmm3, %xmm5
; SSE-NEXT: addps %xmm3, %xmm5
; SSE-NEXT: mulps %xmm5, %xmm0
; SSE-NEXT: movaps %xmm1, %xmm3
; SSE-NEXT: subps %xmm0, %xmm3
; SSE-NEXT: mulps %xmm5, %xmm3
; SSE-NEXT: addps %xmm5, %xmm3
; SSE-NEXT: rcpps %xmm2, %xmm0
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: mulps %xmm0, %xmm4
; SSE-NEXT: movaps %xmm1, %xmm5
; SSE-NEXT: subps %xmm4, %xmm5
; SSE-NEXT: mulps %xmm0, %xmm5
; SSE-NEXT: addps %xmm0, %xmm5
; SSE-NEXT: mulps %xmm5, %xmm2
; SSE-NEXT: subps %xmm2, %xmm1
; SSE-NEXT: mulps %xmm5, %xmm1
; SSE-NEXT: addps %xmm5, %xmm1
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v8f32_two_step:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2
; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2
; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_two_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm3) + ymm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v8f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm0 * ymm1) - ymm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm1
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v8f32_two_step:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2
; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v8f32_two_step:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2
; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v8f32_two_step:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vmovaps %ymm1, %ymm3
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm3) + ymm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v8f32_two_step:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v8f32_two_step:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpps %ymm0, %ymm1
; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vmovaps %ymm1, %ymm3
; AVX512-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2
; AVX512-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1
; AVX512-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2
; AVX512-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm3) + ymm3
; AVX512-NEXT: retq
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <8 x float> %div
}

define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; SSE-LABEL: v16f32_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm4, %xmm5
; SSE-NEXT: divps %xmm0, %xmm5
; SSE-NEXT: movaps %xmm4, %xmm6
; SSE-NEXT: divps %xmm1, %xmm6
; SSE-NEXT: movaps %xmm4, %xmm7
; SSE-NEXT: divps %xmm2, %xmm7
; SSE-NEXT: divps %xmm3, %xmm4
; SSE-NEXT: movaps %xmm5, %xmm0
; SSE-NEXT: movaps %xmm6, %xmm1
; SSE-NEXT: movaps %xmm7, %xmm2
; SSE-NEXT: movaps %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v16f32_no_estimate:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_no_estimate:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0
; FMA-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_no_estimate:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0
; BDVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_no_estimate:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_no_estimate:
; SANDY: # %bb.0:
; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vdivps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vdivps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v16f32_no_estimate:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vdivps %ymm0, %ymm2, %ymm0
; HASWELL-NEXT: vdivps %ymm1, %ymm2, %ymm1
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v16f32_no_estimate:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vdivps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v16f32_no_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <16 x float> %div
}

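; Note: for 512-bit vectors, AVX512 uses vrcp14ps, whose estimate is
; already accurate to about 14 bits, so a single Newton-Raphson step is
; enough to reach single-precision accuracy.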
define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_one_step:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm3, %xmm4
; SSE-NEXT: movaps %xmm0, %xmm5
; SSE-NEXT: rcpps %xmm0, %xmm6
; SSE-NEXT: mulps %xmm6, %xmm5
; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: subps %xmm5, %xmm0
; SSE-NEXT: mulps %xmm6, %xmm0
; SSE-NEXT: addps %xmm6, %xmm0
; SSE-NEXT: rcpps %xmm1, %xmm6
; SSE-NEXT: mulps %xmm6, %xmm1
; SSE-NEXT: movaps %xmm3, %xmm5
; SSE-NEXT: subps %xmm1, %xmm5
; SSE-NEXT: mulps %xmm6, %xmm5
; SSE-NEXT: addps %xmm6, %xmm5
; SSE-NEXT: rcpps %xmm2, %xmm1
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: movaps %xmm3, %xmm6
; SSE-NEXT: subps %xmm2, %xmm6
; SSE-NEXT: mulps %xmm1, %xmm6
; SSE-NEXT: addps %xmm1, %xmm6
; SSE-NEXT: rcpps %xmm4, %xmm1
; SSE-NEXT: mulps %xmm1, %xmm4
; SSE-NEXT: subps %xmm4, %xmm3
; SSE-NEXT: mulps %xmm1, %xmm3
; SSE-NEXT: addps %xmm1, %xmm3
; SSE-NEXT: movaps %xmm5, %xmm1
; SSE-NEXT: movaps %xmm6, %xmm2
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v16f32_one_step:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1
; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_one_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_one_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vrcpps %ymm1, %ymm4
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm4
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_one_step:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vrcpps %ymm1, %ymm4
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm1
; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1
; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vmulps %ymm1, %ymm4, %ymm1
; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vaddps %ymm1, %ymm4, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_one_step:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vrcpps %ymm1, %ymm4
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1
; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm4, %ymm1
; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v16f32_one_step:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm2
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vrcpps %ymm1, %ymm4
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm4) + ymm4
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v16f32_one_step:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v16f32_one_step:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcp14ps %zmm0, %zmm1
; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - mem
; AVX512-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm1
; AVX512-NEXT: retq
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <16 x float> %div
}

define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
; SSE-LABEL: v16f32_two_step:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm3, %xmm4
; SSE-NEXT: movaps %xmm1, %xmm5
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: rcpps %xmm0, %xmm0
; SSE-NEXT: movaps %xmm1, %xmm6
; SSE-NEXT: mulps %xmm0, %xmm6
; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm3, %xmm7
; SSE-NEXT: subps %xmm6, %xmm7
; SSE-NEXT: mulps %xmm0, %xmm7
; SSE-NEXT: addps %xmm0, %xmm7
; SSE-NEXT: mulps %xmm7, %xmm1
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: subps %xmm1, %xmm0
; SSE-NEXT: mulps %xmm7, %xmm0
; SSE-NEXT: addps %xmm7, %xmm0
; SSE-NEXT: rcpps %xmm5, %xmm1
; SSE-NEXT: movaps %xmm5, %xmm6
; SSE-NEXT: mulps %xmm1, %xmm6
; SSE-NEXT: movaps %xmm3, %xmm7
; SSE-NEXT: subps %xmm6, %xmm7
; SSE-NEXT: mulps %xmm1, %xmm7
; SSE-NEXT: addps %xmm1, %xmm7
; SSE-NEXT: mulps %xmm7, %xmm5
; SSE-NEXT: movaps %xmm3, %xmm1
; SSE-NEXT: subps %xmm5, %xmm1
; SSE-NEXT: mulps %xmm7, %xmm1
; SSE-NEXT: addps %xmm7, %xmm1
; SSE-NEXT: rcpps %xmm2, %xmm5
; SSE-NEXT: movaps %xmm2, %xmm6
; SSE-NEXT: mulps %xmm5, %xmm6
; SSE-NEXT: movaps %xmm3, %xmm7
; SSE-NEXT: subps %xmm6, %xmm7
; SSE-NEXT: mulps %xmm5, %xmm7
; SSE-NEXT: addps %xmm5, %xmm7
; SSE-NEXT: mulps %xmm7, %xmm2
; SSE-NEXT: movaps %xmm3, %xmm5
; SSE-NEXT: subps %xmm2, %xmm5
; SSE-NEXT: mulps %xmm7, %xmm5
; SSE-NEXT: addps %xmm7, %xmm5
; SSE-NEXT: rcpps %xmm4, %xmm2
; SSE-NEXT: movaps %xmm4, %xmm6
; SSE-NEXT: mulps %xmm2, %xmm6
; SSE-NEXT: movaps %xmm3, %xmm7
; SSE-NEXT: subps %xmm6, %xmm7
; SSE-NEXT: mulps %xmm2, %xmm7
; SSE-NEXT: addps %xmm2, %xmm7
; SSE-NEXT: mulps %xmm7, %xmm4
; SSE-NEXT: subps %xmm4, %xmm3
; SSE-NEXT: mulps %xmm7, %xmm3
; SSE-NEXT: addps %xmm7, %xmm3
; SSE-NEXT: movaps %xmm5, %xmm2
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v16f32_two_step:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3
; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm4, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3
; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1
; AVX-RECIP-NEXT: vsubps %ymm1, %ymm4, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_two_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm4) + ymm4
; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm1 * ymm4) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm4) + ymm4
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
; BDVER2-NEXT: vrcpps %ymm1, %ymm2
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm1 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm2
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_two_step:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3
; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0
; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vrcpps %ymm1, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3
; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1
; BTVER2-NEXT: vsubps %ymm1, %ymm4, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_two_step:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3
; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
; SANDY-NEXT: vsubps %ymm0, %ymm4, %ymm0
; SANDY-NEXT: vrcpps %ymm1, %ymm3
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm3, %ymm1, %ymm2
; SANDY-NEXT: vsubps %ymm2, %ymm4, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm3, %ymm2
; SANDY-NEXT: vaddps %ymm2, %ymm3, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1
; SANDY-NEXT: vsubps %ymm1, %ymm4, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v16f32_two_step:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm2
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vmovaps %ymm2, %ymm4
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm4) + ymm4
; HASWELL-NEXT: vrcpps %ymm1, %ymm2
; HASWELL-NEXT: vmovaps %ymm2, %ymm4
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm1 * ymm4) - ymm3
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm4) + ymm4
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v16f32_two_step:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm3
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3
; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3
; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm4, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3
; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3
; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3
; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm4, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v16f32_two_step:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcp14ps %zmm0, %zmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vmovaps %zmm1, %zmm3
; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm3 = (zmm0 * zmm3) - zmm2
; AVX512-NEXT: vfnmadd132ps {{.*#+}} zmm3 = -(zmm3 * zmm1) + zmm1
; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm3 * zmm0) - zmm2
; AVX512-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm3) + zmm3
; AVX512-NEXT: retq
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <16 x float> %div
}

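; "!divf,!vec-divf" disables the estimates, "divf,vec-divf" enables them
; with the default number of refinement steps, and "divf:2,vec-divf:2"
; requests two steps.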
attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }