1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-RECIP 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefixes=AVX,FMA-RECIP 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefixes=AVX,BDVER2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,BTVER2 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,SANDY 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefixes=AVX,HASWELL 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefixes=AVX,HASWELL-NO-FMA 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX,AVX512,KNL 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=AVX,AVX512,SKX 12 13; These tests provide extra coverage for reciprocal-estimate (recip) codegen, as discussed in D26855. 
14 15define float @f32_no_step_2(float %x) #3 { 16; SSE-LABEL: f32_no_step_2: 17; SSE: # %bb.0: 18; SSE-NEXT: rcpss %xmm0, %xmm0 19; SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 20; SSE-NEXT: retq 21; 22; AVX-LABEL: f32_no_step_2: 23; AVX: # %bb.0: 24; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 25; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 26; AVX-NEXT: retq 27 %div = fdiv fast float 1234.0, %x 28 ret float %div 29} 30 31define float @f32_one_step_2(float %x) #1 { 32; SSE-LABEL: f32_one_step_2: 33; SSE: # %bb.0: 34; SSE-NEXT: rcpss %xmm0, %xmm2 35; SSE-NEXT: movss {{.*#+}} xmm1 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0] 36; SSE-NEXT: movaps %xmm2, %xmm3 37; SSE-NEXT: mulss %xmm1, %xmm3 38; SSE-NEXT: mulss %xmm3, %xmm0 39; SSE-NEXT: subss %xmm0, %xmm1 40; SSE-NEXT: mulss %xmm2, %xmm1 41; SSE-NEXT: addss %xmm3, %xmm1 42; SSE-NEXT: movaps %xmm1, %xmm0 43; SSE-NEXT: retq 44; 45; AVX-RECIP-LABEL: f32_one_step_2: 46; AVX-RECIP: # %bb.0: 47; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 48; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0] 49; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3 50; AVX-RECIP-NEXT: vmulss %xmm3, %xmm0, %xmm0 51; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 52; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 53; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0 54; AVX-RECIP-NEXT: retq 55; 56; FMA-RECIP-LABEL: f32_one_step_2: 57; FMA-RECIP: # %bb.0: 58; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 59; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0] 60; FMA-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3 61; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 62; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3 63; FMA-RECIP-NEXT: retq 64; 65; BDVER2-LABEL: f32_one_step_2: 66; BDVER2: # %bb.0: 67; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 68; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0] 69; BDVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3 70; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 
= (xmm0 * xmm3) - xmm2 71; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3 72; BDVER2-NEXT: retq 73; 74; BTVER2-LABEL: f32_one_step_2: 75; BTVER2: # %bb.0: 76; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0] 77; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 78; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3 79; BTVER2-NEXT: vmulss %xmm3, %xmm0, %xmm0 80; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 81; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 82; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0 83; BTVER2-NEXT: retq 84; 85; SANDY-LABEL: f32_one_step_2: 86; SANDY: # %bb.0: 87; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 88; SANDY-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0] 89; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm3 90; SANDY-NEXT: vmulss %xmm3, %xmm0, %xmm0 91; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 92; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 93; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0 94; SANDY-NEXT: retq 95; 96; HASWELL-LABEL: f32_one_step_2: 97; HASWELL: # %bb.0: 98; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 99; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0] 100; HASWELL-NEXT: vmulss %xmm2, %xmm1, %xmm3 101; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 102; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3 103; HASWELL-NEXT: retq 104; 105; HASWELL-NO-FMA-LABEL: f32_one_step_2: 106; HASWELL-NO-FMA: # %bb.0: 107; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 108; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0] 109; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm3 110; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm0, %xmm0 111; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 112; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 113; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0 114; HASWELL-NO-FMA-NEXT: retq 115; 116; AVX512-LABEL: f32_one_step_2: 117; AVX512: # %bb.0: 118; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1 119; AVX512-NEXT: vmovss {{.*#+}} xmm2 = 
[3.456E+3,0.0E+0,0.0E+0,0.0E+0] 120; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm3 121; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 122; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3 123; AVX512-NEXT: retq 124 %div = fdiv fast float 3456.0, %x 125 ret float %div 126} 127 128define float @f32_one_step_2_divs(float %x) #1 { 129; SSE-LABEL: f32_one_step_2_divs: 130; SSE: # %bb.0: 131; SSE-NEXT: rcpss %xmm0, %xmm1 132; SSE-NEXT: mulss %xmm1, %xmm0 133; SSE-NEXT: movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] 134; SSE-NEXT: subss %xmm0, %xmm2 135; SSE-NEXT: mulss %xmm1, %xmm2 136; SSE-NEXT: addss %xmm1, %xmm2 137; SSE-NEXT: movss {{.*#+}} xmm0 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0] 138; SSE-NEXT: mulss %xmm2, %xmm0 139; SSE-NEXT: mulss %xmm2, %xmm0 140; SSE-NEXT: retq 141; 142; AVX-RECIP-LABEL: f32_one_step_2_divs: 143; AVX-RECIP: # %bb.0: 144; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 145; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 146; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] 147; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 148; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 149; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 150; AVX-RECIP-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 151; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 152; AVX-RECIP-NEXT: retq 153; 154; FMA-RECIP-LABEL: f32_one_step_2_divs: 155; FMA-RECIP: # %bb.0: 156; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 157; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem 158; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 159; FMA-RECIP-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 160; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 161; FMA-RECIP-NEXT: retq 162; 163; BDVER2-LABEL: f32_one_step_2_divs: 164; BDVER2: # %bb.0: 165; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 166; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - mem 167; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 168; 
BDVER2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 169; BDVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 170; BDVER2-NEXT: retq 171; 172; BTVER2-LABEL: f32_one_step_2_divs: 173; BTVER2: # %bb.0: 174; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] 175; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 176; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 177; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 178; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 179; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 180; BTVER2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 181; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 182; BTVER2-NEXT: retq 183; 184; SANDY-LABEL: f32_one_step_2_divs: 185; SANDY: # %bb.0: 186; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 187; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 188; SANDY-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] 189; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 190; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 191; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 192; SANDY-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 193; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 194; SANDY-NEXT: retq 195; 196; HASWELL-LABEL: f32_one_step_2_divs: 197; HASWELL: # %bb.0: 198; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 199; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem 200; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 201; HASWELL-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 202; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 203; HASWELL-NEXT: retq 204; 205; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs: 206; HASWELL-NO-FMA: # %bb.0: 207; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 208; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 209; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] 210; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 211; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 212; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 213; HASWELL-NO-FMA-NEXT: vmulss 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 214; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 215; HASWELL-NO-FMA-NEXT: retq 216; 217; AVX512-LABEL: f32_one_step_2_divs: 218; AVX512: # %bb.0: 219; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1 220; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem 221; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 222; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 223; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 224; AVX512-NEXT: retq 225 %div = fdiv fast float 3456.0, %x 226 %div2 = fdiv fast float %div, %x 227 ret float %div2 228} 229 230define float @f32_two_step_2(float %x) #2 { 231; SSE-LABEL: f32_two_step_2: 232; SSE: # %bb.0: 233; SSE-NEXT: rcpss %xmm0, %xmm1 234; SSE-NEXT: movaps %xmm0, %xmm2 235; SSE-NEXT: mulss %xmm1, %xmm2 236; SSE-NEXT: movss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] 237; SSE-NEXT: subss %xmm2, %xmm3 238; SSE-NEXT: mulss %xmm1, %xmm3 239; SSE-NEXT: addss %xmm1, %xmm3 240; SSE-NEXT: movss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0] 241; SSE-NEXT: movaps %xmm3, %xmm2 242; SSE-NEXT: mulss %xmm1, %xmm2 243; SSE-NEXT: mulss %xmm2, %xmm0 244; SSE-NEXT: subss %xmm0, %xmm1 245; SSE-NEXT: mulss %xmm3, %xmm1 246; SSE-NEXT: addss %xmm2, %xmm1 247; SSE-NEXT: movaps %xmm1, %xmm0 248; SSE-NEXT: retq 249; 250; AVX-RECIP-LABEL: f32_two_step_2: 251; AVX-RECIP: # %bb.0: 252; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 253; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2 254; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] 255; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2 256; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2 257; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1 258; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0] 259; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3 260; AVX-RECIP-NEXT: vmulss %xmm3, %xmm0, %xmm0 261; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 262; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 263; AVX-RECIP-NEXT: vaddss 
%xmm0, %xmm3, %xmm0 264; AVX-RECIP-NEXT: retq 265; 266; FMA-RECIP-LABEL: f32_two_step_2: 267; FMA-RECIP: # %bb.0: 268; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 269; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] 270; FMA-RECIP-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 271; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 272; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0] 273; FMA-RECIP-NEXT: vmulss %xmm1, %xmm2, %xmm3 274; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1 275; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 276; FMA-RECIP-NEXT: retq 277; 278; BDVER2-LABEL: f32_two_step_2: 279; BDVER2: # %bb.0: 280; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 281; BDVER2-NEXT: vfmsubss {{.*#+}} xmm2 = (xmm0 * xmm1) - mem 282; BDVER2-NEXT: vmovss {{.*#+}} xmm4 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0] 283; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm1 284; BDVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3 285; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm4 286; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3 287; BDVER2-NEXT: retq 288; 289; BTVER2-LABEL: f32_two_step_2: 290; BTVER2: # %bb.0: 291; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] 292; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 293; BTVER2-NEXT: vmovss {{.*#+}} xmm4 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0] 294; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 295; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 296; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 297; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 298; BTVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3 299; BTVER2-NEXT: vmulss %xmm3, %xmm0, %xmm0 300; BTVER2-NEXT: vsubss %xmm0, %xmm4, %xmm0 301; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 302; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0 303; BTVER2-NEXT: retq 304; 305; SANDY-LABEL: f32_two_step_2: 306; SANDY: # %bb.0: 307; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 308; SANDY-NEXT: vmulss %xmm1, %xmm0, 
%xmm2 309; SANDY-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] 310; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 311; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 312; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 313; SANDY-NEXT: vmovss {{.*#+}} xmm2 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0] 314; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm3 315; SANDY-NEXT: vmulss %xmm3, %xmm0, %xmm0 316; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 317; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 318; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0 319; SANDY-NEXT: retq 320; 321; HASWELL-LABEL: f32_two_step_2: 322; HASWELL: # %bb.0: 323; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 324; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] 325; HASWELL-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 326; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 327; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0] 328; HASWELL-NEXT: vmulss %xmm1, %xmm2, %xmm3 329; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1 330; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 331; HASWELL-NEXT: retq 332; 333; HASWELL-NO-FMA-LABEL: f32_two_step_2: 334; HASWELL-NO-FMA: # %bb.0: 335; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 336; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 337; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] 338; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 339; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 340; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 341; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0] 342; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm3 343; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm0, %xmm0 344; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 345; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 346; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0 347; HASWELL-NO-FMA-NEXT: retq 348; 349; AVX512-LABEL: f32_two_step_2: 350; AVX512: # %bb.0: 351; 
AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1 352; AVX512-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] 353; AVX512-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 354; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 355; AVX512-NEXT: vmovss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0] 356; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm3 357; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1 358; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 359; AVX512-NEXT: retq 360 %div = fdiv fast float 6789.0, %x 361 ret float %div 362} 363 364define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 { 365; SSE-LABEL: v4f32_one_step2: 366; SSE: # %bb.0: 367; SSE-NEXT: rcpps %xmm0, %xmm2 368; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 369; SSE-NEXT: movaps %xmm2, %xmm3 370; SSE-NEXT: mulps %xmm1, %xmm3 371; SSE-NEXT: mulps %xmm3, %xmm0 372; SSE-NEXT: subps %xmm0, %xmm1 373; SSE-NEXT: mulps %xmm2, %xmm1 374; SSE-NEXT: addps %xmm3, %xmm1 375; SSE-NEXT: movaps %xmm1, %xmm0 376; SSE-NEXT: retq 377; 378; AVX-RECIP-LABEL: v4f32_one_step2: 379; AVX-RECIP: # %bb.0: 380; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 381; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 382; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3 383; AVX-RECIP-NEXT: vmulps %xmm3, %xmm0, %xmm0 384; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 385; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 386; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0 387; AVX-RECIP-NEXT: retq 388; 389; FMA-RECIP-LABEL: v4f32_one_step2: 390; FMA-RECIP: # %bb.0: 391; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 392; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 393; FMA-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3 394; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 395; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3 396; FMA-RECIP-NEXT: retq 397; 398; BDVER2-LABEL: v4f32_one_step2: 399; BDVER2: # %bb.0: 400; BDVER2-NEXT: 
vrcpps %xmm0, %xmm1 401; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 402; BDVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3 403; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm2 404; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3 405; BDVER2-NEXT: retq 406; 407; BTVER2-LABEL: v4f32_one_step2: 408; BTVER2: # %bb.0: 409; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 410; BTVER2-NEXT: vrcpps %xmm0, %xmm1 411; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3 412; BTVER2-NEXT: vmulps %xmm3, %xmm0, %xmm0 413; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 414; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 415; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0 416; BTVER2-NEXT: retq 417; 418; SANDY-LABEL: v4f32_one_step2: 419; SANDY: # %bb.0: 420; SANDY-NEXT: vrcpps %xmm0, %xmm1 421; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 422; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm3 423; SANDY-NEXT: vmulps %xmm3, %xmm0, %xmm0 424; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 425; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 426; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0 427; SANDY-NEXT: retq 428; 429; HASWELL-LABEL: v4f32_one_step2: 430; HASWELL: # %bb.0: 431; HASWELL-NEXT: vrcpps %xmm0, %xmm1 432; HASWELL-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 433; HASWELL-NEXT: vmulps %xmm2, %xmm1, %xmm3 434; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 435; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3 436; HASWELL-NEXT: retq 437; 438; HASWELL-NO-FMA-LABEL: v4f32_one_step2: 439; HASWELL-NO-FMA: # %bb.0: 440; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 441; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 442; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm3 443; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm0, %xmm0 444; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 445; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 446; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0 447; 
HASWELL-NO-FMA-NEXT: retq 448; 449; AVX512-LABEL: v4f32_one_step2: 450; AVX512: # %bb.0: 451; AVX512-NEXT: vrcpps %xmm0, %xmm1 452; AVX512-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 453; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm3 454; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 455; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3 456; AVX512-NEXT: retq 457 %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x 458 ret <4 x float> %div 459} 460 461define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 { 462; SSE-LABEL: v4f32_one_step_2_divs: 463; SSE: # %bb.0: 464; SSE-NEXT: rcpps %xmm0, %xmm1 465; SSE-NEXT: mulps %xmm1, %xmm0 466; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 467; SSE-NEXT: subps %xmm0, %xmm2 468; SSE-NEXT: mulps %xmm1, %xmm2 469; SSE-NEXT: addps %xmm1, %xmm2 470; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 471; SSE-NEXT: mulps %xmm2, %xmm0 472; SSE-NEXT: mulps %xmm2, %xmm0 473; SSE-NEXT: retq 474; 475; AVX-RECIP-LABEL: v4f32_one_step_2_divs: 476; AVX-RECIP: # %bb.0: 477; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 478; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 479; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 480; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 481; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 482; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 483; AVX-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 484; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 485; AVX-RECIP-NEXT: retq 486; 487; FMA-RECIP-LABEL: v4f32_one_step_2_divs: 488; FMA-RECIP: # %bb.0: 489; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 490; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem 491; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 492; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 493; FMA-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 494; FMA-RECIP-NEXT: retq 495; 496; BDVER2-LABEL: 
v4f32_one_step_2_divs: 497; BDVER2: # %bb.0: 498; BDVER2-NEXT: vrcpps %xmm0, %xmm1 499; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - mem 500; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 501; BDVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 502; BDVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 503; BDVER2-NEXT: retq 504; 505; BTVER2-LABEL: v4f32_one_step_2_divs: 506; BTVER2: # %bb.0: 507; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 508; BTVER2-NEXT: vrcpps %xmm0, %xmm1 509; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 510; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 511; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 512; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 513; BTVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 514; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 515; BTVER2-NEXT: retq 516; 517; SANDY-LABEL: v4f32_one_step_2_divs: 518; SANDY: # %bb.0: 519; SANDY-NEXT: vrcpps %xmm0, %xmm1 520; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 521; SANDY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 522; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 523; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 524; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 525; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 526; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 527; SANDY-NEXT: retq 528; 529; HASWELL-LABEL: v4f32_one_step_2_divs: 530; HASWELL: # %bb.0: 531; HASWELL-NEXT: vrcpps %xmm0, %xmm1 532; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 533; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2 534; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 535; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 536; HASWELL-NEXT: vmulps %xmm2, %xmm0, %xmm0 537; HASWELL-NEXT: retq 538; 539; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs: 540; HASWELL-NO-FMA: # %bb.0: 541; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 542; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 543; 
HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 544; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 545; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 546; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 547; HASWELL-NO-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 548; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 549; HASWELL-NO-FMA-NEXT: retq 550; 551; KNL-LABEL: v4f32_one_step_2_divs: 552; KNL: # %bb.0: 553; KNL-NEXT: vrcpps %xmm0, %xmm1 554; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 555; KNL-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2 556; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 557; KNL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 558; KNL-NEXT: vmulps %xmm2, %xmm0, %xmm0 559; KNL-NEXT: retq 560; 561; SKX-LABEL: v4f32_one_step_2_divs: 562; SKX: # %bb.0: 563; SKX-NEXT: vrcpps %xmm0, %xmm1 564; SKX-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem 565; SKX-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 566; SKX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 567; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 568; SKX-NEXT: retq 569 %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x 570 %div2 = fdiv fast <4 x float> %div, %x 571 ret <4 x float> %div2 572} 573 574define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { 575; SSE-LABEL: v4f32_two_step2: 576; SSE: # %bb.0: 577; SSE-NEXT: rcpps %xmm0, %xmm1 578; SSE-NEXT: movaps %xmm0, %xmm2 579; SSE-NEXT: mulps %xmm1, %xmm2 580; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 581; SSE-NEXT: subps %xmm2, %xmm3 582; SSE-NEXT: mulps %xmm1, %xmm3 583; SSE-NEXT: addps %xmm1, %xmm3 584; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 585; SSE-NEXT: movaps %xmm3, %xmm2 586; SSE-NEXT: mulps %xmm1, %xmm2 587; SSE-NEXT: mulps %xmm2, %xmm0 588; SSE-NEXT: subps %xmm0, %xmm1 589; SSE-NEXT: mulps %xmm3, %xmm1 590; SSE-NEXT: addps %xmm2, 
%xmm1 591; SSE-NEXT: movaps %xmm1, %xmm0 592; SSE-NEXT: retq 593; 594; AVX-RECIP-LABEL: v4f32_two_step2: 595; AVX-RECIP: # %bb.0: 596; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 597; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 598; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 599; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 600; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 601; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 602; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 603; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3 604; AVX-RECIP-NEXT: vmulps %xmm3, %xmm0, %xmm0 605; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 606; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 607; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0 608; AVX-RECIP-NEXT: retq 609; 610; FMA-RECIP-LABEL: v4f32_two_step2: 611; FMA-RECIP: # %bb.0: 612; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 613; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 614; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 615; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 616; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 617; FMA-RECIP-NEXT: vmulps %xmm1, %xmm2, %xmm3 618; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1 619; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 620; FMA-RECIP-NEXT: retq 621; 622; BDVER2-LABEL: v4f32_two_step2: 623; BDVER2: # %bb.0: 624; BDVER2-NEXT: vrcpps %xmm0, %xmm1 625; BDVER2-NEXT: vfmsubps {{.*#+}} xmm2 = (xmm0 * xmm1) - mem 626; BDVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 627; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm1 628; BDVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3 629; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm4 630; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3 631; BDVER2-NEXT: retq 632; 633; BTVER2-LABEL: v4f32_two_step2: 634; BTVER2: # %bb.0: 635; BTVER2-NEXT: 
vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 636; BTVER2-NEXT: vrcpps %xmm0, %xmm1 637; BTVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 638; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 639; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 640; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 641; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 642; BTVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3 643; BTVER2-NEXT: vmulps %xmm3, %xmm0, %xmm0 644; BTVER2-NEXT: vsubps %xmm0, %xmm4, %xmm0 645; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 646; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0 647; BTVER2-NEXT: retq 648; 649; SANDY-LABEL: v4f32_two_step2: 650; SANDY: # %bb.0: 651; SANDY-NEXT: vrcpps %xmm0, %xmm1 652; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 653; SANDY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 654; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 655; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 656; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 657; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 658; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm3 659; SANDY-NEXT: vmulps %xmm3, %xmm0, %xmm0 660; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 661; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 662; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0 663; SANDY-NEXT: retq 664; 665; HASWELL-LABEL: v4f32_two_step2: 666; HASWELL: # %bb.0: 667; HASWELL-NEXT: vrcpps %xmm0, %xmm1 668; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 669; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 670; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 671; HASWELL-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 672; HASWELL-NEXT: vmulps %xmm1, %xmm2, %xmm3 673; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1 674; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 675; HASWELL-NEXT: retq 676; 677; HASWELL-NO-FMA-LABEL: v4f32_two_step2: 678; HASWELL-NO-FMA: # %bb.0: 679; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 680; 
HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 681; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 682; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 683; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 684; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 685; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 686; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm3 687; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm0, %xmm0 688; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 689; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 690; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0 691; HASWELL-NO-FMA-NEXT: retq 692; 693; AVX512-LABEL: v4f32_two_step2: 694; AVX512: # %bb.0: 695; AVX512-NEXT: vrcpps %xmm0, %xmm1 696; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 697; AVX512-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 698; AVX512-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 699; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 700; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm3 701; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1 702; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 703; AVX512-NEXT: retq 704 %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x 705 ret <4 x float> %div 706} 707 708define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { 709; SSE-LABEL: v8f32_one_step2: 710; SSE: # %bb.0: 711; SSE-NEXT: rcpps %xmm0, %xmm3 712; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 713; SSE-NEXT: movaps %xmm3, %xmm4 714; SSE-NEXT: mulps %xmm2, %xmm4 715; SSE-NEXT: mulps %xmm4, %xmm0 716; SSE-NEXT: subps %xmm0, %xmm2 717; SSE-NEXT: mulps %xmm3, %xmm2 718; SSE-NEXT: addps %xmm4, %xmm2 719; SSE-NEXT: rcpps %xmm1, %xmm0 720; SSE-NEXT: movaps {{.*#+}} xmm3 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] 721; SSE-NEXT: movaps %xmm0, %xmm4 722; SSE-NEXT: mulps %xmm3, %xmm4 723; SSE-NEXT: mulps %xmm4, %xmm1 
724; SSE-NEXT: subps %xmm1, %xmm3 725; SSE-NEXT: mulps %xmm0, %xmm3 726; SSE-NEXT: addps %xmm4, %xmm3 727; SSE-NEXT: movaps %xmm2, %xmm0 728; SSE-NEXT: movaps %xmm3, %xmm1 729; SSE-NEXT: retq 730; 731; AVX-RECIP-LABEL: v8f32_one_step2: 732; AVX-RECIP: # %bb.0: 733; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 734; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 735; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 736; AVX-RECIP-NEXT: vmulps %ymm3, %ymm0, %ymm0 737; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 738; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 739; AVX-RECIP-NEXT: vaddps %ymm0, %ymm3, %ymm0 740; AVX-RECIP-NEXT: retq 741; 742; FMA-RECIP-LABEL: v8f32_one_step2: 743; FMA-RECIP: # %bb.0: 744; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 745; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 746; FMA-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 747; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2 748; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3 749; FMA-RECIP-NEXT: retq 750; 751; BDVER2-LABEL: v8f32_one_step2: 752; BDVER2: # %bb.0: 753; BDVER2-NEXT: vrcpps %ymm0, %ymm1 754; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 755; BDVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 756; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm3) - ymm2 757; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3 758; BDVER2-NEXT: retq 759; 760; BTVER2-LABEL: v8f32_one_step2: 761; BTVER2: # %bb.0: 762; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 763; BTVER2-NEXT: vrcpps %ymm0, %ymm1 764; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 765; BTVER2-NEXT: vmulps %ymm3, %ymm0, %ymm0 766; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 767; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 768; BTVER2-NEXT: vaddps %ymm0, %ymm3, %ymm0 769; BTVER2-NEXT: retq 770; 771; SANDY-LABEL: v8f32_one_step2: 772; SANDY: 
# %bb.0: 773; SANDY-NEXT: vrcpps %ymm0, %ymm1 774; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 775; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 776; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0 777; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 778; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 779; SANDY-NEXT: vaddps %ymm0, %ymm3, %ymm0 780; SANDY-NEXT: retq 781; 782; HASWELL-LABEL: v8f32_one_step2: 783; HASWELL: # %bb.0: 784; HASWELL-NEXT: vrcpps %ymm0, %ymm1 785; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 786; HASWELL-NEXT: vmulps %ymm2, %ymm1, %ymm3 787; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2 788; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3 789; HASWELL-NEXT: retq 790; 791; HASWELL-NO-FMA-LABEL: v8f32_one_step2: 792; HASWELL-NO-FMA: # %bb.0: 793; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 794; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 795; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3 796; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm0, %ymm0 797; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 798; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 799; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm3, %ymm0 800; HASWELL-NO-FMA-NEXT: retq 801; 802; AVX512-LABEL: v8f32_one_step2: 803; AVX512: # %bb.0: 804; AVX512-NEXT: vrcpps %ymm0, %ymm1 805; AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 806; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm3 807; AVX512-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2 808; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3 809; AVX512-NEXT: retq 810 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x 811 ret <8 x float> %div 812} 813 814define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { 815; SSE-LABEL: v8f32_one_step_2_divs: 
816; SSE: # %bb.0: 817; SSE-NEXT: rcpps %xmm0, %xmm2 818; SSE-NEXT: mulps %xmm2, %xmm0 819; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 820; SSE-NEXT: movaps %xmm3, %xmm4 821; SSE-NEXT: subps %xmm0, %xmm4 822; SSE-NEXT: mulps %xmm2, %xmm4 823; SSE-NEXT: addps %xmm2, %xmm4 824; SSE-NEXT: rcpps %xmm1, %xmm0 825; SSE-NEXT: mulps %xmm0, %xmm1 826; SSE-NEXT: subps %xmm1, %xmm3 827; SSE-NEXT: mulps %xmm0, %xmm3 828; SSE-NEXT: addps %xmm0, %xmm3 829; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] 830; SSE-NEXT: mulps %xmm3, %xmm1 831; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 832; SSE-NEXT: mulps %xmm4, %xmm0 833; SSE-NEXT: mulps %xmm4, %xmm0 834; SSE-NEXT: mulps %xmm3, %xmm1 835; SSE-NEXT: retq 836; 837; AVX-RECIP-LABEL: v8f32_one_step_2_divs: 838; AVX-RECIP: # %bb.0: 839; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 840; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 841; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 842; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 843; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 844; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 845; AVX-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 846; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 847; AVX-RECIP-NEXT: retq 848; 849; FMA-RECIP-LABEL: v8f32_one_step_2_divs: 850; FMA-RECIP: # %bb.0: 851; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 852; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem 853; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1 854; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 855; FMA-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 856; FMA-RECIP-NEXT: retq 857; 858; BDVER2-LABEL: v8f32_one_step_2_divs: 859; BDVER2: # %bb.0: 860; BDVER2-NEXT: vrcpps %ymm0, %ymm1 861; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - mem 862; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm1 863; BDVER2-NEXT: vmulps 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 864; BDVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 865; BDVER2-NEXT: retq 866; 867; BTVER2-LABEL: v8f32_one_step_2_divs: 868; BTVER2: # %bb.0: 869; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 870; BTVER2-NEXT: vrcpps %ymm0, %ymm1 871; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 872; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 873; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 874; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 875; BTVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 876; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 877; BTVER2-NEXT: retq 878; 879; SANDY-LABEL: v8f32_one_step_2_divs: 880; SANDY: # %bb.0: 881; SANDY-NEXT: vrcpps %ymm0, %ymm1 882; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 883; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 884; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 885; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 886; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 887; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 888; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 889; SANDY-NEXT: retq 890; 891; HASWELL-LABEL: v8f32_one_step_2_divs: 892; HASWELL: # %bb.0: 893; HASWELL-NEXT: vrcpps %ymm0, %ymm1 894; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 895; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2 896; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 897; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 898; HASWELL-NEXT: vmulps %ymm2, %ymm0, %ymm0 899; HASWELL-NEXT: retq 900; 901; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs: 902; HASWELL-NO-FMA: # %bb.0: 903; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 904; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 905; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 906; HASWELL-NO-FMA-NEXT: vsubps %ymm0, 
%ymm2, %ymm0 907; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 908; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 909; HASWELL-NO-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 910; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 911; HASWELL-NO-FMA-NEXT: retq 912; 913; KNL-LABEL: v8f32_one_step_2_divs: 914; KNL: # %bb.0: 915; KNL-NEXT: vrcpps %ymm0, %ymm1 916; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 917; KNL-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2 918; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 919; KNL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 920; KNL-NEXT: vmulps %ymm2, %ymm0, %ymm0 921; KNL-NEXT: retq 922; 923; SKX-LABEL: v8f32_one_step_2_divs: 924; SKX: # %bb.0: 925; SKX-NEXT: vrcpps %ymm0, %ymm1 926; SKX-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem 927; SKX-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1 928; SKX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 929; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 930; SKX-NEXT: retq 931 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x 932 %div2 = fdiv fast <8 x float> %div, %x 933 ret <8 x float> %div2 934} 935 936define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { 937; SSE-LABEL: v8f32_two_step2: 938; SSE: # %bb.0: 939; SSE-NEXT: rcpps %xmm0, %xmm2 940; SSE-NEXT: movaps %xmm0, %xmm3 941; SSE-NEXT: mulps %xmm2, %xmm3 942; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 943; SSE-NEXT: movaps %xmm4, %xmm5 944; SSE-NEXT: subps %xmm3, %xmm5 945; SSE-NEXT: mulps %xmm2, %xmm5 946; SSE-NEXT: addps %xmm2, %xmm5 947; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 948; SSE-NEXT: movaps %xmm5, %xmm3 949; SSE-NEXT: mulps %xmm2, %xmm3 950; SSE-NEXT: mulps %xmm3, %xmm0 951; SSE-NEXT: subps %xmm0, %xmm2 952; SSE-NEXT: mulps %xmm5, %xmm2 953; SSE-NEXT: addps %xmm3, %xmm2 954; 
SSE-NEXT: rcpps %xmm1, %xmm0 955; SSE-NEXT: movaps %xmm1, %xmm3 956; SSE-NEXT: mulps %xmm0, %xmm3 957; SSE-NEXT: subps %xmm3, %xmm4 958; SSE-NEXT: mulps %xmm0, %xmm4 959; SSE-NEXT: addps %xmm0, %xmm4 960; SSE-NEXT: movaps {{.*#+}} xmm3 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] 961; SSE-NEXT: movaps %xmm4, %xmm0 962; SSE-NEXT: mulps %xmm3, %xmm0 963; SSE-NEXT: mulps %xmm0, %xmm1 964; SSE-NEXT: subps %xmm1, %xmm3 965; SSE-NEXT: mulps %xmm4, %xmm3 966; SSE-NEXT: addps %xmm0, %xmm3 967; SSE-NEXT: movaps %xmm2, %xmm0 968; SSE-NEXT: movaps %xmm3, %xmm1 969; SSE-NEXT: retq 970; 971; AVX-RECIP-LABEL: v8f32_two_step2: 972; AVX-RECIP: # %bb.0: 973; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 974; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 975; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 976; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 977; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 978; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 979; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 980; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 981; AVX-RECIP-NEXT: vmulps %ymm3, %ymm0, %ymm0 982; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 983; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 984; AVX-RECIP-NEXT: vaddps %ymm0, %ymm3, %ymm0 985; AVX-RECIP-NEXT: retq 986; 987; FMA-RECIP-LABEL: v8f32_two_step2: 988; FMA-RECIP: # %bb.0: 989; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 990; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 991; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2 992; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 993; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 994; FMA-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm3 995; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm1 996; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 997; 
FMA-RECIP-NEXT: retq 998; 999; BDVER2-LABEL: v8f32_two_step2: 1000; BDVER2: # %bb.0: 1001; BDVER2-NEXT: vrcpps %ymm0, %ymm1 1002; BDVER2-NEXT: vfmsubps {{.*#+}} ymm2 = (ymm0 * ymm1) - mem 1003; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1004; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm1 1005; BDVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3 1006; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm3) - ymm4 1007; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3 1008; BDVER2-NEXT: retq 1009; 1010; BTVER2-LABEL: v8f32_two_step2: 1011; BTVER2: # %bb.0: 1012; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1013; BTVER2-NEXT: vrcpps %ymm0, %ymm1 1014; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1015; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 1016; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 1017; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 1018; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 1019; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3 1020; BTVER2-NEXT: vmulps %ymm3, %ymm0, %ymm0 1021; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0 1022; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 1023; BTVER2-NEXT: vaddps %ymm0, %ymm3, %ymm0 1024; BTVER2-NEXT: retq 1025; 1026; SANDY-LABEL: v8f32_two_step2: 1027; SANDY: # %bb.0: 1028; SANDY-NEXT: vrcpps %ymm0, %ymm1 1029; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 1030; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1031; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 1032; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 1033; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 1034; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1035; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 1036; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0 1037; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 1038; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 1039; SANDY-NEXT: vaddps 
%ymm0, %ymm3, %ymm0 1040; SANDY-NEXT: retq 1041; 1042; HASWELL-LABEL: v8f32_two_step2: 1043; HASWELL: # %bb.0: 1044; HASWELL-NEXT: vrcpps %ymm0, %ymm1 1045; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1046; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2 1047; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 1048; HASWELL-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1049; HASWELL-NEXT: vmulps %ymm1, %ymm2, %ymm3 1050; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm1 1051; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 1052; HASWELL-NEXT: retq 1053; 1054; HASWELL-NO-FMA-LABEL: v8f32_two_step2: 1055; HASWELL-NO-FMA: # %bb.0: 1056; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 1057; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 1058; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1059; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 1060; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 1061; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 1062; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1063; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3 1064; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm0, %ymm0 1065; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 1066; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 1067; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm3, %ymm0 1068; HASWELL-NO-FMA-NEXT: retq 1069; 1070; AVX512-LABEL: v8f32_two_step2: 1071; AVX512: # %bb.0: 1072; AVX512-NEXT: vrcpps %ymm0, %ymm1 1073; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1074; AVX512-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2 1075; AVX512-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 1076; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = 
[1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1077; AVX512-NEXT: vmulps %ymm1, %ymm2, %ymm3 1078; AVX512-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm1 1079; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 1080; AVX512-NEXT: retq 1081 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x 1082 ret <8 x float> %div 1083} 1084 1085define <8 x float> @v8f32_no_step(<8 x float> %x) #3 { 1086; SSE-LABEL: v8f32_no_step: 1087; SSE: # %bb.0: 1088; SSE-NEXT: rcpps %xmm0, %xmm0 1089; SSE-NEXT: rcpps %xmm1, %xmm1 1090; SSE-NEXT: retq 1091; 1092; AVX-LABEL: v8f32_no_step: 1093; AVX: # %bb.0: 1094; AVX-NEXT: vrcpps %ymm0, %ymm0 1095; AVX-NEXT: retq 1096 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 1097 ret <8 x float> %div 1098} 1099 1100define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 { 1101; SSE-LABEL: v8f32_no_step2: 1102; SSE: # %bb.0: 1103; SSE-NEXT: rcpps %xmm0, %xmm0 1104; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1105; SSE-NEXT: rcpps %xmm1, %xmm1 1106; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1107; SSE-NEXT: retq 1108; 1109; AVX-LABEL: v8f32_no_step2: 1110; AVX: # %bb.0: 1111; AVX-NEXT: vrcpps %ymm0, %ymm0 1112; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1113; AVX-NEXT: retq 1114 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x 1115 ret <8 x float> %div 1116} 1117 1118define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 { 1119; SSE-LABEL: v16f32_one_step2: 1120; SSE: # %bb.0: 1121; SSE-NEXT: movaps %xmm1, %xmm4 1122; SSE-NEXT: movaps %xmm0, %xmm1 1123; SSE-NEXT: rcpps %xmm0, %xmm5 1124; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 1125; SSE-NEXT: movaps %xmm5, %xmm6 1126; SSE-NEXT: mulps %xmm0, %xmm6 1127; SSE-NEXT: mulps %xmm6, %xmm1 1128; SSE-NEXT: 
subps %xmm1, %xmm0 1129; SSE-NEXT: mulps %xmm5, %xmm0 1130; SSE-NEXT: addps %xmm6, %xmm0 1131; SSE-NEXT: rcpps %xmm4, %xmm5 1132; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1133; SSE-NEXT: movaps %xmm5, %xmm6 1134; SSE-NEXT: mulps %xmm1, %xmm6 1135; SSE-NEXT: mulps %xmm6, %xmm4 1136; SSE-NEXT: subps %xmm4, %xmm1 1137; SSE-NEXT: mulps %xmm5, %xmm1 1138; SSE-NEXT: addps %xmm6, %xmm1 1139; SSE-NEXT: rcpps %xmm2, %xmm5 1140; SSE-NEXT: movaps {{.*#+}} xmm4 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1] 1141; SSE-NEXT: movaps %xmm5, %xmm6 1142; SSE-NEXT: mulps %xmm4, %xmm6 1143; SSE-NEXT: mulps %xmm6, %xmm2 1144; SSE-NEXT: subps %xmm2, %xmm4 1145; SSE-NEXT: mulps %xmm5, %xmm4 1146; SSE-NEXT: addps %xmm6, %xmm4 1147; SSE-NEXT: rcpps %xmm3, %xmm2 1148; SSE-NEXT: movaps {{.*#+}} xmm5 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1] 1149; SSE-NEXT: movaps %xmm2, %xmm6 1150; SSE-NEXT: mulps %xmm5, %xmm6 1151; SSE-NEXT: mulps %xmm6, %xmm3 1152; SSE-NEXT: subps %xmm3, %xmm5 1153; SSE-NEXT: mulps %xmm2, %xmm5 1154; SSE-NEXT: addps %xmm6, %xmm5 1155; SSE-NEXT: movaps %xmm4, %xmm2 1156; SSE-NEXT: movaps %xmm5, %xmm3 1157; SSE-NEXT: retq 1158; 1159; AVX-RECIP-LABEL: v16f32_one_step2: 1160; AVX-RECIP: # %bb.0: 1161; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 1162; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1163; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 1164; AVX-RECIP-NEXT: vmulps %ymm4, %ymm0, %ymm0 1165; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 1166; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 1167; AVX-RECIP-NEXT: vaddps %ymm0, %ymm4, %ymm0 1168; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 1169; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] 1170; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 1171; AVX-RECIP-NEXT: vmulps %ymm4, %ymm1, %ymm1 1172; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 1173; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 1174; AVX-RECIP-NEXT: vaddps %ymm1, %ymm4, %ymm1 1175; 
AVX-RECIP-NEXT: retq 1176; 1177; FMA-RECIP-LABEL: v16f32_one_step2: 1178; FMA-RECIP: # %bb.0: 1179; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 1180; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1181; FMA-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 1182; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3 1183; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4 1184; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 1185; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] 1186; FMA-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 1187; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3 1188; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4 1189; FMA-RECIP-NEXT: retq 1190; 1191; BDVER2-LABEL: v16f32_one_step2: 1192; BDVER2: # %bb.0: 1193; BDVER2-NEXT: vrcpps %ymm0, %ymm2 1194; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1195; BDVER2-NEXT: vrcpps %ymm1, %ymm5 1196; BDVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4 1197; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm3 1198; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] 1199; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4 1200; BDVER2-NEXT: vmulps %ymm3, %ymm5, %ymm4 1201; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3 1202; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm5 * ymm1) + ymm4 1203; BDVER2-NEXT: retq 1204; 1205; BTVER2-LABEL: v16f32_one_step2: 1206; BTVER2: # %bb.0: 1207; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1208; BTVER2-NEXT: vrcpps %ymm0, %ymm2 1209; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4 1210; BTVER2-NEXT: vmulps %ymm4, %ymm0, %ymm0 1211; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 1212; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] 1213; BTVER2-NEXT: vmulps 
%ymm0, %ymm2, %ymm0 1214; BTVER2-NEXT: vrcpps %ymm1, %ymm2 1215; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm5 1216; BTVER2-NEXT: vaddps %ymm0, %ymm4, %ymm0 1217; BTVER2-NEXT: vmulps %ymm5, %ymm1, %ymm1 1218; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 1219; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 1220; BTVER2-NEXT: vaddps %ymm1, %ymm5, %ymm1 1221; BTVER2-NEXT: retq 1222; 1223; SANDY-LABEL: v16f32_one_step2: 1224; SANDY: # %bb.0: 1225; SANDY-NEXT: vrcpps %ymm0, %ymm2 1226; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1227; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4 1228; SANDY-NEXT: vmulps %ymm4, %ymm0, %ymm0 1229; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 1230; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 1231; SANDY-NEXT: vaddps %ymm0, %ymm4, %ymm0 1232; SANDY-NEXT: vrcpps %ymm1, %ymm2 1233; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] 1234; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4 1235; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 1236; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 1237; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 1238; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1 1239; SANDY-NEXT: retq 1240; 1241; HASWELL-LABEL: v16f32_one_step2: 1242; HASWELL: # %bb.0: 1243; HASWELL-NEXT: vrcpps %ymm0, %ymm2 1244; HASWELL-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1245; HASWELL-NEXT: vmulps %ymm3, %ymm2, %ymm4 1246; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3 1247; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4 1248; HASWELL-NEXT: vrcpps %ymm1, %ymm2 1249; HASWELL-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] 1250; HASWELL-NEXT: vmulps %ymm3, %ymm2, %ymm4 1251; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3 1252; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4 1253; HASWELL-NEXT: retq 1254; 1255; HASWELL-NO-FMA-LABEL: v16f32_one_step2: 1256; HASWELL-NO-FMA: 
# %bb.0: 1257; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 1258; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1259; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm4 1260; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm0, %ymm0 1261; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 1262; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm3 1263; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 1264; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm4, %ymm0 1265; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] 1266; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm3, %ymm4 1267; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1 1268; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm2, %ymm1 1269; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm3, %ymm1 1270; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm4, %ymm1 1271; HASWELL-NO-FMA-NEXT: retq 1272; 1273; AVX512-LABEL: v16f32_one_step2: 1274; AVX512: # %bb.0: 1275; AVX512-NEXT: vrcp14ps %zmm0, %zmm1 1276; AVX512-NEXT: vmovaps {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] 1277; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm3 1278; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm3 * zmm0) - zmm2 1279; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm3 1280; AVX512-NEXT: retq 1281 %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x 1282 ret <16 x float> %div 1283} 1284 1285define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 { 1286; SSE-LABEL: v16f32_one_step_2_divs: 1287; SSE: # %bb.0: 1288; SSE-NEXT: rcpps %xmm0, %xmm6 1289; SSE-NEXT: mulps %xmm6, %xmm0 1290; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1291; SSE-NEXT: movaps %xmm4, %xmm5 1292; SSE-NEXT: subps %xmm0, %xmm5 1293; SSE-NEXT: mulps %xmm6, 
%xmm5 1294; SSE-NEXT: addps %xmm6, %xmm5 1295; SSE-NEXT: rcpps %xmm1, %xmm0 1296; SSE-NEXT: mulps %xmm0, %xmm1 1297; SSE-NEXT: movaps %xmm4, %xmm6 1298; SSE-NEXT: subps %xmm1, %xmm6 1299; SSE-NEXT: mulps %xmm0, %xmm6 1300; SSE-NEXT: addps %xmm0, %xmm6 1301; SSE-NEXT: rcpps %xmm2, %xmm0 1302; SSE-NEXT: mulps %xmm0, %xmm2 1303; SSE-NEXT: movaps %xmm4, %xmm7 1304; SSE-NEXT: subps %xmm2, %xmm7 1305; SSE-NEXT: mulps %xmm0, %xmm7 1306; SSE-NEXT: addps %xmm0, %xmm7 1307; SSE-NEXT: rcpps %xmm3, %xmm0 1308; SSE-NEXT: mulps %xmm0, %xmm3 1309; SSE-NEXT: subps %xmm3, %xmm4 1310; SSE-NEXT: mulps %xmm0, %xmm4 1311; SSE-NEXT: addps %xmm0, %xmm4 1312; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1] 1313; SSE-NEXT: mulps %xmm4, %xmm3 1314; SSE-NEXT: movaps {{.*#+}} xmm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1] 1315; SSE-NEXT: mulps %xmm7, %xmm2 1316; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] 1317; SSE-NEXT: mulps %xmm6, %xmm1 1318; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 1319; SSE-NEXT: mulps %xmm5, %xmm0 1320; SSE-NEXT: mulps %xmm5, %xmm0 1321; SSE-NEXT: mulps %xmm6, %xmm1 1322; SSE-NEXT: mulps %xmm7, %xmm2 1323; SSE-NEXT: mulps %xmm4, %xmm3 1324; SSE-NEXT: retq 1325; 1326; AVX-RECIP-LABEL: v16f32_one_step_2_divs: 1327; AVX-RECIP: # %bb.0: 1328; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 1329; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 1330; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1331; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 1332; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 1333; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 1334; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 1335; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 1336; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 1337; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 1338; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1 1339; AVX-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 1340; AVX-RECIP-NEXT: vmulps 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 1341; AVX-RECIP-NEXT: vmulps %ymm0, %ymm3, %ymm0 1342; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 1343; AVX-RECIP-NEXT: retq 1344; 1345; FMA-RECIP-LABEL: v16f32_one_step_2_divs: 1346; FMA-RECIP: # %bb.0: 1347; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 1348; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1349; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3 1350; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 1351; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 1352; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm3 1353; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2 1354; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 1355; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 1356; FMA-RECIP-NEXT: vmulps %ymm0, %ymm3, %ymm0 1357; FMA-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 1358; FMA-RECIP-NEXT: retq 1359; 1360; BDVER2-LABEL: v16f32_one_step_2_divs: 1361; BDVER2: # %bb.0: 1362; BDVER2-NEXT: vrcpps %ymm0, %ymm2 1363; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1364; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3 1365; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2 1366; BDVER2-NEXT: vrcpps %ymm1, %ymm2 1367; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm2) - ymm3 1368; BDVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 1369; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm2 1370; BDVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 1371; BDVER2-NEXT: vmulps %ymm0, %ymm3, %ymm0 1372; BDVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 1373; BDVER2-NEXT: retq 1374; 1375; BTVER2-LABEL: v16f32_one_step_2_divs: 1376; BTVER2: # %bb.0: 1377; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1378; BTVER2-NEXT: vrcpps %ymm0, %ymm2 
1379; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 1380; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 1381; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 1382; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0 1383; BTVER2-NEXT: vrcpps %ymm1, %ymm2 1384; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 1385; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 1386; BTVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 1387; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 1388; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 1389; BTVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 1390; BTVER2-NEXT: vmulps %ymm0, %ymm3, %ymm0 1391; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 1392; BTVER2-NEXT: retq 1393; 1394; SANDY-LABEL: v16f32_one_step_2_divs: 1395; SANDY: # %bb.0: 1396; SANDY-NEXT: vrcpps %ymm0, %ymm2 1397; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 1398; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1399; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 1400; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 1401; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 1402; SANDY-NEXT: vrcpps %ymm1, %ymm2 1403; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 1404; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 1405; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 1406; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 1407; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 1408; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 1409; SANDY-NEXT: vmulps %ymm0, %ymm3, %ymm0 1410; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 1411; SANDY-NEXT: retq 1412; 1413; HASWELL-LABEL: v16f32_one_step_2_divs: 1414; HASWELL: # %bb.0: 1415; HASWELL-NEXT: vrcpps %ymm0, %ymm2 1416; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1417; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3 1418; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 1419; HASWELL-NEXT: vrcpps %ymm1, %ymm2 1420; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - 
ymm3 1421; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2 1422; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 1423; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 1424; HASWELL-NEXT: vmulps %ymm0, %ymm3, %ymm0 1425; HASWELL-NEXT: vmulps %ymm1, %ymm2, %ymm1 1426; HASWELL-NEXT: retq 1427; 1428; HASWELL-NO-FMA-LABEL: v16f32_one_step_2_divs: 1429; HASWELL-NO-FMA: # %bb.0: 1430; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 1431; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 1432; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1433; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 1434; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm4 1435; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 1436; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 1437; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1 1438; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1 1439; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm4, %ymm1 1440; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm4, %ymm1 1441; HASWELL-NO-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 1442; HASWELL-NO-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 1443; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm3, %ymm0 1444; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 1445; HASWELL-NO-FMA-NEXT: retq 1446; 1447; AVX512-LABEL: v16f32_one_step_2_divs: 1448; AVX512: # %bb.0: 1449; AVX512-NEXT: vrcp14ps %zmm0, %zmm1 1450; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - mem 1451; AVX512-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm1 1452; AVX512-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 1453; AVX512-NEXT: vmulps %zmm0, %zmm1, %zmm0 1454; AVX512-NEXT: retq 1455 %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x 1456 %div2 = fdiv 
fast <16 x float> %div, %x
  ret <16 x float> %div2
}

; NOTE(review): the CHECK lines in this file are autogenerated by
; utils/update_llc_test_checks.py — regenerate them with that script rather
; than editing them by hand.

; v16f32_two_step2: fdiv fast of a non-splat <16 x float> constant by %x under
; attribute #2 ("reciprocal-estimates"="divf:2,vec-divf:2"). The checks expect
; a hardware reciprocal estimate (rcpps/vrcpps/vrcp14ps) followed by two
; refinement iterations (Newton-Raphson-style mul/sub/mul/add, or FMA forms on
; FMA-capable targets) per vector register, plus the multiply by the non-1.0
; numerator folded into the refinement.
define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
; SSE-LABEL: v16f32_two_step2:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: rcpps %xmm0, %xmm0
; SSE-NEXT: movaps %xmm1, %xmm5
; SSE-NEXT: mulps %xmm0, %xmm5
; SSE-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm6, %xmm7
; SSE-NEXT: subps %xmm5, %xmm7
; SSE-NEXT: mulps %xmm0, %xmm7
; SSE-NEXT: addps %xmm0, %xmm7
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SSE-NEXT: movaps %xmm7, %xmm5
; SSE-NEXT: mulps %xmm0, %xmm5
; SSE-NEXT: mulps %xmm5, %xmm1
; SSE-NEXT: subps %xmm1, %xmm0
; SSE-NEXT: mulps %xmm7, %xmm0
; SSE-NEXT: addps %xmm5, %xmm0
; SSE-NEXT: rcpps %xmm4, %xmm1
; SSE-NEXT: movaps %xmm4, %xmm5
; SSE-NEXT: mulps %xmm1, %xmm5
; SSE-NEXT: movaps %xmm6, %xmm7
; SSE-NEXT: subps %xmm5, %xmm7
; SSE-NEXT: mulps %xmm1, %xmm7
; SSE-NEXT: addps %xmm1, %xmm7
; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; SSE-NEXT: movaps %xmm7, %xmm5
; SSE-NEXT: mulps %xmm1, %xmm5
; SSE-NEXT: mulps %xmm5, %xmm4
; SSE-NEXT: subps %xmm4, %xmm1
; SSE-NEXT: mulps %xmm7, %xmm1
; SSE-NEXT: addps %xmm5, %xmm1
; SSE-NEXT: rcpps %xmm2, %xmm4
; SSE-NEXT: movaps %xmm2, %xmm5
; SSE-NEXT: mulps %xmm4, %xmm5
; SSE-NEXT: movaps %xmm6, %xmm7
; SSE-NEXT: subps %xmm5, %xmm7
; SSE-NEXT: mulps %xmm4, %xmm7
; SSE-NEXT: addps %xmm4, %xmm7
; SSE-NEXT: movaps {{.*#+}} xmm4 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1]
; SSE-NEXT: movaps %xmm7, %xmm5
; SSE-NEXT: mulps %xmm4, %xmm5
; SSE-NEXT: mulps %xmm5, %xmm2
; SSE-NEXT: subps %xmm2, %xmm4
; SSE-NEXT: mulps %xmm7, %xmm4
; SSE-NEXT: addps %xmm5, %xmm4
; SSE-NEXT: rcpps %xmm3, %xmm2
; SSE-NEXT: movaps %xmm3, %xmm5
; SSE-NEXT: mulps %xmm2, %xmm5
; SSE-NEXT: subps %xmm5, %xmm6
; SSE-NEXT: mulps %xmm2, %xmm6
; SSE-NEXT: addps %xmm2, %xmm6
; SSE-NEXT: movaps {{.*#+}} xmm5 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; SSE-NEXT: movaps %xmm6, %xmm2
; SSE-NEXT: mulps %xmm5, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm3
; SSE-NEXT: subps %xmm3, %xmm5
; SSE-NEXT: mulps %xmm6, %xmm5
; SSE-NEXT: addps %xmm2, %xmm5
; SSE-NEXT: movaps %xmm4, %xmm2
; SSE-NEXT: movaps %xmm5, %xmm3
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v16f32_two_step2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3
; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm5
; AVX-RECIP-NEXT: vmulps %ymm5, %ymm0, %ymm0
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm5, %ymm0
; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3
; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4
; AVX-RECIP-NEXT: vmulps %ymm4, %ymm1, %ymm1
; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT: vaddps %ymm1, %ymm4, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_two_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; FMA-RECIP-NEXT: vmulps %ymm2, %ymm4, %ymm5
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm5 * ymm0) - ymm2
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm5
; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} ymm3 = (ymm1 * ymm2) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm2) + ymm2
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; FMA-RECIP-NEXT: vmulps %ymm2, %ymm3, %ymm4
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm2
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm4
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_two_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BDVER2-NEXT: vmulps %ymm4, %ymm2, %ymm5
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm5) - ymm4
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm5
; BDVER2-NEXT: vrcpps %ymm1, %ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm1 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm3) + ymm2
; BDVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm5
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_two_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3
; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm5
; BTVER2-NEXT: vmulps %ymm5, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vrcpps %ymm1, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3
; BTVER2-NEXT: vaddps %ymm0, %ymm5, %ymm0
; BTVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3
; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2
; BTVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4
; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm1
; BTVER2-NEXT: vsubps %ymm1, %ymm5, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT: vaddps %ymm1, %ymm4, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_two_step2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3
; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm5
; SANDY-NEXT: vmulps %ymm5, %ymm0, %ymm0
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm5, %ymm0
; SANDY-NEXT: vrcpps %ymm1, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3
; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4
; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1
; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v16f32_two_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm2
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vmovaps %ymm2, %ymm4
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; HASWELL-NEXT: vmulps %ymm2, %ymm4, %ymm5
; HASWELL-NEXT: vrcpps %ymm1, %ymm6
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm5 * ymm0) - ymm2
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm5
; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm3 = (ymm1 * ymm6) - ymm3
; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm6) + ymm6
; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; HASWELL-NEXT: vmulps %ymm2, %ymm3, %ymm4
; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm2
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm4
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v16f32_two_step2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm3
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3
; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3
; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2
; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm5
; HASWELL-NO-FMA-NEXT: vmulps %ymm5, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm3
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm5, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm4, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm3, %ymm2
; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm3, %ymm2
; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm4
; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm4, %ymm1
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v16f32_two_step2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcp14ps %zmm0, %zmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; AVX512-NEXT: vfnmadd132ps {{.*#+}} zmm2 = -(zmm2 * zmm1) + zmm1
; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; AVX512-NEXT: vmulps %zmm1, %zmm2, %zmm3
; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm3 * zmm0) - zmm1
; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm2 * zmm0) + zmm3
; AVX512-NEXT: retq
  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
  ret <16 x float> %div
}

; v16f32_no_step: splat 1.0/x under attribute #3 ("divf:0,vec-divf:0"). The
; checks expect just the bare estimate instruction (rcpps / vrcpps / vrcp14ps)
; with no refinement step and no trailing multiply, since the numerator is
; all-ones.
define <16 x float> @v16f32_no_step(<16 x float> %x) #3 {
; SSE-LABEL: v16f32_no_step:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm0
; SSE-NEXT: rcpps %xmm1, %xmm1
; SSE-NEXT: rcpps %xmm2, %xmm2
; SSE-NEXT: rcpps %xmm3, %xmm3
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v16f32_no_step:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0
; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_no_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0
; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm1
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_no_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm0
; BDVER2-NEXT: vrcpps %ymm1, %ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_no_step:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vrcpps %ymm0, %ymm0
; BTVER2-NEXT: vrcpps %ymm1, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_no_step:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm0
; SANDY-NEXT: vrcpps %ymm1, %ymm1
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v16f32_no_step:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm0
; HASWELL-NEXT: vrcpps %ymm1, %ymm1
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v16f32_no_step:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v16f32_no_step:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcp14ps %zmm0, %zmm0
; AVX512-NEXT: retq
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <16 x float> %div
}

; v16f32_no_step2: same zero-refinement attribute, but with a non-uniform
; constant numerator, so each estimate is followed by one multiply against a
; constant-pool vector (the {{\.?LCPI...}}(%rip) operand).
define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
; SSE-LABEL: v16f32_no_step2:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm0
; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: rcpps %xmm1, %xmm1
; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: rcpps %xmm2, %xmm2
; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: rcpps %xmm3, %xmm3
; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v16f32_no_step2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0
; AVX-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm1
; AVX-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_no_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0
; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm1
; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_no_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm0
; BDVER2-NEXT: vrcpps %ymm1, %ymm1
; BDVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; BDVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_no_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vrcpps %ymm0, %ymm0
; BTVER2-NEXT: vrcpps %ymm1, %ymm1
; BTVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; BTVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_no_step2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm0
; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; SANDY-NEXT: vrcpps %ymm1, %ymm1
; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v16f32_no_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm0
; HASWELL-NEXT: vrcpps %ymm1, %ymm1
; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v16f32_no_step2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v16f32_no_step2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcp14ps %zmm0, %zmm0
; AVX512-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
  ret <16 x float> %div
}

; Reciprocal-estimate configurations exercised by the test functions. The
; ":N" suffix appears to select the number of refinement iterations (#2 = two
; iterations, #3 = estimate only) based on the codegen checked above; the "!"
; prefix in #0 presumably disables the estimate entirely — confirm against the
; llc -recip / TargetRecip documentation.
attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:0,vec-divf:0" }
