; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma | FileCheck %s --check-prefix=FMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=FMA

; PR31866
; complex float complex_square_f32(complex float x) {
;   return x*x;
; }

define <2 x float> @complex_square_f32(<2 x float>) #0 {
; SSE-LABEL: complex_square_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    addss %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm0, %xmm0
; SSE-NEXT:    mulss %xmm1, %xmm1
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: complex_square_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX1-NEXT:    retq
;
; FMA-LABEL: complex_square_f32:
; FMA:       # %bb.0:
; FMA-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; FMA-NEXT:    vaddss %xmm0, %xmm0, %xmm2
; FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; FMA-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; FMA-NEXT:    vfmsub231ss {{.*#+}} xmm1 = (xmm0 * xmm0) - xmm1
; FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[2,3]
; FMA-NEXT:    retq
  %2 = extractelement <2 x float> %0, i32 0
  %3 = extractelement <2 x float> %0, i32 1
  %4 = fmul fast float %3, 2.000000e+00
  %5 = fmul fast float %4, %2
  %6 = fmul fast float %2, %2
  %7 = fmul fast float %3, %3
  %8 = fsub fast float %6, %7
  %9 = insertelement <2 x float> undef, float %8, i32 0
  %10 = insertelement <2 x float> %9, float %5, i32 1
  ret <2 x float> %10
}

define <2 x double> @complex_square_f64(<2 x double>) #0 {
; SSE-LABEL: complex_square_f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    addsd %xmm0, %xmm2
; SSE-NEXT:    mulsd %xmm1, %xmm2
; SSE-NEXT:    mulsd %xmm0, %xmm0
; SSE-NEXT:    mulsd %xmm1, %xmm1
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: complex_square_f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-NEXT:    vaddsd %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vmulsd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vmulsd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmulsd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    retq
;
; FMA-LABEL: complex_square_f64:
; FMA:       # %bb.0:
; FMA-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; FMA-NEXT:    vaddsd %xmm0, %xmm0, %xmm2
; FMA-NEXT:    vmulsd %xmm2, %xmm1, %xmm2
; FMA-NEXT:    vmulsd %xmm1, %xmm1, %xmm1
; FMA-NEXT:    vfmsub231sd {{.*#+}} xmm1 = (xmm0 * xmm0) - xmm1
; FMA-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm2[0]
; FMA-NEXT:    retq
  %2 = extractelement <2 x double> %0, i32 0
  %3 = extractelement <2 x double> %0, i32 1
  %4 = fmul fast double %3, 2.000000e+00
  %5 = fmul fast double %4, %2
  %6 = fmul fast double %2, %2
  %7 = fmul fast double %3, %3
  %8 = fsub fast double %6, %7
  %9 = insertelement <2 x double> undef, double %8, i32 0
  %10 = insertelement <2 x double> %9, double %5, i32 1
  ret <2 x double> %10
}

; complex float complex_mul_f32(complex float x, complex float y) {
;   return x*y;
; }

define <2 x float> @complex_mul_f32(<2 x float>, <2 x float>) #0 {
; SSE-LABEL: complex_mul_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    movaps %xmm3, %xmm4
; SSE-NEXT:    mulss %xmm0, %xmm4
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    addss %xmm4, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm3
; SSE-NEXT:    subss %xmm3, %xmm0
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: complex_mul_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vmulss %xmm0, %xmm3, %xmm4
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm5
; AVX1-NEXT:    vaddss %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmulss %xmm2, %xmm3, %xmm1
; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
; AVX1-NEXT:    retq
;
; FMA-LABEL: complex_mul_f32:
; FMA:       # %bb.0:
; FMA-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; FMA-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm4
; FMA-NEXT:    vfmadd231ss {{.*#+}} xmm4 = (xmm3 * xmm0) + xmm4
; FMA-NEXT:    vmulss %xmm2, %xmm3, %xmm2
; FMA-NEXT:    vfmsub231ss {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[2,3]
; FMA-NEXT:    retq
  %3 = extractelement <2 x float> %0, i32 0
  %4 = extractelement <2 x float> %0, i32 1
  %5 = extractelement <2 x float> %1, i32 0
  %6 = extractelement <2 x float> %1, i32 1
  %7 = fmul fast float %6, %3
  %8 = fmul fast float %5, %4
  %9 = fadd fast float %7, %8
  %10 = fmul fast float %5, %3
  %11 = fmul fast float %6, %4
  %12 = fsub fast float %10, %11
  %13 = insertelement <2 x float> undef, float %12, i32 0
  %14 = insertelement <2 x float> %13, float %9, i32 1
  ret <2 x float> %14
}

define <2 x double> @complex_mul_f64(<2 x double>, <2 x double>) #0 {
; SSE-LABEL: complex_mul_f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT:    movapd %xmm1, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE-NEXT:    movapd %xmm3, %xmm4
; SSE-NEXT:    mulsd %xmm0, %xmm4
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    mulsd %xmm2, %xmm1
; SSE-NEXT:    addsd %xmm4, %xmm1
; SSE-NEXT:    mulsd %xmm2, %xmm3
; SSE-NEXT:    subsd %xmm3, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: complex_mul_f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX1-NEXT:    vmulsd %xmm0, %xmm3, %xmm4
; AVX1-NEXT:    vmulsd %xmm2, %xmm1, %xmm5
; AVX1-NEXT:    vaddsd %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmulsd %xmm2, %xmm3, %xmm1
; AVX1-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-NEXT:    retq
;
; FMA-LABEL: complex_mul_f64:
; FMA:       # %bb.0:
; FMA-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; FMA-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
; FMA-NEXT:    vmulsd %xmm2, %xmm1, %xmm4
; FMA-NEXT:    vfmadd231sd {{.*#+}} xmm4 = (xmm3 * xmm0) + xmm4
; FMA-NEXT:    vmulsd %xmm2, %xmm3, %xmm2
; FMA-NEXT:    vfmsub231sd {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
; FMA-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm4[0]
; FMA-NEXT:    retq
  %3 = extractelement <2 x double> %0, i32 0
  %4 = extractelement <2 x double> %0, i32 1
  %5 = extractelement <2 x double> %1, i32 0
  %6 = extractelement <2 x double> %1, i32 1
  %7 = fmul fast double %6, %3
  %8 = fmul fast double %5, %4
  %9 = fadd fast double %7, %8
  %10 = fmul fast double %5, %3
  %11 = fmul fast double %6, %4
  %12 = fsub fast double %10, %11
  %13 = insertelement <2 x double> undef, double %12, i32 0
  %14 = insertelement <2 x double> %13, double %9, i32 1
  ret <2 x double> %14
}

attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "unsafe-fp-math"="true" }