; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -check-prefixes=NOFMA
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck %s -check-prefixes=FMA3,FMA3_256
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck %s -check-prefixes=FMA3,FMA3_512
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck %s -check-prefixes=FMA4

; This test checks the fusing of MUL + ADDSUB to FMADDSUB.

define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; NOFMA-LABEL: mul_addsub_pd128:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vaddsubpd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_addsub_pd128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %AB, %C
  %Add = fadd <2 x double> %AB, %C
  %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %Addsub
}

define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; NOFMA-LABEL: mul_addsub_ps128:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vaddsubps %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_addsub_ps128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x float> %A, %B
  %Sub = fsub <4 x float> %AB, %C
  %Add = fadd <4 x float> %AB, %C
  %Addsub = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %Addsub
}

define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; NOFMA-LABEL: mul_addsub_pd256:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_addsub_pd256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x double> %A, %B
  %Sub = fsub <4 x double> %AB, %C
  %Add = fadd <4 x double> %AB, %C
  %Addsub = shufflevector <4 x double> %Sub, <4 x double> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %Addsub
}

define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; NOFMA-LABEL: mul_addsub_ps256:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_addsub_ps256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x float> %A, %B
  %Sub = fsub <8 x float> %AB, %C
  %Add = fadd <8 x float> %AB, %C
  %Addsub = shufflevector <8 x float> %Sub, <8 x float> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x float> %Addsub
}

define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; NOFMA-LABEL: mul_addsub_pd512:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubpd %ymm4, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubpd %ymm5, %ymm1, %ymm1
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: mul_addsub_pd512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_addsub_pd512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x double> %A, %B
  %Sub = fsub <8 x double> %AB, %C
  %Add = fadd <8 x double> %AB, %C
  %Addsub = shufflevector <8 x double> %Sub, <8 x double> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %Addsub
}

define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; NOFMA-LABEL: mul_addsub_ps512:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubps %ymm4, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubps %ymm5, %ymm1, %ymm1
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: mul_addsub_ps512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_addsub_ps512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
entry:
  %AB = fmul <16 x float> %A, %B
  %Sub = fsub <16 x float> %AB, %C
  %Add = fadd <16 x float> %AB, %C
  %Addsub = shufflevector <16 x float> %Sub, <16 x float> %Add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %Addsub
}

define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_ps128:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vaddsubps %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_addsub_ps128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x float> %C, %D
  %A0 = extractelement <4 x float> %A, i32 0
  %B0 = extractelement <4 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <4 x float> %A, i32 2
  %B2 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A1 = extractelement <4 x float> %A, i32 1
  %B1 = extractelement <4 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <4 x float> %A, i32 3
  %B3 = extractelement <4 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
  ret <4 x float> %vecinsert4
}

define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_pd128:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vaddsubpd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_addsub_pd128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <2 x double> %C, %D
  %A0 = extractelement <2 x double> %A, i32 0
  %B0 = extractelement <2 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A1 = extractelement <2 x double> %A, i32 1
  %B1 = extractelement <2 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_ps256:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_addsub_ps256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x float> %C, %D
  %A0 = extractelement <8 x float> %A, i32 0
  %B0 = extractelement <8 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <8 x float> %A, i32 2
  %B2 = extractelement <8 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A4 = extractelement <8 x float> %A, i32 4
  %B4 = extractelement <8 x float> %B, i32 4
  %sub4 = fsub float %A4, %B4
  %A6 = extractelement <8 x float> %A, i32 6
  %B6 = extractelement <8 x float> %B, i32 6
  %sub6 = fsub float %A6, %B6
  %A1 = extractelement <8 x float> %A, i32 1
  %B1 = extractelement <8 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <8 x float> %A, i32 3
  %B3 = extractelement <8 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %A5 = extractelement <8 x float> %A, i32 5
  %B5 = extractelement <8 x float> %B, i32 5
  %add5 = fadd float %A5, %B5
  %A7 = extractelement <8 x float> %A, i32 7
  %B7 = extractelement <8 x float> %B, i32 7
  %add7 = fadd float %A7, %B7
  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
  ret <8 x float> %vecinsert8
}

define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_pd256:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_addsub_pd256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x double> %C, %D
  %A0 = extractelement <4 x double> %A, i32 0
  %B0 = extractelement <4 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A2 = extractelement <4 x double> %A, i32 2
  %B2 = extractelement <4 x double> %B, i32 2
  %sub2 = fsub double %A2, %B2
  %A1 = extractelement <4 x double> %A, i32 1
  %B1 = extractelement <4 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %A3 = extractelement <4 x double> %A, i32 3
  %B3 = extractelement <4 x double> %B, i32 3
  %add3 = fadd double %A3, %B3
  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
  ret <4 x double> %vecinsert4
}

define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_ps512:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubps %ymm4, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubps %ymm5, %ymm1, %ymm1
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: buildvector_mul_addsub_ps512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_addsub_ps512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
bb:
  %A = fmul <16 x float> %C, %D
  %A0 = extractelement <16 x float> %A, i32 0
  %B0 = extractelement <16 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <16 x float> %A, i32 2
  %B2 = extractelement <16 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A4 = extractelement <16 x float> %A, i32 4
  %B4 = extractelement <16 x float> %B, i32 4
  %sub4 = fsub float %A4, %B4
  %A6 = extractelement <16 x float> %A, i32 6
  %B6 = extractelement <16 x float> %B, i32 6
  %sub6 = fsub float %A6, %B6
  %A8 = extractelement <16 x float> %A, i32 8
  %B8 = extractelement <16 x float> %B, i32 8
  %sub8 = fsub float %A8, %B8
  %A10 = extractelement <16 x float> %A, i32 10
  %B10 = extractelement <16 x float> %B, i32 10
  %sub10 = fsub float %A10, %B10
  %A12 = extractelement <16 x float> %A, i32 12
  %B12 = extractelement <16 x float> %B, i32 12
  %sub12 = fsub float %A12, %B12
  %A14 = extractelement <16 x float> %A, i32 14
  %B14 = extractelement <16 x float> %B, i32 14
  %sub14 = fsub float %A14, %B14
  %A1 = extractelement <16 x float> %A, i32 1
  %B1 = extractelement <16 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <16 x float> %A, i32 3
  %B3 = extractelement <16 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %A5 = extractelement <16 x float> %A, i32 5
  %B5 = extractelement <16 x float> %B, i32 5
  %add5 = fadd float %A5, %B5
  %A7 = extractelement <16 x float> %A, i32 7
  %B7 = extractelement <16 x float> %B, i32 7
  %add7 = fadd float %A7, %B7
  %A9 = extractelement <16 x float> %A, i32 9
  %B9 = extractelement <16 x float> %B, i32 9
  %add9 = fadd float %A9, %B9
  %A11 = extractelement <16 x float> %A, i32 11
  %B11 = extractelement <16 x float> %B, i32 11
  %add11 = fadd float %A11, %B11
  %A13 = extractelement <16 x float> %A, i32 13
  %B13 = extractelement <16 x float> %B, i32 13
  %add13 = fadd float %A13, %B13
  %A15 = extractelement <16 x float> %A, i32 15
  %B15 = extractelement <16 x float> %B, i32 15
  %add15 = fadd float %A15, %B15
  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
  ; element 12 is undef
  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
  ret <16 x float> %vecinsert16
}

define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_pd512:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubpd %ymm4, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubpd %ymm5, %ymm1, %ymm1
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: buildvector_mul_addsub_pd512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_addsub_pd512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x double> %C, %D
  %A0 = extractelement <8 x double> %A, i32 0
  %B0 = extractelement <8 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A2 = extractelement <8 x double> %A, i32 2
  %B2 = extractelement <8 x double> %B, i32 2
  %sub2 = fsub double %A2, %B2
  %A4 = extractelement <8 x double> %A, i32 4
  %B4 = extractelement <8 x double> %B, i32 4
  %sub4 = fsub double %A4, %B4
  %A6 = extractelement <8 x double> %A, i32 6
  %B6 = extractelement <8 x double> %B, i32 6
  %sub6 = fsub double %A6, %B6
  %A1 = extractelement <8 x double> %A, i32 1
  %B1 = extractelement <8 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %A3 = extractelement <8 x double> %A, i32 3
  %B3 = extractelement <8 x double> %B, i32 3
  %add3 = fadd double %A3, %B3
  %A7 = extractelement <8 x double> %A, i32 7
  %B7 = extractelement <8 x double> %B, i32 7
  %add7 = fadd double %A7, %B7
  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
  ret <8 x double> %vecinsert8
}

define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_ps128:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vaddss %xmm2, %xmm0, %xmm1
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; NOFMA-NEXT:    vaddss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
; NOFMA-NEXT:    vsubss %xmm5, %xmm4, %xmm4
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3]
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    vsubss %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_subadd_ps128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x float> %C, %D
  %A0 = extractelement <4 x float> %A, i32 0
  %B0 = extractelement <4 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <4 x float> %A, i32 2
  %B2 = extractelement <4 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A1 = extractelement <4 x float> %A, i32 1
  %B1 = extractelement <4 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <4 x float> %A, i32 3
  %B3 = extractelement <4 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
  ret <4 x float> %vecinsert4
}

define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_pd128:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vaddsd %xmm2, %xmm0, %xmm1
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; NOFMA-NEXT:    vsubsd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_subadd_pd128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <2 x double> %C, %D
  %A0 = extractelement <2 x double> %A, i32 0
  %B0 = extractelement <2 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A1 = extractelement <2 x double> %A, i32 1
  %B1 = extractelement <2 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_ps256:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vaddss %xmm2, %xmm0, %xmm1
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; NOFMA-NEXT:    vaddss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT:    vextractf128 $1, %ymm0, %xmm4
; NOFMA-NEXT:    vextractf128 $1, %ymm2, %xmm5
; NOFMA-NEXT:    vaddss %xmm5, %xmm4, %xmm6
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm7 = xmm4[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm8 = xmm5[1,0]
; NOFMA-NEXT:    vaddss %xmm7, %xmm8, %xmm7
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm8 = xmm0[1,1,3,3]
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm9 = xmm2[1,1,3,3]
; NOFMA-NEXT:    vsubss %xmm9, %xmm8, %xmm8
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[2,3]
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    vsubss %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm5[1,1,3,3]
; NOFMA-NEXT:    vsubss %xmm2, %xmm1, %xmm1
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[2,3]
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm2 = xmm4[3,3,3,3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm3 = xmm5[3,3,3,3]
; NOFMA-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; NOFMA-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_subadd_ps256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x float> %C, %D
  %A0 = extractelement <8 x float> %A, i32 0
  %B0 = extractelement <8 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <8 x float> %A, i32 2
  %B2 = extractelement <8 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A4 = extractelement <8 x float> %A, i32 4
  %B4 = extractelement <8 x float> %B, i32 4
  %sub4 = fadd float %A4, %B4
  %A6 = extractelement <8 x float> %A, i32 6
  %B6 = extractelement <8 x float> %B, i32 6
  %sub6 = fadd float %A6, %B6
  %A1 = extractelement <8 x float> %A, i32 1
  %B1 = extractelement <8 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <8 x float> %A, i32 3
  %B3 = extractelement <8 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %A5 = extractelement <8 x float> %A, i32 5
  %B5 = extractelement <8 x float> %B, i32 5
  %add5 = fsub float %A5, %B5
  %A7 = extractelement <8 x float> %A, i32 7
  %B7 = extractelement <8 x float> %B, i32 7
  %add7 = fsub float %A7, %B7
  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
  ret <8 x float> %vecinsert8
}

define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_pd256:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsd %xmm2, %xmm0, %xmm1
; NOFMA-NEXT:    vextractf128 $1, %ymm0, %xmm3
; NOFMA-NEXT:    vextractf128 $1, %ymm2, %xmm4
; NOFMA-NEXT:    vaddsd %xmm4, %xmm3, %xmm5
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; NOFMA-NEXT:    vsubsd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm2 = xmm4[1,0]
; NOFMA-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
; NOFMA-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm5[0],xmm1[0]
; NOFMA-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_subadd_pd256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x double> %C, %D
  %A0 = extractelement <4 x double> %A, i32 0
  %B0 = extractelement <4 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A2 = extractelement <4 x double> %A, i32 2
  %B2 = extractelement <4 x double> %B, i32 2
  %sub2 = fadd double %A2, %B2
  %A1 = extractelement <4 x double> %A, i32 1
  %B1 = extractelement <4 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %A3 = extractelement <4 x double> %A, i32 3
  %B3 = extractelement <4 x double> %B, i32 3
  %add3 = fsub double %A3, %B3
  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
  ret <4 x double> %vecinsert4
}

692define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 { 693; NOFMA-LABEL: buildvector_mul_subadd_ps512: 694; NOFMA: # %bb.0: # %bb 695; NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 696; NOFMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 697; NOFMA-NEXT: vaddss %xmm4, %xmm0, %xmm2 698; NOFMA-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] 699; NOFMA-NEXT: vshufpd {{.*#+}} xmm6 = xmm4[1,0] 700; NOFMA-NEXT: 
vaddss %xmm6, %xmm3, %xmm3 701; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm6 702; NOFMA-NEXT: vextractf128 $1, %ymm4, %xmm7 703; NOFMA-NEXT: vaddss %xmm7, %xmm6, %xmm8 704; NOFMA-NEXT: vshufpd {{.*#+}} xmm9 = xmm6[1,0] 705; NOFMA-NEXT: vshufpd {{.*#+}} xmm10 = xmm7[1,0] 706; NOFMA-NEXT: vaddss %xmm10, %xmm9, %xmm9 707; NOFMA-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1],xmm9[0],xmm8[3] 708; NOFMA-NEXT: vaddss %xmm5, %xmm1, %xmm9 709; NOFMA-NEXT: vshufpd {{.*#+}} xmm10 = xmm1[1,0] 710; NOFMA-NEXT: vshufpd {{.*#+}} xmm11 = xmm5[1,0] 711; NOFMA-NEXT: vaddss %xmm11, %xmm10, %xmm10 712; NOFMA-NEXT: vextractf128 $1, %ymm1, %xmm11 713; NOFMA-NEXT: vshufpd {{.*#+}} xmm12 = xmm11[1,0] 714; NOFMA-NEXT: vextractf128 $1, %ymm5, %xmm13 715; NOFMA-NEXT: vshufpd {{.*#+}} xmm14 = xmm13[1,0] 716; NOFMA-NEXT: vaddss %xmm14, %xmm12, %xmm12 717; NOFMA-NEXT: vmovshdup {{.*#+}} xmm14 = xmm0[1,1,3,3] 718; NOFMA-NEXT: vmovshdup {{.*#+}} xmm15 = xmm4[1,1,3,3] 719; NOFMA-NEXT: vsubss %xmm15, %xmm14, %xmm14 720; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[2,3] 721; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] 722; NOFMA-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 723; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,3,3,3] 724; NOFMA-NEXT: vsubss %xmm3, %xmm0, %xmm0 725; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] 726; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm6[3,3,3,3] 727; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,3,3,3] 728; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2 729; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[0] 730; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 731; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm5[1,1,3,3] 732; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3 733; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[2,3] 734; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm10[0],xmm3[3] 735; NOFMA-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 736; NOFMA-NEXT: vshufps {{.*#+}} xmm4 = xmm5[3,3,3,3] 737; NOFMA-NEXT: vsubss %xmm4, 
%xmm1, %xmm1 738; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] 739; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm11[1,1,3,3] 740; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm13[1,1,3,3] 741; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3 742; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm12[0,0] 743; NOFMA-NEXT: vshufps {{.*#+}} xmm4 = xmm11[3,3,3,3] 744; NOFMA-NEXT: vshufps {{.*#+}} xmm5 = xmm13[3,3,3,3] 745; NOFMA-NEXT: vsubss %xmm5, %xmm4, %xmm4 746; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] 747; NOFMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 748; NOFMA-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 749; NOFMA-NEXT: retq 750; 751; FMA3_256-LABEL: buildvector_mul_subadd_ps512: 752; FMA3_256: # %bb.0: # %bb 753; FMA3_256-NEXT: vfmsubadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4 754; FMA3_256-NEXT: vfmsubadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5 755; FMA3_256-NEXT: retq 756; 757; FMA3_512-LABEL: buildvector_mul_subadd_ps512: 758; FMA3_512: # %bb.0: # %bb 759; FMA3_512-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2 760; FMA3_512-NEXT: retq 761; 762; FMA4-LABEL: buildvector_mul_subadd_ps512: 763; FMA4: # %bb.0: # %bb 764; FMA4-NEXT: vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4 765; FMA4-NEXT: vfmsubaddps {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5 766; FMA4-NEXT: retq 767bb: 768 %A = fmul <16 x float> %C, %D 769 %A0 = extractelement <16 x float> %A, i32 0 770 %B0 = extractelement <16 x float> %B, i32 0 771 %sub0 = fadd float %A0, %B0 772 %A2 = extractelement <16 x float> %A, i32 2 773 %B2 = extractelement <16 x float> %B, i32 2 774 %sub2 = fadd float %A2, %B2 775 %A4 = extractelement <16 x float> %A, i32 4 776 %B4 = extractelement <16 x float> %B, i32 4 777 %sub4 = fadd float %A4, %B4 778 %A6 = extractelement <16 x float> %A, i32 6 779 %B6 = extractelement <16 x float> %B, i32 6 780 %sub6 = fadd float %A6, %B6 781 %A8 = extractelement <16 x float> %A, i32 8 782 %B8 = extractelement <16 x float> %B, i32 8 783 %sub8 = fadd float %A8, 
%B8 784 %A10 = extractelement <16 x float> %A, i32 10 785 %B10 = extractelement <16 x float> %B, i32 10 786 %sub10 = fadd float %A10, %B10 787 %A12 = extractelement <16 x float> %A, i32 12 788 %B12 = extractelement <16 x float> %B, i32 12 789 %sub12 = fadd float %A12, %B12 790 %A14 = extractelement <16 x float> %A, i32 14 791 %B14 = extractelement <16 x float> %B, i32 14 792 %sub14 = fadd float %A14, %B14 793 %A1 = extractelement <16 x float> %A, i32 1 794 %B1 = extractelement <16 x float> %B, i32 1 795 %add1 = fsub float %A1, %B1 796 %A3 = extractelement <16 x float> %A, i32 3 797 %B3 = extractelement <16 x float> %B, i32 3 798 %add3 = fsub float %A3, %B3 799 %A5 = extractelement <16 x float> %A, i32 5 800 %B5 = extractelement <16 x float> %B, i32 5 801 %add5 = fsub float %A5, %B5 802 %A7 = extractelement <16 x float> %A, i32 7 803 %B7 = extractelement <16 x float> %B, i32 7 804 %add7 = fsub float %A7, %B7 805 %A9 = extractelement <16 x float> %A, i32 9 806 %B9 = extractelement <16 x float> %B, i32 9 807 %add9 = fsub float %A9, %B9 808 %A11 = extractelement <16 x float> %A, i32 11 809 %B11 = extractelement <16 x float> %B, i32 11 810 %add11 = fsub float %A11, %B11 811 %A13 = extractelement <16 x float> %A, i32 13 812 %B13 = extractelement <16 x float> %B, i32 13 813 %add13 = fsub float %A13, %B13 814 %A15 = extractelement <16 x float> %A, i32 15 815 %B15 = extractelement <16 x float> %B, i32 15 816 %add15 = fsub float %A15, %B15 817 %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0 818 %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1 819 %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2 820 %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3 821 %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4 822 ; element 5 is undef 823 %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6 824 %vecinsert8 = insertelement <16 x float> %vecinsert7, float 
%add7, i32 7
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
  ; element 12 is undef
  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
  ret <16 x float> %vecinsert16
}

; Build-vector form of the SUBADD pattern for <8 x double>: A = C*D, then even
; lanes (0,2,4,6) hold fadd results and odd lanes (1,3,7) hold fsub results,
; with lane 5 left undef. With FMA available this should fuse into VFMSUBADD
; (shown as "-/+" in the disassembly comments below).
; NOTE(review): the IR value names are inverted relative to the operations —
; the %sub* values are produced by fadd and the %add* values by fsub.
define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_pd512:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsd %xmm4, %xmm0, %xmm2
; NOFMA-NEXT:    vextractf128 $1, %ymm0, %xmm3
; NOFMA-NEXT:    vextractf128 $1, %ymm4, %xmm6
; NOFMA-NEXT:    vaddsd %xmm6, %xmm3, %xmm7
; NOFMA-NEXT:    vaddsd %xmm5, %xmm1, %xmm8
; NOFMA-NEXT:    vextractf128 $1, %ymm1, %xmm1
; NOFMA-NEXT:    vextractf128 $1, %ymm5, %xmm5
; NOFMA-NEXT:    vaddsd %xmm5, %xmm1, %xmm9
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm4 = xmm4[1,0]
; NOFMA-NEXT:    vsubsd %xmm4, %xmm0, %xmm0
; NOFMA-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm2 = xmm3[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm3 = xmm6[1,0]
; NOFMA-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
; NOFMA-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm7[0],xmm2[0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm3 = xmm5[1,0]
; NOFMA-NEXT:    vsubsd %xmm3, %xmm1, %xmm1
; NOFMA-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm9[0],xmm1[0]
; NOFMA-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NOFMA-NEXT:    vinsertf128 $1, %xmm1, %ymm8, %ymm1
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: buildvector_mul_subadd_pd512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_subadd_pd512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x double> %C, %D
  ; fadd results go to even lanes of the build-vector.
  %A0 = extractelement <8 x double> %A, i32 0
  %B0 = extractelement <8 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A2 = extractelement <8 x double> %A, i32 2
  %B2 = extractelement <8 x double> %B, i32 2
  %sub2 = fadd double %A2, %B2
  %A4 = extractelement <8 x double> %A, i32 4
  %B4 = extractelement <8 x double> %B, i32 4
  %sub4 = fadd double %A4, %B4
  %A6 = extractelement <8 x double> %A, i32 6
  %B6 = extractelement <8 x double> %B, i32 6
  %sub6 = fadd double %A6, %B6
  ; fsub results go to odd lanes (lane 5 intentionally left undef).
  %A1 = extractelement <8 x double> %A, i32 1
  %B1 = extractelement <8 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %A3 = extractelement <8 x double> %A, i32 3
  %B3 = extractelement <8 x double> %B, i32 3
  %add3 = fsub double %A3, %B3
  %A7 = extractelement <8 x double> %A, i32 7
  %B7 = extractelement <8 x double> %B, i32 7
  %add7 = fsub double %A7, %B7
  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
  ret <8 x double> %vecinsert8
}

attributes #0 = { nounwind "unsafe-fp-math"="true" }