; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -check-prefixes=CHECK,NOFMA
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck %s -check-prefixes=CHECK,FMA3,FMA3_256
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck %s -check-prefixes=CHECK,FMA3,FMA3_512
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck %s -check-prefixes=CHECK,FMA4

; This test checks the fusing of MUL + SUB/ADD to FMSUBADD.
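;
; Rough sketch of the IR shape every positive test below shares (a description of
; the test inputs, not of the combine's full matching rules):
;
;   %AB     = fmul %A, %B
;   %Sub    = fsub %AB, %C
;   %Add    = fadd %AB, %C
;   %subadd = shufflevector %Add, %Sub, <even lanes from %Add, odd lanes from %Sub>
;
; Even-indexed lanes therefore compute A*B + C and odd-indexed lanes compute A*B - C,
; which the FMA and FMA4 run lines are expected to select as vfmsubadd* instructions,
; while the AVX-only NOFMA run keeps the separate mul/sub/add/blend sequence.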

define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; NOFMA-LABEL: mul_subadd_pd128:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vsubpd %xmm2, %xmm0, %xmm1
; NOFMA-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_subadd_pd128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_pd128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %AB, %C
  %Add = fadd <2 x double> %AB, %C
  %subadd = shufflevector <2 x double> %Add, <2 x double> %Sub, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %subadd
}

define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; NOFMA-LABEL: mul_subadd_ps128:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vsubps %xmm2, %xmm0, %xmm1
; NOFMA-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_subadd_ps128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_ps128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x float> %A, %B
  %Sub = fsub <4 x float> %AB, %C
  %Add = fadd <4 x float> %AB, %C
  %subadd = shufflevector <4 x float> %Add, <4 x float> %Sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %subadd
}

define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; NOFMA-LABEL: mul_subadd_pd256:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vsubpd %ymm2, %ymm0, %ymm1
; NOFMA-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_subadd_pd256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_pd256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x double> %A, %B
  %Sub = fsub <4 x double> %AB, %C
  %Add = fadd <4 x double> %AB, %C
  %subadd = shufflevector <4 x double> %Add, <4 x double> %Sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %subadd
}

define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; NOFMA-LABEL: mul_subadd_ps256:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vsubps %ymm2, %ymm0, %ymm1
; NOFMA-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_subadd_ps256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_ps256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x float> %A, %B
  %Sub = fsub <8 x float> %AB, %C
  %Add = fadd <8 x float> %AB, %C
  %subadd = shufflevector <8 x float> %Add, <8 x float> %Sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x float> %subadd
}

define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; NOFMA-LABEL: mul_subadd_pd512:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vsubpd %ymm5, %ymm1, %ymm2
; NOFMA-NEXT:    vsubpd %ymm4, %ymm0, %ymm3
; NOFMA-NEXT:    vaddpd %ymm5, %ymm1, %ymm1
; NOFMA-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
; NOFMA-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
; NOFMA-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: mul_subadd_pd512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_subadd_pd512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_pd512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x double> %A, %B
  %Sub = fsub <8 x double> %AB, %C
  %Add = fadd <8 x double> %AB, %C
  %subadd = shufflevector <8 x double> %Add, <8 x double> %Sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %subadd
}

define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; NOFMA-LABEL: mul_subadd_ps512:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vsubps %ymm5, %ymm1, %ymm2
; NOFMA-NEXT:    vsubps %ymm4, %ymm0, %ymm3
; NOFMA-NEXT:    vaddps %ymm5, %ymm1, %ymm1
; NOFMA-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; NOFMA-NEXT:    vaddps %ymm4, %ymm0, %ymm0
; NOFMA-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: mul_subadd_ps512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_subadd_ps512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_ps512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT:    retq
entry:
  %AB = fmul <16 x float> %A, %B
  %Sub = fsub <16 x float> %AB, %C
  %Add = fadd <16 x float> %AB, %C
  %subadd = shufflevector <16 x float> %Add, <16 x float> %Sub, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %subadd
}

; This should not be matched to fmsubadd because the mul is on the wrong side of the fsub.
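; Here %Sub computes C - A*B rather than A*B - C; FMSUBADD has no form for that, so the
; separate mul/sub/add/blend sequence is expected under every run configuration.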
define <2 x double> @mul_subadd_bad_commute(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; CHECK-LABEL: mul_subadd_bad_commute:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vsubpd %xmm0, %xmm2, %xmm1
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; CHECK-NEXT:    retq
entry:
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %C, %AB
  %Add = fadd <2 x double> %AB, %C
  %subadd = shufflevector <2 x double> %Add, <2 x double> %Sub, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %subadd
}

attributes #0 = { nounwind "unsafe-fp-math"="true" }