; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast --enable-no-signed-zeros-fp-math -mattr=avx512fp16 | FileCheck %s --check-prefixes=CHECK,NO-SZ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast -mattr=avx512fp16 | FileCheck %s --check-prefixes=CHECK,HAS-SZ

; FADD(acc, FMA(a, b, +0.0)) can be combined to FMA(a, b, acc) if the nsz flag set.
define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; NO-SZ-LABEL: test1:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test1:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm3
; HAS-SZ-NEXT:    vaddph %zmm0, %zmm3, %zmm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <32 x half> %a to <16 x float>
  %1 = bitcast <32 x half> %b to <16 x float>
  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
  %3 = bitcast <16 x float> %2 to <32 x half>
  %add.i = fadd <32 x half> %3, %acc
  ret <32 x half> %add.i
}

define dso_local <32 x half> @test2(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; NO-SZ-LABEL: test2:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test2:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm3
; HAS-SZ-NEXT:    vaddph %zmm0, %zmm3, %zmm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <32 x half> %a to <16 x float>
  %1 = bitcast <32 x half> %b to <16 x float>
  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
  %3 = bitcast <16 x float> %2 to <32 x half>
  %add.i = fadd <32 x half> %3, %acc
  ret <32 x half> %add.i
}

define dso_local <16 x half> @test3(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; NO-SZ-LABEL: test3:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test3:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm3
; HAS-SZ-NEXT:    vaddph %ymm0, %ymm3, %ymm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <16 x half> %a to <8 x float>
  %1 = bitcast <16 x half> %b to <8 x float>
  %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1)
  %3 = bitcast <8 x float> %2 to <16 x half>
  %add.i = fadd <16 x half> %3, %acc
  ret <16 x half> %add.i
}

define dso_local <16 x half> @test4(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; NO-SZ-LABEL: test4:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test4:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm3
; HAS-SZ-NEXT:    vaddph %ymm0, %ymm3, %ymm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <16 x half> %a to <8 x float>
  %1 = bitcast <16 x half> %b to <8 x float>
  %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1)
  %3 = bitcast <8 x float> %2 to <16 x half>
  %add.i = fadd <16 x half> %3, %acc
  ret <16 x half> %add.i
}

define dso_local <8 x half> @test5(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; NO-SZ-LABEL: test5:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test5:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm3
; HAS-SZ-NEXT:    vaddph %xmm0, %xmm3, %xmm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %a to <4 x float>
  %1 = bitcast <8 x half> %b to <4 x float>
  %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1)
  %3 = bitcast <4 x float> %2 to <8 x half>
  %add.i = fadd <8 x half> %3, %acc
  ret <8 x half> %add.i
}

define dso_local <8 x half> @test6(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; NO-SZ-LABEL: test6:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test6:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm3
; HAS-SZ-NEXT:    vaddph %xmm0, %xmm3, %xmm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %a to <4 x float>
  %1 = bitcast <8 x half> %b to <4 x float>
  %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1)
  %3 = bitcast <4 x float> %2 to <8 x half>
  %add.i = fadd <8 x half> %3, %acc
  ret <8 x half> %add.i
}

; FADD(acc, FMA(a, b, -0.0)) can be combined to FMA(a, b, acc) no matter if the nsz flag set.
define dso_local <32 x half> @test13(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; CHECK-LABEL: test13:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <32 x half> %a to <16 x float>
  %1 = bitcast <32 x half> %b to <16 x float>
  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
  %3 = bitcast <16 x float> %2 to <32 x half>
  %add.i = fadd <32 x half> %3, %acc
  ret <32 x half> %add.i
}

define dso_local <32 x half> @test14(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; CHECK-LABEL: test14:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <32 x half> %a to <16 x float>
  %1 = bitcast <32 x half> %b to <16 x float>
  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
  %3 = bitcast <16 x float> %2 to <32 x half>
  %add.i = fadd <32 x half> %3, %acc
  ret <32 x half> %add.i
}

define dso_local <16 x half> @test15(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; CHECK-LABEL: test15:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <16 x half> %a to <8 x float>
  %1 = bitcast <16 x half> %b to <8 x float>
  %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %3 = bitcast <8 x float> %2 to <16 x half>
  %add.i = fadd <16 x half> %3, %acc
  ret <16 x half> %add.i
}

define dso_local <16 x half> @test16(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; CHECK-LABEL: test16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <16 x half> %a to <8 x float>
  %1 = bitcast <16 x half> %b to <8 x float>
  %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %3 = bitcast <8 x float> %2 to <16 x half>
  %add.i = fadd <16 x half> %3, %acc
  ret <16 x half> %add.i
}

define dso_local <8 x half> @test17(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: test17:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %a to <4 x float>
  %1 = bitcast <8 x half> %b to <4 x float>
  %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %3 = bitcast <4 x float> %2 to <8 x half>
  %add.i = fadd <8 x half> %3, %acc
  ret <8 x half> %add.i
}

define dso_local <8 x half> @test18(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: test18:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %a to <4 x float>
  %1 = bitcast <8 x half> %b to <4 x float>
  %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %3 = bitcast <4 x float> %2 to <8 x half>
  %add.i = fadd <8 x half> %3, %acc
  ret <8 x half> %add.i
}

declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)