; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512fp16 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s

; Incremental updates of the instruction depths should be enough for this test
; case.
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512fp16 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s

; Verify that the first two adds are independent regardless of how the inputs
; are commuted. The destination registers are used as source registers for the
; third add.

define half @reassociate_adds1(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm3, %xmm2, %xmm1
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %t0, %x2
  %t2 = fadd reassoc nsz half %t1, %x3
  ret half %t2
}

define half @reassociate_adds2(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm3, %xmm2, %xmm1
; CHECK-NEXT:    vaddsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %x2, %t0
  %t2 = fadd reassoc nsz half %t1, %x3
  ret half %t2
}

define half @reassociate_adds3(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %t0, %x2
  %t2 = fadd reassoc nsz half %x3, %t1
  ret half %t2
}

define half @reassociate_adds4(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %x2, %t0
  %t2 = fadd reassoc nsz half %x3, %t1
  ret half %t2
}

; Verify that we reassociate some of these ops. The optimal balanced tree of
; adds is not produced because that would cost more compile time.

define half @reassociate_adds5(half %x0, half %x1, half %x2, half %x3, half %x4, half %x5, half %x6, half %x7) {
; CHECK-LABEL: reassociate_adds5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm3, %xmm2, %xmm1
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm5, %xmm4, %xmm1
; CHECK-NEXT:    vaddsh %xmm6, %xmm1, %xmm1
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm7, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %t0, %x2
  %t2 = fadd reassoc nsz half %t1, %x3
  %t3 = fadd reassoc nsz half %t2, %x4
  %t4 = fadd reassoc nsz half %t3, %x5
  %t5 = fadd reassoc nsz half %t4, %x6
  %t6 = fadd reassoc nsz half %t5, %x7
  ret half %t6
}
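; A sketch of the rebalancing the machine combiner performs on these chains
; (the value names below are illustrative, not part of this test): a serial
; chain such as
;   %s0 = fadd reassoc nsz half %x0, %x1
;   %s1 = fadd reassoc nsz half %s0, %x2
;   %s2 = fadd reassoc nsz half %s1, %x3
; is rewritten as two independent adds feeding a final add:
;   %a  = fadd reassoc nsz half %x0, %x1
;   %b  = fadd reassoc nsz half %x2, %x3
;   %s2 = fadd reassoc nsz half %a, %b
; cutting the dependent-add depth from 3 to 2. For the 7-add chain in
; reassociate_adds5 above, the CHECK lines show the depth dropping from 7 to
; 4 rather than to the optimal 3, matching the compile-time trade-off noted
; in the comment before it.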
; Verify that we only need two associative operations to reassociate the
; operands. Also, we should reassociate such that the result of the high
; latency division is used by the final 'add' rather than reassociating the
; %x3 operand with the division. The latter reassociation would not improve
; anything.

define half @reassociate_adds6(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %x2, %t0
  %t2 = fadd reassoc nsz half %x3, %t1
  ret half %t2
}

; Verify that scalar half-precision multiplies are reassociated.

define half @reassociate_muls1(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_muls1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmulsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vmulsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz half %x0, %x1
  %t1 = fmul reassoc nsz half %x2, %t0
  %t2 = fmul reassoc nsz half %x3, %t1
  ret half %t2
}

; Verify that 128-bit vector half-precision adds are reassociated.

define <8 x half> @reassociate_adds_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
; CHECK-LABEL: reassociate_adds_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivph %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddph %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddph %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz <8 x half> %x0, %x1
  %t1 = fadd reassoc nsz <8 x half> %x2, %t0
  %t2 = fadd reassoc nsz <8 x half> %x3, %t1
  ret <8 x half> %t2
}

; Verify that 128-bit vector half-precision multiplies are reassociated.

define <8 x half> @reassociate_muls_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
; CHECK-LABEL: reassociate_muls_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmulph %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vmulph %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz <8 x half> %x0, %x1
  %t1 = fmul reassoc nsz <8 x half> %x2, %t0
  %t2 = fmul reassoc nsz <8 x half> %x3, %t1
  ret <8 x half> %t2
}

; Verify that 256-bit vector half-precision adds are reassociated.

define <16 x half> @reassociate_adds_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) {
; CHECK-LABEL: reassociate_adds_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivph %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vaddph %ymm2, %ymm3, %ymm1
; CHECK-NEXT:    vaddph %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz <16 x half> %x0, %x1
  %t1 = fadd reassoc nsz <16 x half> %x2, %t0
  %t2 = fadd reassoc nsz <16 x half> %x3, %t1
  ret <16 x half> %t2
}
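; The divide-first tests above all check the same latency-aware decision. A
; rough cost sketch (the cycle counts are illustrative, not taken from any
; particular scheduling model): if a divide takes ~15 cycles and an add ~4,
; then evaluating (((x0 / x1) + x2) + x3) in order serializes to
; ~15 + 4 + 4 = 23 cycles, whereas (x3 + x2) can execute in the divide's
; shadow, so ((x3 + x2) + (x0 / x1)) completes in roughly 15 + 4 = 19.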
; Verify that 256-bit vector half-precision multiplies are reassociated.

define <16 x half> @reassociate_muls_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) {
; CHECK-LABEL: reassociate_muls_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vmulph %ymm2, %ymm3, %ymm1
; CHECK-NEXT:    vmulph %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz <16 x half> %x0, %x1
  %t1 = fmul reassoc nsz <16 x half> %x2, %t0
  %t2 = fmul reassoc nsz <16 x half> %x3, %t1
  ret <16 x half> %t2
}

; Verify that AVX512 512-bit vector half-precision adds are reassociated.

define <32 x half> @reassociate_adds_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) {
; CHECK-LABEL: reassociate_adds_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivph %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddph %zmm2, %zmm3, %zmm1
; CHECK-NEXT:    vaddph %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz <32 x half> %x0, %x1
  %t1 = fadd reassoc nsz <32 x half> %x2, %t0
  %t2 = fadd reassoc nsz <32 x half> %x3, %t1
  ret <32 x half> %t2
}

; Verify that AVX512 512-bit vector half-precision multiplies are reassociated.

define <32 x half> @reassociate_muls_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) {
; CHECK-LABEL: reassociate_muls_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vmulph %zmm2, %zmm3, %zmm1
; CHECK-NEXT:    vmulph %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz <32 x half> %x0, %x1
  %t1 = fmul reassoc nsz <32 x half> %x2, %t0
  %t2 = fmul reassoc nsz <32 x half> %x3, %t1
  ret <32 x half> %t2
}

; Verify that scalar half-precision minimum ops are reassociated.

define half @reassociate_mins_half(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_mins_half:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vminsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vminsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv half %x0, %x1
  %cmp1 = fcmp olt half %x2, %t0
  %sel1 = select i1 %cmp1, half %x2, half %t0
  %cmp2 = fcmp olt half %x3, %sel1
  %sel2 = select i1 %cmp2, half %x3, half %sel1
  ret half %sel2
}

; Verify that scalar half-precision maximum ops are reassociated.

define half @reassociate_maxs_half(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_maxs_half:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmaxsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vmaxsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv half %x0, %x1
  %cmp1 = fcmp ogt half %x2, %t0
  %sel1 = select i1 %cmp1, half %x2, half %t0
  %cmp2 = fcmp ogt half %x3, %sel1
  %sel2 = select i1 %cmp2, half %x3, half %sel1
  ret half %sel2
}
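; The min/max tests express the operation as a compare+select idiom rather
; than calling an intrinsic; instruction selection folds the pair into a
; single min/max instruction before the machine combiner runs, e.g.
; (illustrative, not part of this test):
;   %cmp = fcmp olt half %a, %b
;   %min = select i1 %cmp, half %a, half %b   ; folds to vminsh
; Note that the IR here carries no fast-math flags: it is the
; -enable-no-nans-fp-math and -enable-no-signed-zeros-fp-math flags on the
; RUN lines that make the operand order of vminsh/vmaxsh irrelevant and the
; ops legal to reassociate.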
; Verify that 128-bit vector half-precision minimum ops are reassociated.

define <8 x half> @reassociate_mins_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
; CHECK-LABEL: reassociate_mins_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vminph %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vminph %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd <8 x half> %x0, %x1
  %cmp1 = fcmp olt <8 x half> %x2, %t0
  %sel1 = select <8 x i1> %cmp1, <8 x half> %x2, <8 x half> %t0
  %cmp2 = fcmp olt <8 x half> %x3, %sel1
  %sel2 = select <8 x i1> %cmp2, <8 x half> %x3, <8 x half> %sel1
  ret <8 x half> %sel2
}

; Verify that 128-bit vector half-precision maximum ops are reassociated.

define <8 x half> @reassociate_maxs_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
; CHECK-LABEL: reassociate_maxs_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmaxph %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vmaxph %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd <8 x half> %x0, %x1
  %cmp1 = fcmp ogt <8 x half> %x2, %t0
  %sel1 = select <8 x i1> %cmp1, <8 x half> %x2, <8 x half> %t0
  %cmp2 = fcmp ogt <8 x half> %x3, %sel1
  %sel2 = select <8 x i1> %cmp2, <8 x half> %x3, <8 x half> %sel1
  ret <8 x half> %sel2
}

; Verify that 256-bit vector half-precision minimum ops are reassociated.

define <16 x half> @reassociate_mins_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) {
; CHECK-LABEL: reassociate_mins_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vminph %ymm2, %ymm3, %ymm1
; CHECK-NEXT:    vminph %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %t0 = fadd <16 x half> %x0, %x1
  %cmp1 = fcmp olt <16 x half> %x2, %t0
  %sel1 = select <16 x i1> %cmp1, <16 x half> %x2, <16 x half> %t0
  %cmp2 = fcmp olt <16 x half> %x3, %sel1
  %sel2 = select <16 x i1> %cmp2, <16 x half> %x3, <16 x half> %sel1
  ret <16 x half> %sel2
}

; Verify that 256-bit vector half-precision maximum ops are reassociated.

define <16 x half> @reassociate_maxs_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) {
; CHECK-LABEL: reassociate_maxs_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vmaxph %ymm2, %ymm3, %ymm1
; CHECK-NEXT:    vmaxph %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %t0 = fadd <16 x half> %x0, %x1
  %cmp1 = fcmp ogt <16 x half> %x2, %t0
  %sel1 = select <16 x i1> %cmp1, <16 x half> %x2, <16 x half> %t0
  %cmp2 = fcmp ogt <16 x half> %x3, %sel1
  %sel2 = select <16 x i1> %cmp2, <16 x half> %x3, <16 x half> %sel1
  ret <16 x half> %sel2
}
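; Unlike the scalar min/max tests, the vector variants lead with a plain
; vaddph rather than a divide, so they check the pure reassociation shape:
; the independent min/max of %x3 and %x2 is computed in parallel with the
; add, and the add's result is folded in by the final instruction. The same
; CHECK pattern repeating at xmm and ymm widths here, and at zmm width below,
; shows the combiner handles all three vector widths uniformly.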
; Verify that AVX512 512-bit vector half-precision minimum ops are reassociated.

define <32 x half> @reassociate_mins_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) {
; CHECK-LABEL: reassociate_mins_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vminph %zmm2, %zmm3, %zmm1
; CHECK-NEXT:    vminph %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %t0 = fadd <32 x half> %x0, %x1
  %cmp1 = fcmp olt <32 x half> %x2, %t0
  %sel1 = select <32 x i1> %cmp1, <32 x half> %x2, <32 x half> %t0
  %cmp2 = fcmp olt <32 x half> %x3, %sel1
  %sel2 = select <32 x i1> %cmp2, <32 x half> %x3, <32 x half> %sel1
  ret <32 x half> %sel2
}

; Verify that AVX512 512-bit vector half-precision maximum ops are reassociated.

define <32 x half> @reassociate_maxs_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) {
; CHECK-LABEL: reassociate_maxs_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vmaxph %zmm2, %zmm3, %zmm1
; CHECK-NEXT:    vmaxph %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %t0 = fadd <32 x half> %x0, %x1
  %cmp1 = fcmp ogt <32 x half> %x2, %t0
  %sel1 = select <32 x i1> %cmp1, <32 x half> %x2, <32 x half> %t0
  %cmp2 = fcmp ogt <32 x half> %x3, %sel1
  %sel2 = select <32 x i1> %cmp2, <32 x half> %x3, <32 x half> %sel1
  ret <32 x half> %sel2
}
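; If the IR in this file changes, the CHECK lines should be regenerated with
; the script named at the top rather than edited by hand (standard LLVM
; workflow; the test path is a placeholder, and llc must be on PATH or passed
; explicitly via --llc-binary):
;   llvm/utils/update_llc_test_checks.py <path-to-this-test>.ll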