//===- ExpandOps.cpp - Pass to legalize Arith ops for LLVM lowering --===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "mlir/Dialect/Arith/Transforms/Passes.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/IR/TypeUtilities.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir { namespace arith { #define GEN_PASS_DEF_ARITHEXPANDOPSPASS #include "mlir/Dialect/Arith/Transforms/Passes.h.inc" } // namespace arith } // namespace mlir using namespace mlir; /// Create an integer or index constant. static Value createConst(Location loc, Type type, int value, PatternRewriter &rewriter) { auto attr = rewriter.getIntegerAttr(getElementTypeOrSelf(type), value); if (auto shapedTy = dyn_cast(type)) { return rewriter.create( loc, DenseElementsAttr::get(shapedTy, attr)); } return rewriter.create(loc, attr); } namespace { /// Expands CeilDivUIOp (n, m) into /// n == 0 ? 0 : ((n-1) / m) + 1 struct CeilDivUIOpConverter : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(arith::CeilDivUIOp op, PatternRewriter &rewriter) const final { Location loc = op.getLoc(); Value a = op.getLhs(); Value b = op.getRhs(); Value zero = createConst(loc, a.getType(), 0, rewriter); Value compare = rewriter.create(loc, arith::CmpIPredicate::eq, a, zero); Value one = createConst(loc, a.getType(), 1, rewriter); Value minusOne = rewriter.create(loc, a, one); Value quotient = rewriter.create(loc, minusOne, b); Value plusOne = rewriter.create(loc, quotient, one); rewriter.replaceOpWithNewOp(op, compare, zero, plusOne); return success(); } }; /// Expands CeilDivSIOp (n, m) into /// 1) x = (m > 0) ? -1 : 1 /// 2) (n*m>0) ? ((n+x) / m) + 1 : - (-n / m) struct CeilDivSIOpConverter : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(arith::CeilDivSIOp op, PatternRewriter &rewriter) const final { Location loc = op.getLoc(); Type type = op.getType(); Value a = op.getLhs(); Value b = op.getRhs(); Value plusOne = createConst(loc, type, 1, rewriter); Value zero = createConst(loc, type, 0, rewriter); Value minusOne = createConst(loc, type, -1, rewriter); // Compute x = (b>0) ? -1 : 1. Value compare = rewriter.create(loc, arith::CmpIPredicate::sgt, b, zero); Value x = rewriter.create(loc, compare, minusOne, plusOne); // Compute positive res: 1 + ((x+a)/b). Value xPlusA = rewriter.create(loc, x, a); Value xPlusADivB = rewriter.create(loc, xPlusA, b); Value posRes = rewriter.create(loc, plusOne, xPlusADivB); // Compute negative res: - ((-a)/b). Value minusA = rewriter.create(loc, zero, a); Value minusADivB = rewriter.create(loc, minusA, b); Value negRes = rewriter.create(loc, zero, minusADivB); // Result is (a*b>0) ? pos result : neg result. // Note, we want to avoid using a*b because of possible overflow. // The case that matters are a>0, a==0, a<0, b>0 and b<0. We do // not particuliarly care if a*b<0 is true or false when b is zero // as this will result in an illegal divide. So `a*b<0` can be reformulated // as `(a<0 && b<0) || (a>0 && b>0)' or `(a<0 && b<0) || (a>0 && b>=0)'. // We pick the first expression here. Value aNeg = rewriter.create(loc, arith::CmpIPredicate::slt, a, zero); Value aPos = rewriter.create(loc, arith::CmpIPredicate::sgt, a, zero); Value bNeg = rewriter.create(loc, arith::CmpIPredicate::slt, b, zero); Value bPos = rewriter.create(loc, arith::CmpIPredicate::sgt, b, zero); Value firstTerm = rewriter.create(loc, aNeg, bNeg); Value secondTerm = rewriter.create(loc, aPos, bPos); Value compareRes = rewriter.create(loc, firstTerm, secondTerm); // Perform substitution and return success. rewriter.replaceOpWithNewOp(op, compareRes, posRes, negRes); return success(); } }; /// Expands FloorDivSIOp (x, y) into /// z = x / y /// if (z * y != x && (x < 0) != (y < 0)) { /// return z - 1; /// } else { /// return z; /// } struct FloorDivSIOpConverter : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(arith::FloorDivSIOp op, PatternRewriter &rewriter) const final { Location loc = op.getLoc(); Type type = op.getType(); Value a = op.getLhs(); Value b = op.getRhs(); Value quotient = rewriter.create(loc, a, b); Value product = rewriter.create(loc, quotient, b); Value notEqualDivisor = rewriter.create( loc, arith::CmpIPredicate::ne, a, product); Value zero = createConst(loc, type, 0, rewriter); Value aNeg = rewriter.create(loc, arith::CmpIPredicate::slt, a, zero); Value bNeg = rewriter.create(loc, arith::CmpIPredicate::slt, b, zero); Value signOpposite = rewriter.create( loc, arith::CmpIPredicate::ne, aNeg, bNeg); Value cond = rewriter.create(loc, notEqualDivisor, signOpposite); Value minusOne = createConst(loc, type, -1, rewriter); Value quotientMinusOne = rewriter.create(loc, quotient, minusOne); rewriter.replaceOpWithNewOp(op, cond, quotientMinusOne, quotient); return success(); } }; template struct MaxMinIOpConverter : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const final { Value lhs = op.getLhs(); Value rhs = op.getRhs(); Value cmp = rewriter.create(op.getLoc(), pred, lhs, rhs); rewriter.replaceOpWithNewOp(op, cmp, lhs, rhs); return success(); } }; template struct MaximumMinimumFOpConverter : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const final { Value lhs = op.getLhs(); Value rhs = op.getRhs(); Location loc = op.getLoc(); // If any operand is NaN, 'cmp' will be true (and 'select' returns 'lhs'). static_assert(pred == arith::CmpFPredicate::UGT || pred == arith::CmpFPredicate::ULT, "pred must be either UGT or ULT"); Value cmp = rewriter.create(loc, pred, lhs, rhs); Value select = rewriter.create(loc, cmp, lhs, rhs); // Handle the case where rhs is NaN: 'isNaN(rhs) ? rhs : select'. Value isNaN = rewriter.create(loc, arith::CmpFPredicate::UNO, rhs, rhs); rewriter.replaceOpWithNewOp(op, isNaN, rhs, select); return success(); } }; template struct MaxNumMinNumFOpConverter : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const final { Value lhs = op.getLhs(); Value rhs = op.getRhs(); Location loc = op.getLoc(); // If any operand is NaN, 'cmp' will be true (and 'select' returns 'lhs'). static_assert(pred == arith::CmpFPredicate::UGT || pred == arith::CmpFPredicate::ULT, "pred must be either UGT or ULT"); Value cmp = rewriter.create(loc, pred, lhs, rhs); Value select = rewriter.create(loc, cmp, lhs, rhs); // Handle the case where lhs is NaN: 'isNaN(lhs) ? rhs : select'. Value isNaN = rewriter.create(loc, arith::CmpFPredicate::UNO, lhs, lhs); rewriter.replaceOpWithNewOp(op, isNaN, rhs, select); return success(); } }; struct BFloat16ExtFOpConverter : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(arith::ExtFOp op, PatternRewriter &rewriter) const final { ImplicitLocOpBuilder b(op.getLoc(), rewriter); auto operand = op.getOperand(); Type operandTy = operand.getType(); Type resultTy = op.getType(); Type operandETy = getElementTypeOrSelf(operandTy); Type resultETy = getElementTypeOrSelf(resultTy); if (!operandETy.isBF16() || !resultETy.isF32()) { return rewriter.notifyMatchFailure(op, "not a ext of bf16 to f32."); } Type i16Ty = b.getI16Type(); Type i32Ty = b.getI32Type(); if (auto shapedTy = dyn_cast(operandTy)) { i16Ty = shapedTy.clone(i16Ty); i32Ty = shapedTy.clone(i32Ty); } Value bitcast = b.create(i16Ty, operand); Value exti = b.create(i32Ty, bitcast); Value c16 = createConst(op.getLoc(), i32Ty, 16, rewriter); Value shl = b.create(exti, c16); Value result = b.create(resultTy, shl); rewriter.replaceOp(op, result); return success(); } }; struct BFloat16TruncFOpConverter : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(arith::TruncFOp op, PatternRewriter &rewriter) const final { ImplicitLocOpBuilder b(op.getLoc(), rewriter); auto operand = op.getOperand(); Type operandTy = operand.getType(); Type resultTy = op.getType(); Type operandETy = getElementTypeOrSelf(operandTy); Type resultETy = getElementTypeOrSelf(resultTy); if (!operandETy.isF32() || !resultETy.isBF16()) { return rewriter.notifyMatchFailure(op, "not a trunc of f32 to bf16."); } if (op.getRoundingmodeAttr()) { return rewriter.notifyMatchFailure( op, "only applicable to default rounding mode."); } Type i16Ty = b.getI16Type(); Type i32Ty = b.getI32Type(); Type f32Ty = b.getF32Type(); if (auto shapedTy = dyn_cast(operandTy)) { i16Ty = shapedTy.clone(i16Ty); i32Ty = shapedTy.clone(i32Ty); f32Ty = shapedTy.clone(f32Ty); } // Algorithm borrowed from this excellent code: // https://github.com/pytorch/pytorch/blob/e1502c0cdbfd17548c612f25d5a65b1e4b86224d/c10/util/BFloat16.h#L60-L79 // There is a magic idea there, to let the addition of the rounding_bias to // the mantissa simply overflow into the exponent bits. It's a bit of an // aggressive, obfuscating optimization, but it is well-tested code, and it // results in more concise and efficient IR. // The case of NaN is handled separately (see isNaN and the final select). // The case of infinities is NOT handled separately, which deserves an // explanation. As the encoding of infinities has zero mantissa, the // rounding-bias addition never carries into the exponent so that just gets // truncated away, and as bfloat16 and float32 have the same number of // exponent bits, that simple truncation is the desired outcome for // infinities. Value isNan = b.create(arith::CmpFPredicate::UNE, operand, operand); // Constant used to make the rounding bias. Value c7FFF = createConst(op.getLoc(), i32Ty, 0x7fff, rewriter); // Constant used to generate a quiet NaN. Value c7FC0_i16 = createConst(op.getLoc(), i16Ty, 0x7fc0, rewriter); // Small constants used to address bits. Value c16 = createConst(op.getLoc(), i32Ty, 16, rewriter); Value c1 = createConst(op.getLoc(), i32Ty, 1, rewriter); // Reinterpret the input f32 value as bits. Value bitcast = b.create(i32Ty, operand); // Read bit 16 as a value in {0,1}. Value bit16 = b.create(b.create(bitcast, c16), c1); // Determine the rounding bias to add as either 0x7fff or 0x8000 depending // on bit 16, implementing the tie-breaking "to nearest even". Value roundingBias = b.create(bit16, c7FFF); // Add the rounding bias. Generally we want this to be added to the // mantissa, but nothing prevents this to from carrying into the exponent // bits, which would feel like a bug, but this is the magic trick here: // when that happens, the mantissa gets reset to zero and the exponent // gets incremented by the carry... which is actually exactly what we // want. Value biased = b.create(bitcast, roundingBias); // Now that the rounding-bias has been added, truncating the low bits // yields the correctly rounded result. Value biasedAndShifted = b.create(biased, c16); Value normalCaseResult_i16 = b.create(i16Ty, biasedAndShifted); // Select either the above-computed result, or a quiet NaN constant // if the input was NaN. Value select = b.create(isNan, c7FC0_i16, normalCaseResult_i16); Value result = b.create(resultTy, select); rewriter.replaceOp(op, result); return success(); } }; struct ArithExpandOpsPass : public arith::impl::ArithExpandOpsPassBase { using ArithExpandOpsPassBase::ArithExpandOpsPassBase; void runOnOperation() override { RewritePatternSet patterns(&getContext()); ConversionTarget target(getContext()); arith::populateArithExpandOpsPatterns(patterns); target.addLegalDialect(); // clang-format off target.addIllegalOp< arith::CeilDivSIOp, arith::CeilDivUIOp, arith::FloorDivSIOp, arith::MaxSIOp, arith::MaxUIOp, arith::MinSIOp, arith::MinUIOp, arith::MaximumFOp, arith::MinimumFOp, arith::MaxNumFOp, arith::MinNumFOp >(); if (includeBf16) { arith::populateExpandBFloat16Patterns(patterns); target.addDynamicallyLegalOp( [](arith::ExtFOp op) { Type inETy = getElementTypeOrSelf(op.getOperand().getType()); Type outETy = getElementTypeOrSelf(op.getType()); return !(inETy.isBF16() && outETy.isF32()); }); target.addDynamicallyLegalOp( [](arith::TruncFOp op) { Type inETy = getElementTypeOrSelf(op.getOperand().getType()); Type outETy = getElementTypeOrSelf(op.getType()); return !(inETy.isF32() && outETy.isBF16()); }); } // clang-format on if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) signalPassFailure(); } }; } // namespace void mlir::arith::populateCeilFloorDivExpandOpsPatterns( RewritePatternSet &patterns) { patterns .add( patterns.getContext()); } void mlir::arith::populateExpandBFloat16Patterns(RewritePatternSet &patterns) { patterns.add( patterns.getContext()); } void mlir::arith::populateArithExpandOpsPatterns(RewritePatternSet &patterns) { populateCeilFloorDivExpandOpsPatterns(patterns); // clang-format off patterns.add< MaxMinIOpConverter, MaxMinIOpConverter, MaxMinIOpConverter, MaxMinIOpConverter, MaximumMinimumFOpConverter, MaximumMinimumFOpConverter, MaxNumMinNumFOpConverter, MaxNumMinNumFOpConverter >(patterns.getContext()); // clang-format on }