FakeQuantSupport.cpp - OpenGrok cross reference for /llvm-project/mlir/lib/Dialect/Quant/Utils/FakeQuantSupport.cpp

Lines Matching +full:zero +full:- +full:point

1 //===- FakeQuantSupport.cpp - Support utilities for FakeQuant ops ---------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
19   // Hard-coded type mapping from TFLite.
23       qmin = -128;
32       qmin = -32768;
60 // to include 0.0, but the range width size (rmax-rmin) isn't changed. The zero
61 // point is derived from the shifted range, and the scale isn't changed. As
72   scale = (rmax - rmin) / (qmaxDouble - qminDouble);
74   // Zero point computation.
78   // The arithmetic error on the zero point computed from either pair will be
81   const double zeroPointFromMin = qminDouble - rmin / scale;
84   const double zeroPointFromMax = qmaxDouble - rmax / scale;
92   // Now nudge the zero point to be an integer.
102   // By construction, the nudged zero point should always be in range.
123   // 0.0s, so the scale is set to 1.0 and the tensor can be quantized to zero
125   if (std::fabs(rmax - rmin) < std::numeric_limits<double>::epsilon()) {
145     return (emitError(loc, "mismatched per-axis min and max size: ")
167     if (std::fabs(rmax - rmin) < std::numeric_limits<double>::epsilon()) {