lib/sanitizer_common/sanitizer_lzw.h

*810390e3Srobert//===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
*810390e3Srobert//
*810390e3Srobert// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
*810390e3Srobert// See https://llvm.org/LICENSE.txt for license information.
*810390e3Srobert// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*810390e3Srobert//
*810390e3Srobert//===----------------------------------------------------------------------===//
*810390e3Srobert//
*810390e3Srobert// Lempel–Ziv–Welch encoding/decoding
*810390e3Srobert//
*810390e3Srobert//===----------------------------------------------------------------------===//
*810390e3Srobert
*810390e3Srobert#ifndef SANITIZER_LZW_H
*810390e3Srobert#define SANITIZER_LZW_H
*810390e3Srobert
*810390e3Srobert#include "sanitizer_dense_map.h"
*810390e3Srobert
*810390e3Srobertnamespace __sanitizer {
*810390e3Srobert
*810390e3Srobertusing LzwCodeType = u32;
*810390e3Srobert
*810390e3Sroberttemplate <class T, class ItIn, class ItOut>
*810390e3SrobertItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
*810390e3Srobert  using Substring =
*810390e3Srobert      detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
*810390e3Srobert
*810390e3Srobert  // Sentinel value for substrings of len 1.
*810390e3Srobert  static constexpr LzwCodeType kNoPrefix =
*810390e3Srobert      Min(DenseMapInfo<Substring>::getEmptyKey().first,
*810390e3Srobert          DenseMapInfo<Substring>::getTombstoneKey().first) -
*810390e3Srobert      1;
*810390e3Srobert  DenseMap<Substring, LzwCodeType> prefix_to_code;
*810390e3Srobert  {
*810390e3Srobert    // Add all substring of len 1 as initial dictionary.
*810390e3Srobert    InternalMmapVector<T> dict_len1;
*810390e3Srobert    for (auto it = begin; it != end; ++it)
*810390e3Srobert      if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
*810390e3Srobert        dict_len1.push_back(*it);
*810390e3Srobert
*810390e3Srobert    // Slightly helps with later delta encoding.
*810390e3Srobert    Sort(dict_len1.data(), dict_len1.size());
*810390e3Srobert
*810390e3Srobert    // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
*810390e3Srobert    // just generate them.
*810390e3Srobert    *out = dict_len1.size();
*810390e3Srobert    ++out;
*810390e3Srobert
*810390e3Srobert    for (uptr i = 0; i != dict_len1.size(); ++i) {
*810390e3Srobert      // Remap after the Sort.
*810390e3Srobert      prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
*810390e3Srobert      *out = dict_len1[i];
*810390e3Srobert      ++out;
*810390e3Srobert    }
*810390e3Srobert    CHECK_EQ(prefix_to_code.size(), dict_len1.size());
*810390e3Srobert  }
*810390e3Srobert
*810390e3Srobert  if (begin == end)
*810390e3Srobert    return out;
*810390e3Srobert
*810390e3Srobert  // Main LZW encoding loop.
*810390e3Srobert  LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
*810390e3Srobert  ++begin;
*810390e3Srobert  for (auto it = begin; it != end; ++it) {
*810390e3Srobert    // Extend match with the new item.
*810390e3Srobert    auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
*810390e3Srobert    if (ins.second) {
*810390e3Srobert      // This is a new substring, but emit the code for the current match
*810390e3Srobert      // (before extend). This allows LZW decoder to recover the dictionary.
*810390e3Srobert      *out = match;
*810390e3Srobert      ++out;
*810390e3Srobert      // Reset the match to a single item, which must be already in the map.
*810390e3Srobert      match = prefix_to_code.find({kNoPrefix, *it})->second;
*810390e3Srobert    } else {
*810390e3Srobert      // Already known, use as the current match.
*810390e3Srobert      match = ins.first->second;
*810390e3Srobert    }
*810390e3Srobert  }
*810390e3Srobert
*810390e3Srobert  *out = match;
*810390e3Srobert  ++out;
*810390e3Srobert
*810390e3Srobert  return out;
*810390e3Srobert}
*810390e3Srobert
*810390e3Sroberttemplate <class T, class ItIn, class ItOut>
*810390e3SrobertItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
*810390e3Srobert  if (begin == end)
*810390e3Srobert    return out;
*810390e3Srobert
*810390e3Srobert  // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
*810390e3Srobert  InternalMmapVector<T> dict_len1(*begin);
*810390e3Srobert  ++begin;
*810390e3Srobert
*810390e3Srobert  if (begin == end)
*810390e3Srobert    return out;
*810390e3Srobert
*810390e3Srobert  for (auto& v : dict_len1) {
*810390e3Srobert    v = *begin;
*810390e3Srobert    ++begin;
*810390e3Srobert  }
*810390e3Srobert
*810390e3Srobert  // Substrings of len 2 and up. Indexes are shifted because [0,
*810390e3Srobert  // dict_len1.size()) stored in dict_len1. Substings get here after being
*810390e3Srobert  // emitted to the output, so we can use output position.
*810390e3Srobert  InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
*810390e3Srobert      code_to_substr;
*810390e3Srobert
*810390e3Srobert  // Copies already emitted substrings into the output again.
*810390e3Srobert  auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
*810390e3Srobert    if (code < dict_len1.size()) {
*810390e3Srobert      *out = dict_len1[code];
*810390e3Srobert      ++out;
*810390e3Srobert      return out;
*810390e3Srobert    }
*810390e3Srobert    const auto& s = code_to_substr[code - dict_len1.size()];
*810390e3Srobert
*810390e3Srobert    for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
*810390e3Srobert    return out;
*810390e3Srobert  };
*810390e3Srobert
*810390e3Srobert  // Returns lens of the substring with the given code.
*810390e3Srobert  auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
*810390e3Srobert    if (code < dict_len1.size())
*810390e3Srobert      return 1;
*810390e3Srobert    const auto& s = code_to_substr[code - dict_len1.size()];
*810390e3Srobert    return s.second - s.first;
*810390e3Srobert  };
*810390e3Srobert
*810390e3Srobert  // Main LZW decoding loop.
*810390e3Srobert  LzwCodeType prev_code = *begin;
*810390e3Srobert  ++begin;
*810390e3Srobert  out = copy(prev_code, out);
*810390e3Srobert  for (auto it = begin; it != end; ++it) {
*810390e3Srobert    LzwCodeType code = *it;
*810390e3Srobert    auto start = out;
*810390e3Srobert    if (code == dict_len1.size() + code_to_substr.size()) {
*810390e3Srobert      // Special LZW case. The code is not in the dictionary yet. This is
*810390e3Srobert      // possible only when the new substring is the same as previous one plus
*810390e3Srobert      // the first item of the previous substring. We can emit that in two
*810390e3Srobert      // steps.
*810390e3Srobert      out = copy(prev_code, out);
*810390e3Srobert      *out = *start;
*810390e3Srobert      ++out;
*810390e3Srobert    } else {
*810390e3Srobert      out = copy(code, out);
*810390e3Srobert    }
*810390e3Srobert
*810390e3Srobert    // Every time encoded emits the code, it also creates substing of len + 1
*810390e3Srobert    // including the first item of the just emmited substring. Do the same here.
*810390e3Srobert    uptr len = code_to_len(prev_code);
*810390e3Srobert    code_to_substr.push_back({start - len, start + 1});
*810390e3Srobert
*810390e3Srobert    prev_code = code;
*810390e3Srobert  }
*810390e3Srobert  return out;
*810390e3Srobert}
*810390e3Srobert
*810390e3Srobert}  // namespace __sanitizer
*810390e3Srobert#endif