xref: /openbsd-src/gnu/llvm/compiler-rt/lib/sanitizer_common/sanitizer_lzw.h (revision 810390e339a5425391477d5d41c78d7cab2424ac)
1*810390e3Srobert //===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
2*810390e3Srobert //
3*810390e3Srobert // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*810390e3Srobert // See https://llvm.org/LICENSE.txt for license information.
5*810390e3Srobert // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*810390e3Srobert //
7*810390e3Srobert //===----------------------------------------------------------------------===//
8*810390e3Srobert //
9*810390e3Srobert // Lempel–Ziv–Welch encoding/decoding
10*810390e3Srobert //
11*810390e3Srobert //===----------------------------------------------------------------------===//
12*810390e3Srobert 
13*810390e3Srobert #ifndef SANITIZER_LZW_H
14*810390e3Srobert #define SANITIZER_LZW_H
15*810390e3Srobert 
16*810390e3Srobert #include "sanitizer_dense_map.h"
17*810390e3Srobert 
18*810390e3Srobert namespace __sanitizer {
19*810390e3Srobert 
20*810390e3Srobert using LzwCodeType = u32;
21*810390e3Srobert 
22*810390e3Srobert template <class T, class ItIn, class ItOut>
LzwEncode(ItIn begin,ItIn end,ItOut out)23*810390e3Srobert ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
24*810390e3Srobert   using Substring =
25*810390e3Srobert       detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
26*810390e3Srobert 
27*810390e3Srobert   // Sentinel value for substrings of len 1.
28*810390e3Srobert   static constexpr LzwCodeType kNoPrefix =
29*810390e3Srobert       Min(DenseMapInfo<Substring>::getEmptyKey().first,
30*810390e3Srobert           DenseMapInfo<Substring>::getTombstoneKey().first) -
31*810390e3Srobert       1;
32*810390e3Srobert   DenseMap<Substring, LzwCodeType> prefix_to_code;
33*810390e3Srobert   {
34*810390e3Srobert     // Add all substring of len 1 as initial dictionary.
35*810390e3Srobert     InternalMmapVector<T> dict_len1;
36*810390e3Srobert     for (auto it = begin; it != end; ++it)
37*810390e3Srobert       if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
38*810390e3Srobert         dict_len1.push_back(*it);
39*810390e3Srobert 
40*810390e3Srobert     // Slightly helps with later delta encoding.
41*810390e3Srobert     Sort(dict_len1.data(), dict_len1.size());
42*810390e3Srobert 
43*810390e3Srobert     // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
44*810390e3Srobert     // just generate them.
45*810390e3Srobert     *out = dict_len1.size();
46*810390e3Srobert     ++out;
47*810390e3Srobert 
48*810390e3Srobert     for (uptr i = 0; i != dict_len1.size(); ++i) {
49*810390e3Srobert       // Remap after the Sort.
50*810390e3Srobert       prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
51*810390e3Srobert       *out = dict_len1[i];
52*810390e3Srobert       ++out;
53*810390e3Srobert     }
54*810390e3Srobert     CHECK_EQ(prefix_to_code.size(), dict_len1.size());
55*810390e3Srobert   }
56*810390e3Srobert 
57*810390e3Srobert   if (begin == end)
58*810390e3Srobert     return out;
59*810390e3Srobert 
60*810390e3Srobert   // Main LZW encoding loop.
61*810390e3Srobert   LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
62*810390e3Srobert   ++begin;
63*810390e3Srobert   for (auto it = begin; it != end; ++it) {
64*810390e3Srobert     // Extend match with the new item.
65*810390e3Srobert     auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
66*810390e3Srobert     if (ins.second) {
67*810390e3Srobert       // This is a new substring, but emit the code for the current match
68*810390e3Srobert       // (before extend). This allows LZW decoder to recover the dictionary.
69*810390e3Srobert       *out = match;
70*810390e3Srobert       ++out;
71*810390e3Srobert       // Reset the match to a single item, which must be already in the map.
72*810390e3Srobert       match = prefix_to_code.find({kNoPrefix, *it})->second;
73*810390e3Srobert     } else {
74*810390e3Srobert       // Already known, use as the current match.
75*810390e3Srobert       match = ins.first->second;
76*810390e3Srobert     }
77*810390e3Srobert   }
78*810390e3Srobert 
79*810390e3Srobert   *out = match;
80*810390e3Srobert   ++out;
81*810390e3Srobert 
82*810390e3Srobert   return out;
83*810390e3Srobert }
84*810390e3Srobert 
85*810390e3Srobert template <class T, class ItIn, class ItOut>
LzwDecode(ItIn begin,ItIn end,ItOut out)86*810390e3Srobert ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
87*810390e3Srobert   if (begin == end)
88*810390e3Srobert     return out;
89*810390e3Srobert 
90*810390e3Srobert   // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
91*810390e3Srobert   InternalMmapVector<T> dict_len1(*begin);
92*810390e3Srobert   ++begin;
93*810390e3Srobert 
94*810390e3Srobert   if (begin == end)
95*810390e3Srobert     return out;
96*810390e3Srobert 
97*810390e3Srobert   for (auto& v : dict_len1) {
98*810390e3Srobert     v = *begin;
99*810390e3Srobert     ++begin;
100*810390e3Srobert   }
101*810390e3Srobert 
102*810390e3Srobert   // Substrings of len 2 and up. Indexes are shifted because [0,
103*810390e3Srobert   // dict_len1.size()) stored in dict_len1. Substings get here after being
104*810390e3Srobert   // emitted to the output, so we can use output position.
105*810390e3Srobert   InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
106*810390e3Srobert       code_to_substr;
107*810390e3Srobert 
108*810390e3Srobert   // Copies already emitted substrings into the output again.
109*810390e3Srobert   auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
110*810390e3Srobert     if (code < dict_len1.size()) {
111*810390e3Srobert       *out = dict_len1[code];
112*810390e3Srobert       ++out;
113*810390e3Srobert       return out;
114*810390e3Srobert     }
115*810390e3Srobert     const auto& s = code_to_substr[code - dict_len1.size()];
116*810390e3Srobert 
117*810390e3Srobert     for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
118*810390e3Srobert     return out;
119*810390e3Srobert   };
120*810390e3Srobert 
121*810390e3Srobert   // Returns lens of the substring with the given code.
122*810390e3Srobert   auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
123*810390e3Srobert     if (code < dict_len1.size())
124*810390e3Srobert       return 1;
125*810390e3Srobert     const auto& s = code_to_substr[code - dict_len1.size()];
126*810390e3Srobert     return s.second - s.first;
127*810390e3Srobert   };
128*810390e3Srobert 
129*810390e3Srobert   // Main LZW decoding loop.
130*810390e3Srobert   LzwCodeType prev_code = *begin;
131*810390e3Srobert   ++begin;
132*810390e3Srobert   out = copy(prev_code, out);
133*810390e3Srobert   for (auto it = begin; it != end; ++it) {
134*810390e3Srobert     LzwCodeType code = *it;
135*810390e3Srobert     auto start = out;
136*810390e3Srobert     if (code == dict_len1.size() + code_to_substr.size()) {
137*810390e3Srobert       // Special LZW case. The code is not in the dictionary yet. This is
138*810390e3Srobert       // possible only when the new substring is the same as previous one plus
139*810390e3Srobert       // the first item of the previous substring. We can emit that in two
140*810390e3Srobert       // steps.
141*810390e3Srobert       out = copy(prev_code, out);
142*810390e3Srobert       *out = *start;
143*810390e3Srobert       ++out;
144*810390e3Srobert     } else {
145*810390e3Srobert       out = copy(code, out);
146*810390e3Srobert     }
147*810390e3Srobert 
148*810390e3Srobert     // Every time encoded emits the code, it also creates substing of len + 1
149*810390e3Srobert     // including the first item of the just emmited substring. Do the same here.
150*810390e3Srobert     uptr len = code_to_len(prev_code);
151*810390e3Srobert     code_to_substr.push_back({start - len, start + 1});
152*810390e3Srobert 
153*810390e3Srobert     prev_code = code;
154*810390e3Srobert   }
155*810390e3Srobert   return out;
156*810390e3Srobert }
157*810390e3Srobert 
158*810390e3Srobert }  // namespace __sanitizer
159*810390e3Srobert #endif
160