xref: /llvm-project/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h (revision 4daf86ef3f9a6781eedbe5d5a161e4f1e75df323)
1 //===- File.h - Reading sparse tensors from files ---------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements reading sparse tensor from files in one of the
10 // following external formats:
11 //
12 // (1) Matrix Market Exchange (MME): *.mtx
13 //     https://math.nist.gov/MatrixMarket/formats.html
14 //
15 // (2) Formidable Repository of Open Sparse Tensors and Tools (FROSTT): *.tns
16 //     http://frostt.io/tensors/file-formats.html
17 //
18 //===----------------------------------------------------------------------===//
19 
20 #ifndef MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H
21 #define MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H
22 
23 #include "mlir/ExecutionEngine/SparseTensor/MapRef.h"
24 #include "mlir/ExecutionEngine/SparseTensor/Storage.h"
25 
26 #include <fstream>
27 
28 namespace mlir {
29 namespace sparse_tensor {
30 
31 namespace detail {
32 
33 template <typename T>
34 struct is_complex final : public std::false_type {};
35 
36 template <typename T>
37 struct is_complex<std::complex<T>> final : public std::true_type {};
38 
39 /// Returns an element-value of non-complex type.  If `IsPattern` is true,
40 /// then returns an arbitrary value.  If `IsPattern` is false, then
41 /// reads the value from the current line buffer beginning at `linePtr`.
42 template <typename V, bool IsPattern>
43 inline std::enable_if_t<!is_complex<V>::value, V> readValue(char **linePtr) {
44   // The external formats always store these numerical values with the type
45   // double, but we cast these values to the sparse tensor object type.
46   // For a pattern tensor, we arbitrarily pick the value 1 for all entries.
47   if constexpr (IsPattern)
48     return 1.0;
49   return strtod(*linePtr, linePtr);
50 }
51 
52 /// Returns an element-value of complex type.  If `IsPattern` is true,
53 /// then returns an arbitrary value.  If `IsPattern` is false, then reads
54 /// the value from the current line buffer beginning at `linePtr`.
55 template <typename V, bool IsPattern>
56 inline std::enable_if_t<is_complex<V>::value, V> readValue(char **linePtr) {
57   // Read two values to make a complex. The external formats always store
58   // numerical values with the type double, but we cast these values to the
59   // sparse tensor object type. For a pattern tensor, we arbitrarily pick the
60   // value 1 for all entries.
61   if constexpr (IsPattern)
62     return V(1.0, 1.0);
63   double re = strtod(*linePtr, linePtr);
64   double im = strtod(*linePtr, linePtr);
65   // Avoiding brace-notation since that forbids narrowing to `float`.
66   return V(re, im);
67 }
68 
69 /// Returns an element-value.  If `isPattern` is true, then returns an
70 /// arbitrary value.  If `isPattern` is false, then reads the value from
71 /// the current line buffer beginning at `linePtr`.
72 template <typename V>
73 inline V readValue(char **linePtr, bool isPattern) {
74   return isPattern ? readValue<V, true>(linePtr) : readValue<V, false>(linePtr);
75 }
76 
77 } // namespace detail
78 
79 //===----------------------------------------------------------------------===//
80 //
81 //  Reader class.
82 //
83 //===----------------------------------------------------------------------===//
84 
85 /// This class abstracts over the information stored in file headers,
86 /// as well as providing the buffers and methods for parsing those headers.
87 class SparseTensorReader final {
88 public:
89   enum class ValueKind : uint8_t {
90     // The value before calling `readHeader`.
91     kInvalid = 0,
92     // Values that can be set by `readMMEHeader`.
93     kPattern = 1,
94     kReal = 2,
95     kInteger = 3,
96     kComplex = 4,
97     // The value set by `readExtFROSTTHeader`.
98     kUndefined = 5
99   };
100 
101   explicit SparseTensorReader(const char *filename) : filename(filename) {
102     assert(filename && "Received nullptr for filename");
103   }
104 
105   // Disallows copying, to avoid duplicating the `file` pointer.
106   SparseTensorReader(const SparseTensorReader &) = delete;
107   SparseTensorReader &operator=(const SparseTensorReader &) = delete;
108 
109   /// Factory method to allocate a new reader, open the file, read the
110   /// header, and validate that the actual contents of the file match
111   /// the expected `dimShape` and `valTp`.
112   static SparseTensorReader *create(const char *filename, uint64_t dimRank,
113                                     const uint64_t *dimShape,
114                                     PrimaryType valTp) {
115     SparseTensorReader *reader = new SparseTensorReader(filename);
116     reader->openFile();
117     reader->readHeader();
118     if (!reader->canReadAs(valTp)) {
119       fprintf(stderr,
120               "Tensor element type %d not compatible with values in file %s\n",
121               static_cast<int>(valTp), filename);
122       exit(1);
123     }
124     reader->assertMatchesShape(dimRank, dimShape);
125     return reader;
126   }
127 
128   // This dtor tries to avoid leaking the `file`.  (Though it's better
129   // to call `closeFile` explicitly when possible, since there are
130   // circumstances where dtors are not called reliably.)
131   ~SparseTensorReader() { closeFile(); }
132 
133   /// Opens the file for reading.
134   void openFile();
135 
136   /// Closes the file.
137   void closeFile();
138 
139   /// Reads and parses the file's header.
140   void readHeader();
141 
142   /// Returns the stored value kind.
143   ValueKind getValueKind() const { return valueKind_; }
144 
145   /// Checks if a header has been successfully read.
146   bool isValid() const { return valueKind_ != ValueKind::kInvalid; }
147 
148   /// Checks if the file's ValueKind can be converted into the given
149   /// tensor PrimaryType.  Is only valid after parsing the header.
150   bool canReadAs(PrimaryType valTy) const;
151 
152   /// Gets the MME "pattern" property setting.  Is only valid after
153   /// parsing the header.
154   bool isPattern() const {
155     assert(isValid() && "Attempt to isPattern() before readHeader()");
156     return valueKind_ == ValueKind::kPattern;
157   }
158 
159   /// Gets the MME "symmetric" property setting.  Is only valid after
160   /// parsing the header.
161   bool isSymmetric() const {
162     assert(isValid() && "Attempt to isSymmetric() before readHeader()");
163     return isSymmetric_;
164   }
165 
166   /// Gets the dimension-rank of the tensor.  Is only valid after parsing
167   /// the header.
168   uint64_t getRank() const {
169     assert(isValid() && "Attempt to getRank() before readHeader()");
170     return idata[0];
171   }
172 
173   /// Gets the number of stored elements.  Is only valid after parsing
174   /// the header.
175   uint64_t getNSE() const {
176     assert(isValid() && "Attempt to getNSE() before readHeader()");
177     return idata[1];
178   }
179 
180   /// Gets the dimension-sizes array.  The pointer itself is always
181   /// valid; however, the values stored therein are only valid after
182   /// parsing the header.
183   const uint64_t *getDimSizes() const { return idata + 2; }
184 
185   /// Safely gets the size of the given dimension.  Is only valid
186   /// after parsing the header.
187   uint64_t getDimSize(uint64_t d) const {
188     assert(d < getRank() && "Dimension out of bounds");
189     return idata[2 + d];
190   }
191 
192   /// Asserts the shape subsumes the actual dimension sizes.  Is only
193   /// valid after parsing the header.
194   void assertMatchesShape(uint64_t rank, const uint64_t *shape) const;
195 
196   /// Allocates a new sparse-tensor storage object with the given encoding,
197   /// initializes it by reading all the elements from the file, and then
198   /// closes the file. Templated on P, I, and V.
199   template <typename P, typename I, typename V>
200   SparseTensorStorage<P, I, V> *
201   readSparseTensor(uint64_t lvlRank, const uint64_t *lvlSizes,
202                    const LevelType *lvlTypes, const uint64_t *dim2lvl,
203                    const uint64_t *lvl2dim) {
204     const uint64_t dimRank = getRank();
205     MapRef map(dimRank, lvlRank, dim2lvl, lvl2dim);
206     auto *lvlCOO = readCOO<V>(map, lvlSizes);
207     auto *tensor = SparseTensorStorage<P, I, V>::newFromCOO(
208         dimRank, getDimSizes(), lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim,
209         lvlCOO);
210     delete lvlCOO;
211     return tensor;
212   }
213 
214   /// Reads the COO tensor from the file, stores the coordinates and values to
215   /// the given buffers, returns a boolean value to indicate whether the COO
216   /// elements are sorted.
217   template <typename C, typename V>
218   bool readToBuffers(uint64_t lvlRank, const uint64_t *dim2lvl,
219                      const uint64_t *lvl2dim, C *lvlCoordinates, V *values);
220 
221 private:
222   /// Attempts to read a line from the file.
223   void readLine();
224 
225   /// Reads the next line of the input file and parses the coordinates
226   /// into the `dimCoords` argument.  Returns the position in the `line`
227   /// buffer where the element's value should be parsed from.
228   template <typename C>
229   char *readCoords(C *dimCoords) {
230     readLine();
231     // Local variable for tracking the parser's position in the `line` buffer.
232     char *linePtr = line;
233     for (uint64_t dimRank = getRank(), d = 0; d < dimRank; ++d) {
234       // Parse the 1-based coordinate.
235       uint64_t c = strtoul(linePtr, &linePtr, 10);
236       // Store the 0-based coordinate.
237       dimCoords[d] = static_cast<C>(c - 1);
238     }
239     return linePtr;
240   }
241 
242   /// Reads all the elements from the file while applying the given map.
243   template <typename V>
244   SparseTensorCOO<V> *readCOO(const MapRef &map, const uint64_t *lvlSizes);
245 
246   /// The implementation of `readCOO` that is templated `IsPattern` in order
247   /// to perform LICM without needing to duplicate the source code.
248   template <typename V, bool IsPattern>
249   void readCOOLoop(const MapRef &map, SparseTensorCOO<V> *coo);
250 
251   /// The internal implementation of `readToBuffers`. We template over
252   /// `IsPattern` in order to perform LICM without needing to duplicate
253   /// the source code.
254   template <typename C, typename V, bool IsPattern>
255   bool readToBuffersLoop(const MapRef &map, C *lvlCoordinates, V *values);
256 
257   /// Reads the MME header of a general sparse matrix of type real.
258   void readMMEHeader();
259 
260   /// Reads the "extended" FROSTT header. Although not part of the
261   /// documented format, we assume that the file starts with optional
262   /// comments followed by two lines that define the rank, the number of
263   /// nonzeros, and the dimensions sizes (one per rank) of the sparse tensor.
264   void readExtFROSTTHeader();
265 
266   static constexpr int kColWidth = 1025;
267   const char *const filename;
268   FILE *file = nullptr;
269   ValueKind valueKind_ = ValueKind::kInvalid;
270   bool isSymmetric_ = false;
271   uint64_t idata[512];
272   char line[kColWidth];
273 };
274 
275 //===----------------------------------------------------------------------===//
276 //
277 //  Reader class methods.
278 //
279 //===----------------------------------------------------------------------===//
280 
281 template <typename V>
282 SparseTensorCOO<V> *SparseTensorReader::readCOO(const MapRef &map,
283                                                 const uint64_t *lvlSizes) {
284   assert(isValid() && "Attempt to readCOO() before readHeader()");
285   // Prepare a COO object with the number of stored elems as initial capacity.
286   auto *coo = new SparseTensorCOO<V>(map.getLvlRank(), lvlSizes, getNSE());
287   // Enter the reading loop.
288   if (isPattern())
289     readCOOLoop<V, true>(map, coo);
290   else
291     readCOOLoop<V, false>(map, coo);
292   // Close the file and return the COO.
293   closeFile();
294   return coo;
295 }
296 
297 template <typename V, bool IsPattern>
298 void SparseTensorReader::readCOOLoop(const MapRef &map,
299                                      SparseTensorCOO<V> *coo) {
300   const uint64_t dimRank = map.getDimRank();
301   const uint64_t lvlRank = map.getLvlRank();
302   assert(dimRank == getRank());
303   std::vector<uint64_t> dimCoords(dimRank);
304   std::vector<uint64_t> lvlCoords(lvlRank);
305   for (uint64_t k = 0, nse = getNSE(); k < nse; k++) {
306     char *linePtr = readCoords(dimCoords.data());
307     const V value = detail::readValue<V, IsPattern>(&linePtr);
308     map.pushforward(dimCoords.data(), lvlCoords.data());
309     coo->add(lvlCoords, value);
310   }
311 }
312 
313 template <typename C, typename V>
314 bool SparseTensorReader::readToBuffers(uint64_t lvlRank,
315                                        const uint64_t *dim2lvl,
316                                        const uint64_t *lvl2dim,
317                                        C *lvlCoordinates, V *values) {
318   assert(isValid() && "Attempt to readCOO() before readHeader()");
319   MapRef map(getRank(), lvlRank, dim2lvl, lvl2dim);
320   bool isSorted =
321       isPattern() ? readToBuffersLoop<C, V, true>(map, lvlCoordinates, values)
322                   : readToBuffersLoop<C, V, false>(map, lvlCoordinates, values);
323   closeFile();
324   return isSorted;
325 }
326 
327 template <typename C, typename V, bool IsPattern>
328 bool SparseTensorReader::readToBuffersLoop(const MapRef &map, C *lvlCoordinates,
329                                            V *values) {
330   const uint64_t dimRank = map.getDimRank();
331   const uint64_t lvlRank = map.getLvlRank();
332   const uint64_t nse = getNSE();
333   assert(dimRank == getRank());
334   std::vector<C> dimCoords(dimRank);
335   bool isSorted = false;
336   char *linePtr;
337   const auto readNextElement = [&]() {
338     linePtr = readCoords<C>(dimCoords.data());
339     map.pushforward(dimCoords.data(), lvlCoordinates);
340     *values = detail::readValue<V, IsPattern>(&linePtr);
341     if (isSorted) {
342       // Note that isSorted is set to false when reading the first element,
343       // to guarantee the safeness of using prevLvlCoords.
344       C *prevLvlCoords = lvlCoordinates - lvlRank;
345       for (uint64_t l = 0; l < lvlRank; ++l) {
346         if (prevLvlCoords[l] != lvlCoordinates[l]) {
347           if (prevLvlCoords[l] > lvlCoordinates[l])
348             isSorted = false;
349           break;
350         }
351       }
352     }
353     lvlCoordinates += lvlRank;
354     ++values;
355   };
356   readNextElement();
357   isSorted = true;
358   for (uint64_t n = 1; n < nse; ++n)
359     readNextElement();
360   return isSorted;
361 }
362 
363 } // namespace sparse_tensor
364 } // namespace mlir
365 
366 #endif // MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H
367