1 //===- File.h - Reading sparse tensors from files ---------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements reading sparse tensor from files in one of the 10 // following external formats: 11 // 12 // (1) Matrix Market Exchange (MME): *.mtx 13 // https://math.nist.gov/MatrixMarket/formats.html 14 // 15 // (2) Formidable Repository of Open Sparse Tensors and Tools (FROSTT): *.tns 16 // http://frostt.io/tensors/file-formats.html 17 // 18 //===----------------------------------------------------------------------===// 19 20 #ifndef MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H 21 #define MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H 22 23 #include "mlir/ExecutionEngine/SparseTensor/MapRef.h" 24 #include "mlir/ExecutionEngine/SparseTensor/Storage.h" 25 26 #include <fstream> 27 28 namespace mlir { 29 namespace sparse_tensor { 30 31 namespace detail { 32 33 template <typename T> 34 struct is_complex final : public std::false_type {}; 35 36 template <typename T> 37 struct is_complex<std::complex<T>> final : public std::true_type {}; 38 39 /// Returns an element-value of non-complex type. If `IsPattern` is true, 40 /// then returns an arbitrary value. If `IsPattern` is false, then 41 /// reads the value from the current line buffer beginning at `linePtr`. 42 template <typename V, bool IsPattern> 43 inline std::enable_if_t<!is_complex<V>::value, V> readValue(char **linePtr) { 44 // The external formats always store these numerical values with the type 45 // double, but we cast these values to the sparse tensor object type. 46 // For a pattern tensor, we arbitrarily pick the value 1 for all entries. 47 if constexpr (IsPattern) 48 return 1.0; 49 return strtod(*linePtr, linePtr); 50 } 51 52 /// Returns an element-value of complex type. If `IsPattern` is true, 53 /// then returns an arbitrary value. If `IsPattern` is false, then reads 54 /// the value from the current line buffer beginning at `linePtr`. 55 template <typename V, bool IsPattern> 56 inline std::enable_if_t<is_complex<V>::value, V> readValue(char **linePtr) { 57 // Read two values to make a complex. The external formats always store 58 // numerical values with the type double, but we cast these values to the 59 // sparse tensor object type. For a pattern tensor, we arbitrarily pick the 60 // value 1 for all entries. 61 if constexpr (IsPattern) 62 return V(1.0, 1.0); 63 double re = strtod(*linePtr, linePtr); 64 double im = strtod(*linePtr, linePtr); 65 // Avoiding brace-notation since that forbids narrowing to `float`. 66 return V(re, im); 67 } 68 69 /// Returns an element-value. If `isPattern` is true, then returns an 70 /// arbitrary value. If `isPattern` is false, then reads the value from 71 /// the current line buffer beginning at `linePtr`. 72 template <typename V> 73 inline V readValue(char **linePtr, bool isPattern) { 74 return isPattern ? readValue<V, true>(linePtr) : readValue<V, false>(linePtr); 75 } 76 77 } // namespace detail 78 79 //===----------------------------------------------------------------------===// 80 // 81 // Reader class. 82 // 83 //===----------------------------------------------------------------------===// 84 85 /// This class abstracts over the information stored in file headers, 86 /// as well as providing the buffers and methods for parsing those headers. 87 class SparseTensorReader final { 88 public: 89 enum class ValueKind : uint8_t { 90 // The value before calling `readHeader`. 91 kInvalid = 0, 92 // Values that can be set by `readMMEHeader`. 93 kPattern = 1, 94 kReal = 2, 95 kInteger = 3, 96 kComplex = 4, 97 // The value set by `readExtFROSTTHeader`. 98 kUndefined = 5 99 }; 100 101 explicit SparseTensorReader(const char *filename) : filename(filename) { 102 assert(filename && "Received nullptr for filename"); 103 } 104 105 // Disallows copying, to avoid duplicating the `file` pointer. 106 SparseTensorReader(const SparseTensorReader &) = delete; 107 SparseTensorReader &operator=(const SparseTensorReader &) = delete; 108 109 /// Factory method to allocate a new reader, open the file, read the 110 /// header, and validate that the actual contents of the file match 111 /// the expected `dimShape` and `valTp`. 112 static SparseTensorReader *create(const char *filename, uint64_t dimRank, 113 const uint64_t *dimShape, 114 PrimaryType valTp) { 115 SparseTensorReader *reader = new SparseTensorReader(filename); 116 reader->openFile(); 117 reader->readHeader(); 118 if (!reader->canReadAs(valTp)) { 119 fprintf(stderr, 120 "Tensor element type %d not compatible with values in file %s\n", 121 static_cast<int>(valTp), filename); 122 exit(1); 123 } 124 reader->assertMatchesShape(dimRank, dimShape); 125 return reader; 126 } 127 128 // This dtor tries to avoid leaking the `file`. (Though it's better 129 // to call `closeFile` explicitly when possible, since there are 130 // circumstances where dtors are not called reliably.) 131 ~SparseTensorReader() { closeFile(); } 132 133 /// Opens the file for reading. 134 void openFile(); 135 136 /// Closes the file. 137 void closeFile(); 138 139 /// Reads and parses the file's header. 140 void readHeader(); 141 142 /// Returns the stored value kind. 143 ValueKind getValueKind() const { return valueKind_; } 144 145 /// Checks if a header has been successfully read. 146 bool isValid() const { return valueKind_ != ValueKind::kInvalid; } 147 148 /// Checks if the file's ValueKind can be converted into the given 149 /// tensor PrimaryType. Is only valid after parsing the header. 150 bool canReadAs(PrimaryType valTy) const; 151 152 /// Gets the MME "pattern" property setting. Is only valid after 153 /// parsing the header. 154 bool isPattern() const { 155 assert(isValid() && "Attempt to isPattern() before readHeader()"); 156 return valueKind_ == ValueKind::kPattern; 157 } 158 159 /// Gets the MME "symmetric" property setting. Is only valid after 160 /// parsing the header. 161 bool isSymmetric() const { 162 assert(isValid() && "Attempt to isSymmetric() before readHeader()"); 163 return isSymmetric_; 164 } 165 166 /// Gets the dimension-rank of the tensor. Is only valid after parsing 167 /// the header. 168 uint64_t getRank() const { 169 assert(isValid() && "Attempt to getRank() before readHeader()"); 170 return idata[0]; 171 } 172 173 /// Gets the number of stored elements. Is only valid after parsing 174 /// the header. 175 uint64_t getNSE() const { 176 assert(isValid() && "Attempt to getNSE() before readHeader()"); 177 return idata[1]; 178 } 179 180 /// Gets the dimension-sizes array. The pointer itself is always 181 /// valid; however, the values stored therein are only valid after 182 /// parsing the header. 183 const uint64_t *getDimSizes() const { return idata + 2; } 184 185 /// Safely gets the size of the given dimension. Is only valid 186 /// after parsing the header. 187 uint64_t getDimSize(uint64_t d) const { 188 assert(d < getRank() && "Dimension out of bounds"); 189 return idata[2 + d]; 190 } 191 192 /// Asserts the shape subsumes the actual dimension sizes. Is only 193 /// valid after parsing the header. 194 void assertMatchesShape(uint64_t rank, const uint64_t *shape) const; 195 196 /// Allocates a new sparse-tensor storage object with the given encoding, 197 /// initializes it by reading all the elements from the file, and then 198 /// closes the file. Templated on P, I, and V. 199 template <typename P, typename I, typename V> 200 SparseTensorStorage<P, I, V> * 201 readSparseTensor(uint64_t lvlRank, const uint64_t *lvlSizes, 202 const LevelType *lvlTypes, const uint64_t *dim2lvl, 203 const uint64_t *lvl2dim) { 204 const uint64_t dimRank = getRank(); 205 MapRef map(dimRank, lvlRank, dim2lvl, lvl2dim); 206 auto *lvlCOO = readCOO<V>(map, lvlSizes); 207 auto *tensor = SparseTensorStorage<P, I, V>::newFromCOO( 208 dimRank, getDimSizes(), lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim, 209 lvlCOO); 210 delete lvlCOO; 211 return tensor; 212 } 213 214 /// Reads the COO tensor from the file, stores the coordinates and values to 215 /// the given buffers, returns a boolean value to indicate whether the COO 216 /// elements are sorted. 217 template <typename C, typename V> 218 bool readToBuffers(uint64_t lvlRank, const uint64_t *dim2lvl, 219 const uint64_t *lvl2dim, C *lvlCoordinates, V *values); 220 221 private: 222 /// Attempts to read a line from the file. 223 void readLine(); 224 225 /// Reads the next line of the input file and parses the coordinates 226 /// into the `dimCoords` argument. Returns the position in the `line` 227 /// buffer where the element's value should be parsed from. 228 template <typename C> 229 char *readCoords(C *dimCoords) { 230 readLine(); 231 // Local variable for tracking the parser's position in the `line` buffer. 232 char *linePtr = line; 233 for (uint64_t dimRank = getRank(), d = 0; d < dimRank; ++d) { 234 // Parse the 1-based coordinate. 235 uint64_t c = strtoul(linePtr, &linePtr, 10); 236 // Store the 0-based coordinate. 237 dimCoords[d] = static_cast<C>(c - 1); 238 } 239 return linePtr; 240 } 241 242 /// Reads all the elements from the file while applying the given map. 243 template <typename V> 244 SparseTensorCOO<V> *readCOO(const MapRef &map, const uint64_t *lvlSizes); 245 246 /// The implementation of `readCOO` that is templated `IsPattern` in order 247 /// to perform LICM without needing to duplicate the source code. 248 template <typename V, bool IsPattern> 249 void readCOOLoop(const MapRef &map, SparseTensorCOO<V> *coo); 250 251 /// The internal implementation of `readToBuffers`. We template over 252 /// `IsPattern` in order to perform LICM without needing to duplicate 253 /// the source code. 254 template <typename C, typename V, bool IsPattern> 255 bool readToBuffersLoop(const MapRef &map, C *lvlCoordinates, V *values); 256 257 /// Reads the MME header of a general sparse matrix of type real. 258 void readMMEHeader(); 259 260 /// Reads the "extended" FROSTT header. Although not part of the 261 /// documented format, we assume that the file starts with optional 262 /// comments followed by two lines that define the rank, the number of 263 /// nonzeros, and the dimensions sizes (one per rank) of the sparse tensor. 264 void readExtFROSTTHeader(); 265 266 static constexpr int kColWidth = 1025; 267 const char *const filename; 268 FILE *file = nullptr; 269 ValueKind valueKind_ = ValueKind::kInvalid; 270 bool isSymmetric_ = false; 271 uint64_t idata[512]; 272 char line[kColWidth]; 273 }; 274 275 //===----------------------------------------------------------------------===// 276 // 277 // Reader class methods. 278 // 279 //===----------------------------------------------------------------------===// 280 281 template <typename V> 282 SparseTensorCOO<V> *SparseTensorReader::readCOO(const MapRef &map, 283 const uint64_t *lvlSizes) { 284 assert(isValid() && "Attempt to readCOO() before readHeader()"); 285 // Prepare a COO object with the number of stored elems as initial capacity. 286 auto *coo = new SparseTensorCOO<V>(map.getLvlRank(), lvlSizes, getNSE()); 287 // Enter the reading loop. 288 if (isPattern()) 289 readCOOLoop<V, true>(map, coo); 290 else 291 readCOOLoop<V, false>(map, coo); 292 // Close the file and return the COO. 293 closeFile(); 294 return coo; 295 } 296 297 template <typename V, bool IsPattern> 298 void SparseTensorReader::readCOOLoop(const MapRef &map, 299 SparseTensorCOO<V> *coo) { 300 const uint64_t dimRank = map.getDimRank(); 301 const uint64_t lvlRank = map.getLvlRank(); 302 assert(dimRank == getRank()); 303 std::vector<uint64_t> dimCoords(dimRank); 304 std::vector<uint64_t> lvlCoords(lvlRank); 305 for (uint64_t k = 0, nse = getNSE(); k < nse; k++) { 306 char *linePtr = readCoords(dimCoords.data()); 307 const V value = detail::readValue<V, IsPattern>(&linePtr); 308 map.pushforward(dimCoords.data(), lvlCoords.data()); 309 coo->add(lvlCoords, value); 310 } 311 } 312 313 template <typename C, typename V> 314 bool SparseTensorReader::readToBuffers(uint64_t lvlRank, 315 const uint64_t *dim2lvl, 316 const uint64_t *lvl2dim, 317 C *lvlCoordinates, V *values) { 318 assert(isValid() && "Attempt to readCOO() before readHeader()"); 319 MapRef map(getRank(), lvlRank, dim2lvl, lvl2dim); 320 bool isSorted = 321 isPattern() ? readToBuffersLoop<C, V, true>(map, lvlCoordinates, values) 322 : readToBuffersLoop<C, V, false>(map, lvlCoordinates, values); 323 closeFile(); 324 return isSorted; 325 } 326 327 template <typename C, typename V, bool IsPattern> 328 bool SparseTensorReader::readToBuffersLoop(const MapRef &map, C *lvlCoordinates, 329 V *values) { 330 const uint64_t dimRank = map.getDimRank(); 331 const uint64_t lvlRank = map.getLvlRank(); 332 const uint64_t nse = getNSE(); 333 assert(dimRank == getRank()); 334 std::vector<C> dimCoords(dimRank); 335 bool isSorted = false; 336 char *linePtr; 337 const auto readNextElement = [&]() { 338 linePtr = readCoords<C>(dimCoords.data()); 339 map.pushforward(dimCoords.data(), lvlCoordinates); 340 *values = detail::readValue<V, IsPattern>(&linePtr); 341 if (isSorted) { 342 // Note that isSorted is set to false when reading the first element, 343 // to guarantee the safeness of using prevLvlCoords. 344 C *prevLvlCoords = lvlCoordinates - lvlRank; 345 for (uint64_t l = 0; l < lvlRank; ++l) { 346 if (prevLvlCoords[l] != lvlCoordinates[l]) { 347 if (prevLvlCoords[l] > lvlCoordinates[l]) 348 isSorted = false; 349 break; 350 } 351 } 352 } 353 lvlCoordinates += lvlRank; 354 ++values; 355 }; 356 readNextElement(); 357 isSorted = true; 358 for (uint64_t n = 1; n < nse; ++n) 359 readNextElement(); 360 return isSorted; 361 } 362 363 } // namespace sparse_tensor 364 } // namespace mlir 365 366 #endif // MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H 367