ExecutionEngine/SparseTensor/File.cpp

//===- File.cpp - Parsing sparse tensors from files -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements parsing and printing of files in one of the
// following external formats:
//
// (1) Matrix Market Exchange (MME): *.mtx
//     https://math.nist.gov/MatrixMarket/formats.html
//
// (2) Formidable Repository of Open Sparse Tensors and Tools (FROSTT): *.tns
//     http://frostt.io/tensors/file-formats.html
//
// This file is part of the lightweight runtime support library for sparse
// tensor manipulations.  The functionality of the support library is meant
// to simplify benchmarking, testing, and debugging MLIR code operating on
// sparse tensors.  However, the provided functionality is **not** part of
// core MLIR itself.
//
//===----------------------------------------------------------------------===//

#include "mlir/ExecutionEngine/SparseTensor/File.h"

#include <cctype>
#include <cinttypes>
#include <cstdlib>
#include <cstring>

using namespace mlir::sparse_tensor;

/// Opens `filename` in read mode and stores the resulting handle in `file`.
/// Reports a fatal error if a handle is already held, or if `fopen` fails.
void SparseTensorReader::openFile() {
  // A non-null handle means a previous openFile() was never closed.
  if (file != nullptr)
    MLIR_SPARSETENSOR_FATAL("Already opened file %s\n", filename);
  file = fopen(filename, "r");
  if (file == nullptr)
    MLIR_SPARSETENSOR_FATAL("Cannot find file %s\n", filename);
}

/// Closes the file if one is open and clears the handle; otherwise does
/// nothing, so repeated calls are harmless.
void SparseTensorReader::closeFile() {
  if (file == nullptr)
    return;
  fclose(file);
  file = nullptr;
}

/// Reads the next line of the file into the `line` buffer using `fgets`
/// with a limit of `kColWidth`.  Reports a fatal error when `fgets`
/// returns null (nothing could be read).
void SparseTensorReader::readLine() {
  const bool gotLine = fgets(line, kColWidth, file) != nullptr;
  if (gotLine)
    return;
  MLIR_SPARSETENSOR_FATAL("Cannot read next line of %s\n", filename);
}

/// Reads the next line of the file and parses `getRank()` 1-based indices
/// from its start, storing them 0-based into `indices` (which must
/// therefore have room for `getRank()` entries).
///
/// Returns a pointer into the `line` buffer just past the last index that
/// was parsed, so the caller can continue parsing the rest of the line.
char *SparseTensorReader::readCOOIndices(uint64_t *indices) {
  readLine();
  // Local variable for tracking the parser's position in the `line` buffer.
  char *linePtr = line;
  for (uint64_t rank = getRank(), r = 0; r < rank; ++r) {
    // Parse the 1-based index.  `strtoull` is used rather than `strtoul`
    // because `unsigned long` is only 32 bits wide on LLP64 targets, which
    // would clamp indices that do not fit in 32 bits.
    const uint64_t idx = strtoull(linePtr, &linePtr, 10);
    // Store the 0-based index.
    // NOTE(review): the parsed value is not validated; an index of 0 (also
    // what `strtoull` yields for unparsable text) wraps around here.
    indices[r] = idx - 1;
  }
  return linePtr;
}

/// Parses the header of the opened file, picking the parser from the file
/// name: a name containing ".mtx" is read as MME, otherwise a name
/// containing ".tns" is read as extended FROSTT.  Any other name is a
/// fatal error.
void SparseTensorReader::readHeader() {
  assert(file && "Attempt to readHeader() before openFile()");
  // The match is a substring search, and ".mtx" takes precedence.
  const bool isMME = strstr(filename, ".mtx") != nullptr;
  if (isMME) {
    readMMEHeader();
  } else {
    const bool isFROSTT = strstr(filename, ".tns") != nullptr;
    if (isFROSTT) {
      readExtFROSTTHeader();
    } else {
      MLIR_SPARSETENSOR_FATAL("Unknown format %s\n", filename);
    }
  }
  assert(isValid() && "Failed to read the header");
}

/// Checks, via `assert`, that the given `rank` equals the rank from the
/// header and that every entry of `shape` is either 0 or equal to the
/// corresponding dimension size stored in `idata`.  Must only be called
/// once the header has been parsed.
void SparseTensorReader::assertMatchesShape(uint64_t rank,
                                            const uint64_t *shape) const {
  assert(rank == getRank() && "Rank mismatch");
  // Dimension sizes are stored in `idata` starting at offset 2.
  for (uint64_t d = 0; d != rank; ++d) {
    assert((shape[d] == 0 || shape[d] == idata[2 + d]) &&
           "Dimension size mismatch");
  }
}

/// Returns whether values of the kind declared by the file's header may
/// be read into a tensor whose primary type is `valTy`.  The header must
/// have been read first.
bool SparseTensorReader::canReadAs(PrimaryType valTy) const {
  switch (valueKind_) {
  case ValueKind::kPattern:
    // Pattern files are accepted for every primary type.
    return true;
  case ValueKind::kInteger:
    // Integer files may still be implicitly converted to floating
    // primary-types.
  case ValueKind::kUndefined:
    // The "extended" FROSTT format doesn't specify a ValueKind, so the
    // stored values may be implicitly converted to both integer and
    // floating primary-types.
    return isRealPrimaryType(valTy);
  case ValueKind::kReal:
    // Real/floating files are never implicitly converted to integer
    // primary-types.
    return isFloatingPrimaryType(valTy);
  case ValueKind::kComplex:
    // Complex files require a complex primary-type.
    return isComplexPrimaryType(valTy);
  case ValueKind::kInvalid:
    assert(false && "Must readHeader() before calling canReadAs()");
    return false; // In case assertions are disabled.
  }
  MLIR_SPARSETENSOR_FATAL("Unknown ValueKind: %d\n",
                          static_cast<uint8_t>(valueKind_));
}

/// Helper to convert C-style strings (i.e., '\0' terminated) to lower case,
/// in place.
static inline void toLower(char *token) {
  for (char *c = token; *c; ++c) {
    // `tolower` requires an argument representable as `unsigned char` (or
    // EOF); passing a negative plain `char` is undefined behaviour.
    *c = static_cast<char>(tolower(static_cast<unsigned char>(*c)));
  }
}

/// Returns true iff the two '\0'-terminated strings have identical contents.
static inline bool streq(const char *lhs, const char *rhs) {
  const int order = strcmp(lhs, rhs);
  return !order;
}

/// Returns true iff the two '\0'-terminated strings differ in contents.
static inline bool strne(const char *lhs, const char *rhs) {
  const int order = strcmp(lhs, rhs);
  return order != 0;
}

/// Reads the MME header: the banner line, any comment lines, and the size
/// line.  Only `matrix coordinate` banners whose field is `pattern`,
/// `real`, `integer`, or `complex` and whose symmetry is `general` or
/// `symmetric` are accepted (case-insensitively); anything else is a fatal
/// error.  On success this sets `valueKind_`, `isSymmetric_`, and fills
/// `idata` with the rank (always 2), the number of nonzeros, and the two
/// dimension sizes.
void SparseTensorReader::readMMEHeader() {
  char header[64];
  char object[64];
  char format[64];
  char field[64];
  char symmetry[64];
  // Read header line.  The `%63s` widths leave room for the terminating
  // '\0' in each 64-byte buffer.
  if (fscanf(file, "%63s %63s %63s %63s %63s\n", header, object, format, field,
             symmetry) != 5)
    MLIR_SPARSETENSOR_FATAL("Corrupt header in %s\n", filename);
  // Convert all to lowercase up front (to avoid accidental redundancy).
  toLower(header);
  toLower(object);
  toLower(format);
  toLower(field);
  toLower(symmetry);
  // Process `field`, which specifies pattern or the data type of the values.
  if (streq(field, "pattern"))
    valueKind_ = ValueKind::kPattern;
  else if (streq(field, "real"))
    valueKind_ = ValueKind::kReal;
  else if (streq(field, "integer"))
    valueKind_ = ValueKind::kInteger;
  else if (streq(field, "complex"))
    valueKind_ = ValueKind::kComplex;
  else
    MLIR_SPARSETENSOR_FATAL("Unexpected header field value in %s\n", filename);
  // Set properties.
  isSymmetric_ = streq(symmetry, "symmetric");
  // Make sure this is a general (or symmetric) sparse matrix.
  if (strne(header, "%%matrixmarket") || strne(object, "matrix") ||
      strne(format, "coordinate") ||
      (strne(symmetry, "general") && !isSymmetric_))
    MLIR_SPARSETENSOR_FATAL("Cannot find a general sparse matrix in %s\n",
                            filename);
  // Skip comments: keep reading until a line that does not start with '%'.
  do {
    readLine();
  } while (line[0] == '%');
  // That line contains M N NNZ.  The `SCNu64` macro is used because the
  // `PRIu64` macro is only specified for the printf family of functions.
  idata[0] = 2; // rank
  if (sscanf(line, "%" SCNu64 "%" SCNu64 "%" SCNu64 "\n", idata + 2, idata + 3,
             idata + 1) != 3)
    MLIR_SPARSETENSOR_FATAL("Cannot find size in %s\n", filename);
}

/// Read the "extended" FROSTT header. Although not part of the documented
/// format, we assume that the file starts with optional comments (lines
/// beginning with '#') followed by two lines that define the rank, the
/// number of nonzeros, and the dimension sizes (one per rank) of the
/// sparse tensor.  On success this fills `idata` with the rank, the number
/// of nonzeros, and the dimension sizes, and sets `valueKind_` to
/// `kUndefined`.  Any missing piece is a fatal error.
void SparseTensorReader::readExtFROSTTHeader() {
  // Skip comments: keep reading until a line that does not start with '#'.
  do {
    readLine();
  } while (line[0] == '#');
  // That line contains RANK and NNZ.  The `SCNu64` macro is used because
  // the `PRIu64` macro is only specified for the printf family of functions.
  if (sscanf(line, "%" SCNu64 "%" SCNu64 "\n", idata, idata + 1) != 2)
    MLIR_SPARSETENSOR_FATAL("Cannot find metadata in %s\n", filename);
  // Followed by a line with the dimension sizes (one per rank).
  // NOTE(review): the rank comes straight from the file and bounds the
  // writes to `idata[2 + r]`; confirm that it is checked against the
  // capacity of `idata` somewhere.
  for (uint64_t r = 0; r < idata[0]; ++r)
    if (fscanf(file, "%" SCNu64, idata + 2 + r) != 1)
      MLIR_SPARSETENSOR_FATAL("Cannot find dimension size %s\n", filename);
  readLine(); // end of line
  // The FROSTT format does not define the data type of the nonzero elements.
  valueKind_ = ValueKind::kUndefined;
}