From 68a8c124e85a0bb6fd6415bf3c0ee69df82a9f0f Mon Sep 17 00:00:00 2001 From: Aart Bik Date: Fri, 24 Jan 2025 14:52:02 -0800 Subject: [PATCH] Implement the () operator on sparse tensors (#837) Note that this is a fully functional "locator" for the () operator that works for *all* versatile sparse tensors. Currently, the operator is defined at sparse_tensor level, but it should be moved into tensor_impl after this. Also, clients should always be aware that the () operator for compressed levels is *not* random-access, but involves a search to find if the element is stored. --- examples/sparse_tensor.cu | 18 ++++++ include/matx/core/make_sparse_tensor.h | 16 +++++- include/matx/core/sparse_tensor.h | 76 ++++++++++++++++++++++++++ include/matx/core/tensor_impl.h | 3 +- 4 files changed, 110 insertions(+), 3 deletions(-) diff --git a/examples/sparse_tensor.cu b/examples/sparse_tensor.cu index 32bc58b8..2ac5cb14 100644 --- a/examples/sparse_tensor.cu +++ b/examples/sparse_tensor.cu @@ -97,6 +97,24 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // print(Acoo); + // + // A very naive way to convert the sparse matrix back to a dense + // matrix. Note that one should **never** use the ()-operator in + // performance critical code, since sparse data structures do + // not provide O(1) random access to their elements (compressed + // levels will use some form of search to determine if an element + // is present). Instead, conversions (and other operations) should + // use sparse operations that are tailored for the sparse data + // structure (such as scanning by row for CSR). 
+ // + tensor_t Dense{{m, n}}; + for (index_t i = 0; i < m; i++) { + for (index_t j = 0; j < n; j++) { + Dense(i, j) = Acoo(i, j); + } + } + print(Dense); + // TODO: operations on Acoo MATX_EXIT_HANDLER(); diff --git a/include/matx/core/make_sparse_tensor.h b/include/matx/core/make_sparse_tensor.h index 3cffa46c..6d81565b 100644 --- a/include/matx/core/make_sparse_tensor.h +++ b/include/matx/core/make_sparse_tensor.h @@ -55,11 +55,23 @@ auto make_tensor_coo(ValTensor &val, CrdTensor &row, CrdTensor &col, static_assert(ValTensor::Rank() == 1 && CrdTensor::Rank() == 1); using VAL = typename ValTensor::value_type; using CRD = typename CrdTensor::value_type; - using POS = int; // no positions, although some forms use [0,nse] + using POS = index_t; + // Note that the COO API typically does not involve positions. + // However, under the formal DSL specifications, the top level + // compression should set up pos[0] = {0, nse}. This is done + // here, using the same memory space as the other data. 
+ POS *ptr; + matxMemorySpace_t space = GetPointerKind(val.GetStorage().data()); + matxAlloc((void **)&ptr, 2 * sizeof(POS), space, 0); + ptr[0] = 0; + ptr[1] = val.Size(0); + raw_pointer_buffer> topp{ptr, 2 * sizeof(POS), + owning}; + basic_storage tp{std::move(topp)}; raw_pointer_buffer> emptyp{nullptr, 0, owning}; basic_storage ep{std::move(emptyp)}; return sparse_tensor_t( - shape, val.GetStorage(), {row.GetStorage(), col.GetStorage()}, {ep, ep}); + shape, val.GetStorage(), {row.GetStorage(), col.GetStorage()}, {tp, ep}); } // Constructs a sparse matrix in CSR format directly from the values, the row diff --git a/include/matx/core/sparse_tensor.h b/include/matx/core/sparse_tensor.h index 473325b0..e0a1e03d 100644 --- a/include/matx/core/sparse_tensor.h +++ b/include/matx/core/sparse_tensor.h @@ -109,6 +109,82 @@ class sparse_tensor_t index_t crdSize(int l) const { return coordinates_[l].size() / sizeof(CRD); } index_t posSize(int l) const { return positions_[l].size() / sizeof(POS); } + // Locates position of an element at given indices, or returns -1 when not + // found. + template + __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t + GetPos(index_t *lvlsz, index_t *lvl, index_t pos) const { + if constexpr (L < LVL) { + using ftype = std::tuple_element_t; + if constexpr (ftype::lvltype == LvlType::Dense) { + // Dense level: pos * size + i. + // TODO: see below, use a constexpr GetLvlSize(L) instead? + const index_t dpos = pos * lvlsz[L] + lvl[L]; + if constexpr (L + 1 < LVL) { + return GetPos(lvlsz, lvl, dpos); + } else { + return dpos; + } + } else if constexpr (ftype::lvltype == LvlType::Singleton) { + // Singleton level: pos if crd[pos] == i and next levels match. 
+ if (this->CRDData(L)[pos] == lvl[L]) { + if constexpr (L + 1 < LVL) { + return GetPos<L + 1>(lvlsz, lvl, pos); + } else { + return pos; + } + } + } else if constexpr (ftype::lvltype == LvlType::Compressed || + ftype::lvltype == LvlType::CompressedNonUnique) { + // Compressed level: scan for match on i and test next levels. + const CRD *c = this->CRDData(L); + const POS *p = this->POSData(L); + for (index_t pp = p[pos], hi = p[pos + 1]; pp < hi; pp++) { + if (c[pp] == lvl[L]) { + if constexpr (L + 1 < LVL) { + const index_t cpos = GetPos<L + 1>(lvlsz, lvl, pp); + if constexpr (ftype::lvltype == LvlType::Compressed) { + return cpos; // always end scan (unique) + } else if (cpos != -1) { + return cpos; // only end scan on success (non-unique) + } + } else { + return pp; + } + } + } + } + } + return -1; // not found + } + + // Element getter (viz. "lhs = Acoo(0,0);"). Note that due to the compact + // nature of sparse data structures, these storage formats do not provide + // cheap random access to their elements. Instead, the implementation will + // search for a stored element at the given position (which involves a scan + // at each compressed level). The implicit value zero is returned when the + // element cannot be found. So, although functional for testing, clients + // should avoid using getters inside performance critical regions, since + // the implementation is far worse than O(1). + template <typename... Is> + __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ VAL + operator()(Is... indices) const noexcept { + static_assert( + sizeof...(Is) == DIM, + "Number of indices of operator() must match rank of sparse tensor"); + cuda::std::array<index_t, DIM> dim{indices...}; + cuda::std::array<index_t, LVL> lvl; + cuda::std::array<index_t, LVL> lvlsz; + TF::dim2lvl(dim.data(), lvl.data(), /*asSize=*/false); + // TODO: only compute once and provide a constexpr LvlSize(l) instead? 
+ TF::dim2lvl(this->Shape().data(), lvlsz.data(), /*asSize=*/true); + const index_t pos = GetPos(lvlsz.data(), lvl.data(), 0); + if (pos != -1) { + return this->Data()[pos]; + } + return static_cast<VAL>(0); // implicit zero + } + private: // Primary storage of sparse tensor (explicitly stored element values). StorageV values_; diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index 4bfe1586..12fff7ce 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -637,7 +637,8 @@ MATX_IGNORE_WARNING_POP_GCC * @return * A shape of the data with the appropriate dimensions set */ - __MATX_INLINE__ auto Shape() const noexcept { return this->desc_.Shape(); } + __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ + auto Shape() const noexcept { return this->desc_.Shape(); } /** * Get the strides the tensor from the underlying data