rerun-io · zehiko · Jan 30, 2025 · Jan 24, 2025 · Jan 24, 2025 · Jan 24, 2025
diff --git a/crates/store/re_protos/proto/rerun/v0/common.proto b/crates/store/re_protos/proto/rerun/v0/common.proto
@@ -102,7 +102,6 @@ message Query {
     SparseFillStrategy sparse_fill_strategy = 11;
 }
 
-
 message ColumnSelection {
     repeated ColumnSelector columns = 1;
 }

diff --git a/crates/store/re_protos/proto/rerun/v0/remote_store.proto b/crates/store/re_protos/proto/rerun/v0/remote_store.proto
@@ -9,6 +9,17 @@ service StorageNode {
     rpc Query(QueryRequest) returns (stream DataframePart) {}
     rpc FetchRecording(FetchRecordingRequest) returns (stream rerun.common.v0.RerunChunk) {}
 
+    rpc CreateCollectionIndex(CreateCollectionIndexRequest) returns (CreateCollectionIndexResponse) {}
+    // The collection index query response is a RecordBatch with 3 columns:
+    // - 'resource_id' column with the id of the resource
+    // - timepoint column with the values representing the points in time
+    // where the index query matches. Which time points are matched depends on the type of
+    // index that is queried. For example, for vector search it might be timepoints where
+    // top-K matches are found within *each* resource in the collection. For an inverted index
+    // it might be timepoints where the query string is found in the indexed column
+    // - 'data' column with the data that is returned for the matched timepoints
+    rpc QueryCollectionIndex(QueryCollectionIndexRequest) returns (stream DataframePart) {}
+
     // metadata API calls
     rpc QueryCatalog(QueryCatalogRequest) returns (stream DataframePart) {}
     rpc UpdateCatalog(UpdateCatalogRequest) returns (UpdateCatalogResponse) {}
@@ -32,6 +43,115 @@ message DataframePart {
     bytes payload = 1000;
 }
 
+// ---------------- CreateCollectionIndex ------------------
+
+// Identifies the column that an index is created on or queried against.
+message IndexColumn {
+    // The path of the entity.
+    rerun.common.v0.EntityPath entity_path = 1;
+    // Optional name of the `Archetype` associated with this data.
+    optional string archetype_name = 2;
+    // Optional name of the field within the `Archetype` associated with this data.
+    optional string archetype_field_name = 3;
+    // Semantic name of the component associated with this data.
+    string component_name = 4;
+}
+
+message CreateCollectionIndexRequest { // Request message of the `CreateCollectionIndex` RPC.
+    // Which collection we want to create the index for.
+    Collection collection = 1;
+    // What kind of index we want to create, and what its
+    // index-specific properties are.
+    IndexProperties properties = 2;
+    // Component / column we want to index.
+    IndexColumn column = 3;
+    // The filter index, i.e. the timeline for which we
+    // will query the timepoints.
+    // TODO(zehiko) this might go away and we might just index
+    // across all the timelines
+    rerun.common.v0.IndexColumnSelector time_index = 4;
+}
+
+message IndexProperties { // Index-type-specific settings used when creating an index.
+    oneof props { // At most one of the variants below is set.
+        InvertedIndex inverted = 1; // Settings for an inverted index.
+        VectorIvfPqIndex vector = 2; // Settings for a vector index.
+        BTreeIndex btree = 3; // Settings for a B-tree index.
+    }
+}
+
+message InvertedIndex { // Creation settings for an inverted index.
+    bool store_position = 1; // NOTE(review): presumably whether token positions are stored; confirm.
+    string base_tokenizer = 2; // NOTE(review): presumably the name of the tokenizer to use; confirm.
+    // TODO(zehiko) add other properties as needed
+}
+
+message VectorIvfPqIndex { // Creation settings for a vector index.
+    uint32 num_partitions = 1; // NOTE(review): presumably the number of IVF partitions; confirm.
+    uint32 num_sub_vectors = 2; // NOTE(review): presumably the number of PQ sub-vectors; confirm.
+    VectorDistanceMetric distance_metrics = 3; // Holds a single metric despite the plural field name.
+}
+
+enum VectorDistanceMetric { // Distance metrics selectable for a vector index.
+    L2 = 0; // Zero value, so this is what an unset field of this enum type reads as.
+    COSINE = 1;
+    DOT = 2;
+    HAMMING = 3;
+}
+
+message BTreeIndex { // Creation settings for a B-tree index; currently has no fields.
+    // TODO(zehiko) add properties as needed
+}
+
+message CreateCollectionIndexResponse { // Response message of the `CreateCollectionIndex` RPC.
+    uint64 indexed_rows = 1; // NOTE(review): presumably the number of rows that were indexed; confirm.
+}
+
+
+// ---------------- QueryCollectionIndex ------------------
+
+message QueryCollectionIndexRequest { // Request message of the `QueryCollectionIndex` RPC.
+    // Collection we want to run the query against.
+    // If not specified, the default collection is queried.
+    Collection collection = 1;
+    // Index column that is queried.
+    IndexColumn column = 2;
+    // Query data - the type of data is index specific. The caller must ensure
+    // to provide the right type. For vector search this should
+    // be a vector of appropriate size, for inverted index this should be a string.
+    // Query data is represented as a unit (single row) RecordBatch with 1 column.
+    DataframePart query = 3;
+    // Index-type-specific properties.
+    IndexQueryProperties properties = 4;
+    // Max number of rows to be returned.
+    optional uint32 limit = 5;
+}
+
+message IndexQueryProperties { // Index-type-specific settings used when querying an index.
+    // Specific index query properties based on the index type.
+    oneof props { // At most one of the variants below is set.
+        InvertedIndexQuery inverted = 1; // Query settings for an inverted index.
+        VectorIndexQuery vector = 2; // Query settings for a vector index.
+        BTreeIndexQuery btree = 3; // Query settings for a B-tree index.
+    }
+}
+
+message InvertedIndexQuery { // Query settings for an inverted index; currently has no fields.
+    // TODO(zehiko) add properties as needed
+}
+
+message VectorIndexQuery { // Query settings for a vector index.
+    uint32 top_k = 2; // Number of top matches (K) requested. NOTE(review): field number 1 is unused and not reserved.
+}
+
+message BTreeIndexQuery { // Query settings for a B-tree index; currently has no fields.
+    // TODO(zehiko) add properties as needed
+}
+
+message Collection { // Refers to a collection by name.
+    string name = 1; // Name of the collection.
+}
+
 // ---------------- GetRecordingSchema ------------------
 
 message GetRecordingSchemaRequest {
-Original file line number
+Diff line change
@@ Expand Up / @@ -102,7 +102,6 @@ message Query { @@
         SparseFillStrategy sparse_fill_strategy = 11;
     }
     message ColumnSelection {
         repeated ColumnSelector columns = 1;
     }
@@ Expand Down @@