Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

initial gRPC spec changes for supporting index creation and querying #8829

Merged
merged 12 commits into from
Jan 30, 2025
1 change: 0 additions & 1 deletion crates/store/re_protos/proto/rerun/v0/common.proto
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ message Query {
SparseFillStrategy sparse_fill_strategy = 11;
}


// Wraps a list of column selectors.
message ColumnSelection {
// The selected columns, one ColumnSelector per entry.
repeated ColumnSelector columns = 1;
}
Expand Down
120 changes: 120 additions & 0 deletions crates/store/re_protos/proto/rerun/v0/remote_store.proto
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,17 @@ service StorageNode {
rpc Query(QueryRequest) returns (stream DataframePart) {}
rpc FetchRecording(FetchRecordingRequest) returns (stream rerun.common.v0.RerunChunk) {}

rpc CreateCollectionIndex(CreateCollectionIndexRequest) returns (CreateCollectionIndexResponse) {}
// Collection index query response is a RecordBatch with 3 columns:
// - 'resource_id' column with the id of the resource
// - timepoint column with the values representing the points in time
// where index query matches. What time points are matched depends on the type of
// index that is queried. For example for vector search it might be timepoints where
// top-K matches are found within *each* resource in the collection. For inverted index
// it might be timepoints where the query string is found in the indexed column
// - 'data' column with the data that is returned for the matched timepoints
rpc QueryCollectionIndex(QueryCollectionIndexRequest) returns (stream DataframePart) {}

// metadata API calls
rpc QueryCatalog(QueryCatalogRequest) returns (stream DataframePart) {}
rpc UpdateCatalog(UpdateCatalogRequest) returns (UpdateCatalogResponse) {}
Expand All @@ -32,6 +43,115 @@ message DataframePart {
bytes payload = 1000;
}

// ---------------- CreateCollectionIndex ------------------

// Identifies the column an index is built on or queried against.
// Referenced by both CreateCollectionIndexRequest and
// QueryCollectionIndexRequest.
message IndexColumn {
// The path of the entity.
rerun.common.v0.EntityPath entity_path = 1;
// Optional name of the `Archetype` associated with this data.
optional string archetype_name = 2;
// Optional name of the field within `Archetype` associated with this data.
optional string archetype_field_name = 3;
// Semantic name associated with this data.
// Unlike the two archetype fields above, this one is not declared `optional`.
string component_name = 4;
}

// Request message for the CreateCollectionIndex RPC.
message CreateCollectionIndexRequest {
// Collection to create the index for.
Collection collection = 1;
// Kind of index to create, together with its
// index-specific properties.
IndexProperties properties = 2;
// Component / column to index.
IndexColumn column = 3;
// The filter index, i.e. the timeline for which
// the timepoints will be queried.
// TODO(zehiko) this might go away and we might just index
// across all the timelines
rerun.common.v0.IndexColumnSelector time_index = 4;
}

// Selects the kind of index to create and carries the settings specific
// to that kind. Used as CreateCollectionIndexRequest.properties.
message IndexProperties {
// Exactly one variant can be set at a time; setting one clears the others.
oneof props {
// Create an inverted index.
InvertedIndex inverted = 1;
// Create a vector index.
VectorIvfPqIndex vector = 2;
// Create a B-tree index.
BTreeIndex btree = 3;
}
}

// Settings for an inverted index (the `inverted` variant of IndexProperties).
message InvertedIndex {
// NOTE(review): presumably controls whether token positions are stored
// in the index; semantics are not defined in this file, confirm.
bool store_position = 1;
// NOTE(review): presumably the name of the tokenizer to use; the set of
// accepted values is not defined in this file, confirm.
string base_tokenizer = 2;
// TODO(zehiko) add other properties as needed
}

// Settings for a vector index (the `vector` variant of IndexProperties).
// NOTE(review): the name suggests an IVF-PQ style index; confirm.
message VectorIvfPqIndex {
// NOTE(review): presumably the number of partitions the index is split
// into; valid range is not defined in this file, confirm.
uint32 num_partitions = 1;
// NOTE(review): presumably the number of sub-vectors each vector is
// divided into; valid range is not defined in this file, confirm.
uint32 num_sub_vectors = 2;
// Distance metric for this index. Despite the plural field name this is
// a single VectorDistanceMetric value, not a repeated field.
VectorDistanceMetric distance_metrics = 3;
}

// Distance metrics that can be selected for a vector index
// (see VectorIvfPqIndex.distance_metrics).
//
// NOTE(review): the values carry no type-name prefix and there is no
// dedicated "unspecified" zero value. Renaming or renumbering them would
// break generated code and existing data, so they are left as-is.
enum VectorDistanceMetric {
// First (zero) value, so a field left unset reads as L2.
L2 = 0;
COSINE = 1;
DOT = 2;
HAMMING = 3;
}

// Settings for a B-tree index (the `btree` variant of IndexProperties).
// Currently declares no fields.
message BTreeIndex {
// TODO(zehiko) add properties as needed
}

// Response message for the CreateCollectionIndex RPC.
message CreateCollectionIndexResponse {
// NOTE(review): presumably the number of rows that were indexed by the
// request; exact counting semantics are not defined in this file, confirm.
uint64 indexed_rows = 1;
}


// ---------------- QueryCollectionIndex ------------------

// Request message for the QueryCollectionIndex RPC.
message QueryCollectionIndexRequest {
  // Collection to run the query against.
  // If not specified, the default collection is queried.
  Collection collection = 1;
  // Index column that is queried.
  IndexColumn column = 2;
  // Query data - type of data is index specific. Caller must ensure
  // to provide the right type. For vector search this should
  // be a vector of appropriate size, for inverted index this should be a string.
  // Query data is represented as a unit (single row) RecordBatch with 1 column.
  DataframePart query = 3;
  // Index type specific query properties.
  IndexQueryProperties properties = 4;
  // Max number of rows to be returned. Declared `optional`, so an absent
  // limit can be told apart from an explicit value.
  optional uint32 limit = 5;
}

// Query-time settings that depend on the type of index being queried.
// Used as QueryCollectionIndexRequest.properties.
message IndexQueryProperties {
// specific index query properties based on the index type
// Exactly one variant can be set at a time; setting one clears the others.
oneof props {
// Properties for querying an inverted index.
InvertedIndexQuery inverted = 1;
// Properties for querying a vector index.
VectorIndexQuery vector = 2;
// Properties for querying a B-tree index.
BTreeIndexQuery btree = 3;
}
}

// Query properties for an inverted index (the `inverted` variant of
// IndexQueryProperties). Currently declares no fields.
message InvertedIndexQuery {
// TODO(zehiko) add properties as needed
}

// Query properties for a vector index (the `vector` variant of
// IndexQueryProperties).
message VectorIndexQuery {
// NOTE(review): presumably the number of top matches to return; whether
// this applies per resource or overall is not defined here, confirm.
// This field uses number 2; number 1 is not used by this message. The
// number identifies the field on the wire, so it must not be changed.
uint32 top_k = 2;
}

// Query properties for a B-tree index (the `btree` variant of
// IndexQueryProperties). Currently declares no fields.
message BTreeIndexQuery {
// TODO(zehiko) add properties as needed
}

// Identifies a collection by name. Used by CreateCollectionIndexRequest
// and QueryCollectionIndexRequest to pick the target collection.
message Collection {
// Name of the collection.
string name = 1;
}

// ---------------- GetRecordingSchema ------------------

message GetRecordingSchemaRequest {
Expand Down
Loading