Berserk Docs

Meta gRPC API

Catalog management, ingestion, merge tasks, and cluster operations

The meta service exposes a gRPC API on port 9560 (configurable via META_BIND). It manages all cluster metadata: databases, tables, views, ingestion streams, merge tasks, and segment lifecycle. gRPC reflection is enabled for tooling support.

Services Overview

| Service | Description |
| --- | --- |
| DatabaseService | Manages top-level databases (the unit that owns tables and views). |
| TableService | Manages tables within a database, including sharding fields, retention, and merge-span configuration. |
| ViewService | Manages named, database-scoped unions over tables. A view name lowers to the same plan shape as an explicit union T1, T2, ... in KQL. |
| ClusterService | Cluster membership registry — meta tracks live cluster members for leader-election and routing. |
| IngestionService | Authenticates ingest tokens and manages collector streams. |
| IngestTokenService | Manages ingest tokens for authentication and signal routing. |
| NurseryService | Manages ingest stream lifecycle, merged segment registration, and offset tracking. |
| MergeTaskService | Coordinates segment merge tasks between the janitor and merge workers. |
| SegmentDeletionService | Manages cleanup of tombstoned segments from storage. |
| SegmentLookupService | Discovers segments matching a time range and table filter. |
| JanitorService | Provides cluster statistics and merge task management for maintenance operations. |

DatabaseService

Manages top-level databases (the unit that owns tables and views).

RPCs

| RPC | Description |
| --- | --- |
| CreateDatabase | Create a new database |
| ListDatabases | List all databases |
| DeleteDatabase | Tombstone a database. Without cascade, rejects when any live table / view / ingest token still lives in the database. With cascade, tombstones every catalog row in the database in one transaction. |
| ResurrectDatabase | Lift the tombstone on a database and every currently-tombstoned child within the grace window. |
| PreviewDeleteDatabase | Live children a cascade DeleteDatabase would tombstone. Read-only and advisory (children may change before the delete). |

Proto Definition

// Manages top-level databases (the unit that owns tables and views).
service DatabaseService {
  // Creates a new database.
  rpc CreateDatabase(CreateDatabaseRequest) returns (CreateDatabaseResponse);
  // Lists all databases.
  rpc ListDatabases(ListDatabasesRequest) returns (ListDatabasesResponse);
  // Tombstones a database. Without cascade, rejects if any live child
  // (table/view/token) remains; tombstoned children do not block. With
  // cascade, tombstones every catalog row in the database in one transaction.
  rpc DeleteDatabase(DeleteDatabaseRequest) returns (DeleteDatabaseResponse);
  // Lifts the tombstone on a database and every currently-tombstoned child
  // within the grace window.
  rpc ResurrectDatabase(ResurrectDatabaseRequest) returns (ResurrectDatabaseResponse);
  // Live children a cascade DeleteDatabase would tombstone. Read-only and advisory (children may change before the delete).
  rpc PreviewDeleteDatabase(PreviewDeleteDatabaseRequest) returns (PreviewDeleteDatabaseResponse);
}

// A top-level database catalog row.
message Database {
  // Identifier of the database (the value other RPCs take as `database_id`).
  string id = 1;
  // Name of the database.
  string name = 2;
  // Absent for live rows.
  Tombstone tombstone = 3;
}

// Request for DatabaseService.CreateDatabase.
message CreateDatabaseRequest {
  // Name of the database to create.
  string name = 1;
}

// Response for DatabaseService.CreateDatabase.
message CreateDatabaseResponse {
  // The newly created database.
  Database database = 1;
}

// Request for DatabaseService.ListDatabases.
message ListDatabasesRequest {
  // Presumably also returns tombstoned databases when true (inferred from
  // the name; confirm).
  bool include_deleted = 1;
}

// Response for DatabaseService.ListDatabases.
message ListDatabasesResponse {
  // The listed databases.
  repeated Database items = 1;
}

// Request for DatabaseService.DeleteDatabase (tombstones the database).
message DeleteDatabaseRequest {
  // Database to tombstone.
  string database_id = 1;
  // When false, the delete is rejected while any live table / view / ingest
  // token remains in the database. When true, every catalog row in the
  // database is tombstoned in one transaction.
  bool cascade = 2;
  // Grace window zero; bytes still bound by the segment tombstone_min_age floor (~12 min).
  bool immediate = 3;
}

// Response for DatabaseService.DeleteDatabase.
message DeleteDatabaseResponse {
  // The database targeted by the delete (presumably now carrying its
  // tombstone; confirm).
  Database database = 1;
}

// Request for DatabaseService.ResurrectDatabase.
message ResurrectDatabaseRequest {
  // Tombstoned database to resurrect.
  string database_id = 1;
}

// Response for DatabaseService.ResurrectDatabase.
message ResurrectDatabaseResponse {
  // The resurrected database.
  Database database = 1;
}

// Request for DatabaseService.PreviewDeleteDatabase (read-only).
message PreviewDeleteDatabaseRequest {
  // Database whose live children should be previewed.
  string database_id = 1;
}

// One live child (id + display name) a cascade delete would tombstone.
message DeletableChild {
  // Identifier of the child row.
  string id = 1;
  // Display name of the child row.
  string name = 2;
}

// Response for DatabaseService.PreviewDeleteDatabase: the live children a
// cascade delete would tombstone, grouped by kind. Advisory only — children
// may change before the delete runs.
message PreviewDeleteDatabaseResponse {
  // Live tables in the database.
  repeated DeletableChild tables = 1;
  // Live views in the database.
  repeated DeletableChild views = 2;
  // Live ingest tokens in the database.
  repeated DeletableChild ingest_tokens = 3;
  // Grace window (seconds) a non-immediate delete schedules before the
  // database is permanently removed — meta's effective value, so the UI
  // need not hardcode it.
  int64 grace_window_seconds = 4;
}

TableService

Manages tables within a database, including sharding fields, retention, and merge-span configuration.

RPCs

| RPC | Description |
| --- | --- |
| ListTables | List tables in a database |
| CreateTable | Create a new table |
| GetTable | Get table by ID |
| DeleteTable | Tombstone a table. Rejects when live views or live ingest tokens still reference the table, returning a structured inventory of the blockers. |
| ResurrectTable | Lift the tombstone on a table within the grace window. |
| SetShardingFields | Configure table sharding fields |
| GetShardingFields | Get table sharding configuration |
| SetTableRetention | Set per-table retention (unset clears the override) |
| SetTableMaxMergeTimeSpan | Set the per-table merge-span cap (unset reverts to retention-derived default) |

Proto Definition

// Manages tables within a database, including sharding fields, retention,
// and merge-span configuration.
//
// All RPCs are database-scoped via an explicit `database_id`. Writes are
// keyed on `(database_id, name)` to match the table's per-database
// uniqueness; id-keyed reads use `(database_id, table_id)` and the
// server verifies membership.
service TableService {
  // Lists tables in a database.
  rpc ListTables(ListTablesRequest) returns (ListTablesResponse);

  // Table CRUD
  // Creates a new table.
  rpc CreateTable(CreateTableRequest) returns (CreateTableResponse);
  // Gets a table by ID.
  rpc GetTable(GetTableRequest) returns (GetTableResponse);
  // Tombstones a table. Rejects with a TableReferences inventory when any live view or ingest token still references the table.
  rpc DeleteTable(DeleteTableRequest) returns (DeleteTableResponse);
  // Lifts the tombstone on a table within the grace window.
  rpc ResurrectTable(ResurrectTableRequest) returns (ResurrectTableResponse);

  // Table Sharding Fields
  // Configures the table's sharding fields.
  rpc SetShardingFields(SetShardingFieldsRequest) returns (SetShardingFieldsResponse);
  // Gets the table's sharding configuration.
  rpc GetShardingFields(GetShardingFieldsRequest) returns (GetShardingFieldsResponse);

  // Table Retention & Merge Span
  // Sets per-table retention (unset clears the override).
  rpc SetTableRetention(SetTableRetentionRequest) returns (SetTableRetentionResponse);
  // Sets the per-table merge-span cap (unset reverts to the
  // retention-derived default).
  rpc SetTableMaxMergeTimeSpan(SetTableMaxMergeTimeSpanRequest) returns (SetTableMaxMergeTimeSpanResponse);
}

// region: Common Types

// Value type of a Column.
//
// NOTE(review): the zero value is a real type (BQL_TYPE_BOOL) rather than an
// `_UNSPECIFIED` sentinel, so under proto3 an unset `type` field is
// indistinguishable from BOOL. Renumbering would break the wire format, so
// the numbers must stay as they are; readers should be aware of the ambiguity.
enum BqlType {
  BQL_TYPE_BOOL = 0;
  BQL_TYPE_INT = 1;
  BQL_TYPE_LONG = 2;
  BQL_TYPE_REAL = 3;
  BQL_TYPE_STRING = 4;
  BQL_TYPE_DATETIME = 5;
  BQL_TYPE_TIMESPAN = 6;
  BQL_TYPE_GUID = 7;
  BQL_TYPE_DYNAMIC = 8;
}

// A named, typed column.
message Column {
  // Column name.
  string name = 1;
  // Column value type.
  BqlType type = 2;
}

// endregion

// region: Table Messages

// Request for TableService.CreateTable.
message CreateTableRequest {
  // Database that will own the table.
  string database_id = 1;
  // Table name; uniqueness is per database.
  string name = 2;
}

// Response for TableService.CreateTable: identity of the created table.
message CreateTableResponse {
  // Identifier of the new table.
  string id = 1;
  // Name of the new table.
  string name = 2;
  // Database that owns the new table.
  string database_id = 3;
}

// Request for TableService.GetTable. The read is keyed on
// `(database_id, table_id)` and the server verifies membership.
message GetTableRequest {
  // Database expected to own the table.
  string database_id = 1;
  // Table to fetch.
  string table_id = 2;
  // Presumably allows a tombstoned table to be returned when true (inferred
  // from the name; confirm).
  bool include_deleted = 3;
}

// Response for TableService.GetTable. Carries the same fields, with the same
// field numbers, as `Table`.
message GetTableResponse {
  // Identifier of the table.
  string id = 1;
  // Name of the table.
  string name = 2;
  // Database that owns the table.
  string database_id = 3;
  // Retention in nanoseconds. Unset = no retention configured.
  optional int64 retention_ns = 4;
  // Explicit per-table merge span cap in nanoseconds. Unset = derive from
  // retention (or fall back to the global cap).
  optional int64 max_merge_time_span_ns = 5;
  // Absent for live rows.
  Tombstone tombstone = 6;
}

// Request for TableService.ListTables.
message ListTablesRequest {
  // Database whose tables are listed.
  string database_id = 1;
  // Presumably also returns tombstoned tables when true (inferred from the
  // name; confirm).
  bool include_deleted = 2;
}

// Response for TableService.ListTables.
message ListTablesResponse {
  // The listed tables.
  repeated Table items = 1;
  // Grace window (seconds) a non-immediate table delete schedules — meta's
  // effective TABLE_GRACE, so the UI need not hardcode it.
  int64 grace_window_seconds = 2;
}

// A table catalog row.
message Table {
  // Identifier of the table.
  string id = 1;
  // Name of the table; uniqueness is per database.
  string name = 2;
  // Database that owns the table.
  string database_id = 3;
  // Per-table retention in nanoseconds. Unset = no retention configured.
  optional int64 retention_ns = 4;
  // Per-table merge-span override in nanoseconds. Unset = no override (the
  // planner falls back to the retention formula).
  optional int64 max_merge_time_span_ns = 5;
  // Absent for live rows.
  Tombstone tombstone = 6;
}

// Request for TableService.DeleteTable (tombstones the table).
message DeleteTableRequest {
  // Database expected to own the table.
  string database_id = 1;
  // Table to tombstone.
  string table_id = 2;
  // Grace window zero; bytes still bound by the segment tombstone_min_age floor (~12 min).
  bool immediate = 3;
}

// Response for TableService.DeleteTable: either the tombstoned table or the
// inventory of references that blocked the delete.
message DeleteTableResponse {
  oneof outcome {
    // The delete succeeded; the table that was tombstoned.
    Table tombstoned = 1;
    // Live views/tokens still referencing the table; tombstoned references are omitted (they do not block).
    TableReferences blocked = 2;
  }
}

// Inventory of catalog rows that reference a table; returned by DeleteTable
// when the delete is blocked.
message TableReferences {
  // Views that reference the table.
  repeated ViewReference views = 1;
  // Ingest tokens that route at least one signal to the table.
  repeated IngestTokenReference ingest_tokens = 2;
}

// Request for TableService.ResurrectTable.
message ResurrectTableRequest {
  // Database expected to own the table.
  string database_id = 1;
  // Tombstoned table to resurrect.
  string table_id = 2;
}

// Response for TableService.ResurrectTable.
message ResurrectTableResponse {
  // The resurrected table.
  Table table = 1;
}

// A view that references the target table (an entry of TableReferences).
message ViewReference {
  // Identifier of the view.
  string id = 1;
  // Name of the view.
  string name = 2;
  // Other tables this view spans; excludes the target table.
  repeated TableRef other_members = 3;
  // Absent for live rows.
  Tombstone tombstone = 4;
}

// An ingest token that routes to the target table (an entry of
// TableReferences).
message IngestTokenReference {
  // Identifier of the ingest token.
  string id = 1;
  // Name of the ingest token.
  string name = 2;
  // Signal columns ("traces"/"logs"/"metrics") of this token pointing at the target table.
  repeated string signals = 3;
  // Absent for live rows.
  Tombstone tombstone = 4;
}

// Lightweight reference to a table (id + name); used for
// ViewReference.other_members.
message TableRef {
  // Identifier of the table.
  string id = 1;
  // Name of the table.
  string name = 2;
}

// Request for TableService.SetShardingFields.
message SetShardingFieldsRequest {
  // Database expected to own the table.
  string database_id = 1;
  // Table whose sharding fields are configured.
  string table_id = 2;
  // Sharding fields to configure. NOTE(review): whether this replaces or
  // merges with the existing set is not visible here — confirm.
  repeated ShardingField fields = 3;
}

// Response for TableService.SetShardingFields.
message SetShardingFieldsResponse {
  // Presumably the table's sharding fields after the update — confirm.
  repeated ShardingField fields = 1;
}

// Request for TableService.GetShardingFields.
message GetShardingFieldsRequest {
  // Database expected to own the table.
  string database_id = 1;
  // Table whose sharding configuration is fetched.
  string table_id = 2;
  // Presumably allows reading a tombstoned table when true (inferred from
  // the name; confirm).
  bool include_deleted = 3;
}

// Response for TableService.GetShardingFields.
message GetShardingFieldsResponse {
  // The table's sharding fields.
  repeated ShardingField fields = 1;
}

// Set per-table retention. Unset `retention_ns` clears the setting
// (reverts to "no retention configured").
message SetTableRetentionRequest {
  // Database expected to own the table.
  string database_id = 1;
  // Table whose retention is set.
  string table_id = 2;
  // New retention in nanoseconds; leave unset to clear it.
  optional int64 retention_ns = 3;
}

// Response for TableService.SetTableRetention.
message SetTableRetentionResponse {
  // Presumably the retention (nanoseconds) now in effect; unset when no
  // retention is configured — confirm.
  optional int64 retention_ns = 1;
}

// Set the per-table merge-span override. Unset `max_merge_time_span_ns`
// clears the override (planner falls back to the retention formula).
message SetTableMaxMergeTimeSpanRequest {
  // Database expected to own the table.
  string database_id = 1;
  // Table whose merge-span cap is set.
  string table_id = 2;
  // New merge-span cap in nanoseconds; leave unset to clear the override.
  optional int64 max_merge_time_span_ns = 3;
}

// Response for TableService.SetTableMaxMergeTimeSpan.
message SetTableMaxMergeTimeSpanResponse {
  // Presumably the merge-span override (nanoseconds) now in effect; unset
  // when there is no override — confirm.
  optional int64 max_merge_time_span_ns = 1;
}

// endregion

ViewService

Manages named, database-scoped unions over tables. A view name lowers to the same plan shape as an explicit union T1, T2, ... in KQL.

RPCs

| RPC | Description |
| --- | --- |
| CreateView | Create a new view |
| GetView | Get view by ID |
| ListViews | List views in a database |
| SetViewTables | Atomic full replacement of a view's member tables |
| DeleteView | Tombstone a view. Views always use the 5-minute SHORT_GRACE (no immediate flag). |
| ResurrectView | Lift the tombstone on a view within the grace window. |
| LookupTableOrView | Database-scoped name resolution returning either a table or a view (used by the binder) |

Proto Definition

// ViewService manages named, database-scoped unions over tables. A view is
// "this database has a thing called `MyView` that means: union over
// T1, T2, T3" — the engine binder lowers a view-named reference into the
// same plan shape as an explicit `union T1, T2, T3` in KQL.
//
// All RPCs are database-scoped via an explicit `database_id`. Cross-type
// name collisions (a table and a view sharing a name in the same
// database) are rejected by CreateView / CreateTable at the service
// layer. Views are removed with DeleteView (a tombstone with a short
// grace window) and can be restored with ResurrectView.
service ViewService {
  // Creates a new view.
  rpc CreateView(CreateViewRequest) returns (CreateViewResponse);
  // Gets a view by ID.
  rpc GetView(GetViewRequest) returns (GetViewResponse);
  // Lists views in a database.
  rpc ListViews(ListViewsRequest) returns (ListViewsResponse);
  // Atomic full replacement of the view's member list. Server rejects
  // an empty `table_ids` (a view with no members has no defined
  // schema and can't be lowered to a plan).
  rpc SetViewTables(SetViewTablesRequest) returns (SetViewTablesResponse);
  // Tombstones a view.
  // Short default grace (5 min): views hold no recoverable data, so the window is only fat-finger insurance.
  rpc DeleteView(DeleteViewRequest) returns (DeleteViewResponse);
  // Lifts the tombstone on a view within the grace window.
  rpc ResurrectView(ResurrectViewRequest) returns (ResurrectViewResponse);
  // Database-scoped name resolution for the engine binder. Returns
  // either a Table or a View. Replaces the unscoped
  // `list_tables() + find_by_name` fallback that lived in the binder
  // before PR 4b. Cross-type collisions are blocked at write time, so
  // a name resolves to exactly one variant.
  rpc LookupTableOrView(LookupTableOrViewRequest) returns (LookupTableOrViewResponse);
}

// A view catalog row: a named union over member tables.
message View {
  // Identifier of the view.
  string id = 1;
  // Name of the view.
  string name = 2;
  // Optional free-form description.
  optional string description = 3;
  // Ordered list of member tables (by `ordinal`). The order is
  // stable across `SetViewTables` and surfaces directly in the plan.
  // NOTE(review): `ViewMember` carries no `ordinal` field; presumably the
  // ordinal is a storage detail and position in this list conveys the
  // order — confirm.
  repeated ViewMember members = 4;
  // Absent for live rows.
  Tombstone tombstone = 5;
}

// One member of a view's union. Carries both id and name so the
// binder can resolve without a second round-trip.
message ViewMember {
  // Identifier of the member table.
  string id = 1;
  // Name of the member table.
  string name = 2;
}

// Request for ViewService.CreateView.
message CreateViewRequest {
  // Database that will own the view.
  string database_id = 1;
  // View name. Rejected if a table with the same name exists in the same
  // database (cross-type collisions are blocked at the service layer).
  string name = 2;
  // Ordered list of member table UUIDs. Must be non-empty.
  repeated string table_ids = 3;
  // Optional free-form description.
  optional string description = 4;
}

// Response for ViewService.CreateView.
message CreateViewResponse {
  // The newly created view.
  View view = 1;
}

// Request for ViewService.GetView.
//
// Note: fields are declared out of field-number order (`view_id` is 1,
// `database_id` is 2). The numbers are the wire contract — do not renumber.
message GetViewRequest {
  // Database that owns the view. Required. Server rejects with
  // INVALID_ARGUMENT if `database_id` does not own `view_id`.
  string database_id = 2;
  // View to fetch.
  string view_id = 1;
  // Presumably allows a tombstoned view to be returned when true (inferred
  // from the name; confirm).
  bool include_deleted = 3;
}

// Response for ViewService.GetView.
message GetViewResponse {
  // The requested view.
  View view = 1;
}

// Request for ViewService.ListViews.
message ListViewsRequest {
  // Database whose views are listed.
  string database_id = 1;
  // Presumably also returns tombstoned views when true (inferred from the
  // name; confirm).
  bool include_deleted = 2;
}

// Response for ViewService.ListViews.
message ListViewsResponse {
  // The listed views.
  repeated View items = 1;
}

// Request for ViewService.SetViewTables: replaces the view's whole member
// list atomically.
//
// Note: fields are declared out of field-number order (`view_id` is 1,
// `table_ids` is 2, `database_id` is 3). Do not renumber.
message SetViewTablesRequest {
  // Database that owns the view. Required. Server rejects with
  // INVALID_ARGUMENT if `database_id` does not own `view_id`.
  string database_id = 3;
  // View whose member list is replaced.
  string view_id = 1;
  // Atomic full replacement. Must be non-empty.
  repeated string table_ids = 2;
}

// Response for ViewService.SetViewTables.
message SetViewTablesResponse {
  // The view targeted by the update (presumably with its new member list;
  // confirm).
  View view = 1;
}

// Request for ViewService.DeleteView. There is no `immediate` flag: views
// always use the short 5-minute grace window.
message DeleteViewRequest {
  // Database expected to own the view.
  string database_id = 1;
  // View to tombstone.
  string view_id = 2;
}

// Response for ViewService.DeleteView.
message DeleteViewResponse {
  // The view targeted by the delete (presumably now carrying its tombstone;
  // confirm).
  View view = 1;
}

// Request for ViewService.ResurrectView.
message ResurrectViewRequest {
  // Database expected to own the view.
  string database_id = 1;
  // Tombstoned view to resurrect.
  string view_id = 2;
}

// Response for ViewService.ResurrectView.
message ResurrectViewResponse {
  // The resurrected view.
  View view = 1;
}

// Request for ViewService.LookupTableOrView: resolves `name` inside one
// database to either a table or a view.
message LookupTableOrViewRequest {
  // UUID of the database to resolve `name` against. Callers are
  // expected to have already resolved the database.
  // NOTE(review): this comment used to say "via `GetDatabase`", but
  // DatabaseService defines no such RPC — presumably callers resolve via
  // `ListDatabases`; confirm.
  string database_id = 1;
  // Table or view name to resolve.
  string name = 2;
  // Presumably also resolves tombstoned rows when true (inferred from the
  // name; confirm).
  bool include_deleted = 3;
}

// Result of a `LookupTableOrView`: the name resolves to exactly one of
// a Table or a View. Caller can match on the `kind` oneof to dispatch.
message LookupTableOrViewResponse {
  oneof kind {
    // The name resolved to a table.
    LookupTable table = 1;
    // The name resolved to a view.
    LookupView view = 2;
  }
}

// Table variant of LookupTableOrViewResponse.
message LookupTable {
  // Identifier of the resolved table.
  string id = 1;
  // Name of the resolved table.
  string name = 2;
}

// View variant of LookupTableOrViewResponse.
message LookupView {
  // Identifier of the resolved view.
  string id = 1;
  // Name of the resolved view.
  string name = 2;
  // Member tables of the view (id + name each).
  repeated ViewMember members = 3;
}

ClusterService

Cluster membership registry — meta tracks live cluster members for leader-election and routing.

RPCs

| RPC | Description |
| --- | --- |
| RegisterMember | Register the calling node and refresh its heartbeat |
| ListMembers | List currently-registered cluster members |

Proto Definition

// Cluster membership registry.
//
// Ephemeral, in-memory registry of long-lived services (nursery, etc).
// Members heartbeat periodically; meta evicts stale entries on TTL.
// On meta restart the registry is empty — services re-register on next heartbeat.
service ClusterService {
  // Register a member and refresh its heartbeat in one call. Idempotent;
  // repeated calls update last_heartbeat. Members should call this
  // periodically (e.g. every 5s) to remain in the registry.
  rpc RegisterMember(RegisterMemberRequest) returns (RegisterMemberResponse);

  // List members of a given kind that are currently registered (not evicted).
  rpc ListMembers(ListMembersRequest) returns (ListMembersResponse);
}

// Kind of cluster member. Same QWS code can register as either lane —
// the kind tells QC which work-scope the member can handle. Unknown
// values are treated as MEMBER_KIND_UNSPECIFIED.
//
// Note: values are declared out of numeric order (CLOUD = 2 precedes
// NURSERY = 1). The numbers are the wire contract — do not renumber.
enum MemberKind {
  // Default. In ListMembersRequest it means "no filter: return all members".
  MEMBER_KIND_UNSPECIFIED = 0;
  // QWS in the cloud-segment lane: scans S3-resident segments via cache_server.
  MEMBER_KIND_QWS_CLOUD = 2;
  // QWS in the nursery-local lane: scans nursery-local .ttseg files via
  // NurseryLocalCache; no S3 credentials. Today registered by the nursery
  // service itself; in Phase 8.4 split into a separate qws child.
  MEMBER_KIND_QWS_NURSERY = 1;
}

// Request for ClusterService.RegisterMember; doubles as the heartbeat.
message RegisterMemberRequest {
  // Stable identity for this member (e.g. nursery_node_id). Re-registering
  // with the same id replaces the previous record.
  string id = 1;
  // Lane / work-scope this member serves.
  MemberKind kind = 2;
  // gRPC endpoint other services can dial (e.g. "http://nursery-0:9531").
  string endpoint = 3;
  // Relative capacity of this member, used by the QC's partition_ring to
  // route proportionally more partitions to stronger nodes. First version
  // is the effective vCPU count visible to the registering process
  // (cgroup-aware via num_cpus::get on Linux, so under k8s this is the
  // pod's CPU limit, not the host's full capacity). Zero means "unset by
  // an old client" — the QC treats this as weight=1 (equal share).
  uint32 weight = 4;
}

// Response for ClusterService.RegisterMember.
message RegisterMemberResponse {
  // Cluster identity string this meta is configured with — set per
  // environment in helm values (e.g. "dev", "valhalla", a UUID).
  // Not crypto, not a secret — a deliberate mixup-detection
  // mechanism: services that talked to this meta tag their outbound
  // gRPC calls with this id; the receiver rejects mismatches. Cheap
  // way to catch "I accidentally pointed at the wrong meta_endpoint
  // and now I'm dialing valhalla's qws from a dev process."
  string cluster_id = 1;
}

// Request for ClusterService.ListMembers.
message ListMembersRequest {
  // If MEMBER_KIND_UNSPECIFIED, all members are returned. Otherwise filtered
  // to the requested kind.
  MemberKind kind = 1;
}

// One registered cluster member, as returned by ListMembers.
message MemberRecord {
  // Stable identity the member registered with.
  string id = 1;
  // Lane / work-scope the member registered as.
  MemberKind kind = 2;
  // gRPC endpoint the member registered.
  string endpoint = 3;
  // Time of the last heartbeat received by meta, nanoseconds since unix epoch.
  int64 last_heartbeat_nanos = 4;
  // Relative capacity, set by the registering member at registration time
  // (see RegisterMemberRequest.weight). Used by the QC's partition_ring to
  // route proportionally more partitions to stronger nodes. Zero from
  // pre-weighting clients is treated as weight=1 by the consumer.
  uint32 weight = 5;
}

// Response for ClusterService.ListMembers.
message ListMembersResponse {
  // Currently registered (not evicted) members matching the requested kind.
  repeated MemberRecord members = 1;
  // Same cluster id as RegisterMemberResponse.cluster_id — see there.
  string cluster_id = 2;
}

IngestionService

Authenticates ingest tokens and manages collector streams.

RPCs

| RPC | Description |
| --- | --- |
| RegisterForStream | Authenticate an ingest token and get/create a stream for a collector |

Proto Definition

// Authenticates ingest tokens and manages collector streams.
service IngestionService {
  // RegisterForStream authenticates an ingest token and returns a stream for this collector.
  // Returns the existing active stream for the collector_id, or creates a new one.
  rpc RegisterForStream(RegisterForStreamRequest) returns (RegisterForStreamResponse);
}

// Request for IngestionService.RegisterForStream.
message RegisterForStreamRequest {
  // Field number 1 is retired; it is reserved so it cannot be reused.
  reserved 1;
  // NOTE(review): meaning not visible from this file — presumably a
  // human-readable description of the collector; confirm.
  string collector_string = 2;
  // Identifies the collector. The existing active stream for this id is
  // returned if there is one; otherwise a new stream is created.
  string collector_id = 3;
  // Ingest token to authenticate.
  string ingest_token = 4;
}

// Stream configuration returned by RegisterForStream.
message StreamConfig {
  // Cloud object prefix for the stream. NOTE(review): presumably where the
  // collector writes its segments — confirm.
  CloudObjectPrefix cloud_object_prefix = 1;
  // Flush timeout, in milliseconds.
  uint64 flush_timeout_ms = 2;
  // Maximum segment size, in bytes.
  uint64 max_segment_size_bytes = 3;
}

// Response for IngestionService.RegisterForStream.
message RegisterForStreamResponse {
  // The collector's stream: the existing active one, or a newly created one.
  string stream_id = 1;
  // Configuration for the stream.
  StreamConfig config = 2;
  // UUID of the ingest token (from the meta database).
  // Tjalfe uses this as the key in CombinedSignals.token_signals
  // so the plaintext token is never persisted to S3.
  string token_id = 3;
}

IngestTokenService

Manages ingest tokens for authentication and signal routing.

RPCs

| RPC | Description |
| --- | --- |
| CreateIngestToken | Create a new token (plaintext shown only once) |
| ValidateIngestToken | Validate a token and return its routing |
| DeleteIngestToken | Tombstone a token; the catalog row sits for the grace window so a fat-finger delete can be resurrected |
| ResurrectIngestToken | Lift the tombstone on a token within the grace window |
| ListIngestTokens | List all tokens |
| GetIngestToken | Get token details by ID |
| UpdateIngestTokenRouting | Update signal routing for a token |

Proto Definition

// Manages ingest tokens for authentication and signal routing.
//
// All RPCs except ValidateIngestToken require `database_id`. Tokens are
// auth credentials, so every operation explicitly carries the database
// scope. Validate is the exception because the ingest path receives only
// a plaintext token header — the response carries `database_id` outward.
service IngestTokenService {
  // Creates a new token; the plaintext is shown only once.
  rpc CreateIngestToken(CreateIngestTokenRequest) returns (CreateIngestTokenResponse);
  // Validates a plaintext token and returns its routing.
  rpc ValidateIngestToken(ValidateIngestTokenRequest) returns (ValidateIngestTokenResponse);
  // Tombstones a token; the catalog row sits for the grace window so a
  // fat-finger delete can be resurrected.
  // New stream registrations are rejected at tombstone time, before the grace window expires.
  rpc DeleteIngestToken(DeleteIngestTokenRequest) returns (DeleteIngestTokenResponse);
  // Lifts the tombstone on a token within the grace window.
  rpc ResurrectIngestToken(ResurrectIngestTokenRequest) returns (ResurrectIngestTokenResponse);
  // Lists tokens.
  rpc ListIngestTokens(ListIngestTokensRequest) returns (ListIngestTokensResponse);
  // Gets token details by ID.
  rpc GetIngestToken(GetIngestTokenRequest) returns (GetIngestTokenResponse);
  // Updates signal routing for a token.
  rpc UpdateIngestTokenRouting(UpdateIngestTokenRoutingRequest) returns (UpdateIngestTokenRoutingResponse);
}

// Signal routing of an ingest token: the target table for each signal type.
message IngestTokenRouting {
  // Table that receives traces.
  string traces_table_id = 1;
  // Table that receives logs.
  string logs_table_id = 2;
  // Table that receives metrics.
  string metrics_table_id = 3;
}

// Metadata of an ingest token. Never carries the plaintext token — only a
// short hint.
message IngestTokenInfo {
  // Identifier of the token.
  string id = 1;
  // Name of the token.
  string name = 2;
  // Creation time as a string. NOTE(review): the format is not visible in
  // this file — confirm before parsing.
  string created_at = 3;
  // Per-signal target tables.
  IngestTokenRouting routing = 4;
  string token_hint = 5;  // last 4 chars of the plaintext token
  // Database that owns the token.
  string database_id = 6;
  // Absent for live rows.
  Tombstone tombstone = 7;
}

// Request for IngestTokenService.CreateIngestToken.
message CreateIngestTokenRequest {
  // Database that will own the token.
  string database_id = 1;
  // Name of the token.
  string name = 2;
  // All 3 routing targets are required. A "logs-only" token points all 3
  // signals at the logs table; wrong-signal data lands there (schema-on-read
  // absorbs it). Tightening to per-signal optional is a future, non-breaking
  // change.
  string traces_table_id = 3;
  string logs_table_id = 4;
  string metrics_table_id = 5;
}

// Response for IngestTokenService.CreateIngestToken. This is the only
// response that carries the plaintext token.
message CreateIngestTokenResponse {
  string plaintext_token = 1;  // shown only once
  // Metadata of the created token.
  IngestTokenInfo token = 2;
}

// Request for IngestTokenService.ValidateIngestToken. Carries no
// `database_id`: the ingest path only has the plaintext token header.
message ValidateIngestTokenRequest {
  // Plaintext token to validate.
  string plaintext_token = 1;
}

// Response for IngestTokenService.ValidateIngestToken.
message ValidateIngestTokenResponse {
  // Identifier of the validated token.
  string token_id = 1;
  // Per-signal target tables of the token.
  IngestTokenRouting routing = 2;
  // Database that owns the token (carried outward because the request has
  // no database scope).
  string database_id = 3;
}

// Request for IngestTokenService.DeleteIngestToken (tombstones the token).
message DeleteIngestTokenRequest {
  // Database expected to own the token.
  string database_id = 1;
  // Token to tombstone.
  string token_id = 2;
}

// Response for IngestTokenService.DeleteIngestToken.
message DeleteIngestTokenResponse {
  // Carries the grace window: grace = delete_at - tombstoned_at.
  Tombstone tombstone = 1;
}

// Request for IngestTokenService.ResurrectIngestToken.
message ResurrectIngestTokenRequest {
  // Database expected to own the token.
  string database_id = 1;
  // Tombstoned token to resurrect.
  string token_id = 2;
}

// Response for IngestTokenService.ResurrectIngestToken.
message ResurrectIngestTokenResponse {
  // The resurrected token.
  IngestTokenInfo token = 1;
}

// Request for IngestTokenService.ListIngestTokens.
message ListIngestTokensRequest {
  // Database whose tokens are listed.
  string database_id = 1;
  // Presumably also returns tombstoned tokens when true (inferred from the
  // name; confirm).
  bool include_deleted = 2;
}

// Response for IngestTokenService.ListIngestTokens.
message ListIngestTokensResponse {
  // The listed tokens.
  repeated IngestTokenInfo tokens = 1;
}

// Request for IngestTokenService.GetIngestToken.
message GetIngestTokenRequest {
  // Database expected to own the token.
  string database_id = 1;
  // Token to fetch.
  string token_id = 2;
  // Presumably allows a tombstoned token to be returned when true (inferred
  // from the name; confirm).
  bool include_deleted = 3;
}

// Response for IngestTokenService.GetIngestToken.
message GetIngestTokenResponse {
  // Details of the requested token.
  IngestTokenInfo token = 1;
}

// Request for IngestTokenService.UpdateIngestTokenRouting.
message UpdateIngestTokenRoutingRequest {
  // Database expected to own the token.
  string database_id = 1;
  // Token whose routing is updated.
  string token_id = 2;
  // New per-signal target tables.
  IngestTokenRouting routing = 3;
}

// Response for IngestTokenService.UpdateIngestTokenRouting. Intentionally
// has no fields.
message UpdateIngestTokenRoutingResponse {
  // empty on success
}

NurseryService

Manages ingest stream lifecycle, merged segment registration, and offset tracking.

RPCs

| RPC | Description |
| --- | --- |
| GetIngestStreamsForNursery | Get all active streams and table routing info |
| MarkStreamStopped | Mark a stream as stopped (client disconnect or timeout) |
| RegisterMergedSegments | Register one or more merged segments for a single table, atomically |
| UpdateDeletedOffset | Update the deleted offset after cleaning baby segments |
| DeleteIngestStreamTable | Remove a fully-processed table entry from a stream |

Proto Definition

// Manages ingest stream lifecycle, merged segment registration, and offset
// tracking.
service NurseryService {
  // Gets all active streams and table routing info.
  rpc GetIngestStreamsForNursery(GetIngestStreamsRequest)
      returns (GetIngestStreamsResponse);
  // Marks a stream as stopped (client disconnect or timeout).
  rpc MarkStreamStopped(MarkStreamStoppedRequest)
      returns (MarkStreamStoppedResponse);
  // Registers one or more merged segments for a single table, atomically.
  rpc RegisterMergedSegments(RegisterMergedSegmentsRequest)
      returns (RegisterMergedSegmentsResponse);
  // Updates the deleted offset after cleaning baby segments.
  rpc UpdateDeletedOffset(UpdateDeletedOffsetRequest)
      returns (UpdateDeletedOffsetResponse);
  // Removes a fully-processed table entry from a stream.
  rpc DeleteIngestStreamTable(DeleteIngestStreamTableRequest)
      returns (DeleteIngestStreamTableResponse);
}

// Request for NurseryService.GetIngestStreamsForNursery. Has no fields.
message GetIngestStreamsRequest {}

// Checkpoint tracking the mapping between a merged segment offset and stream offset
message OffsetCheckpoint {
  // The MVCC version of the merged segment
  int64 segment_offset = 1;
  // The stream offset at the time of this merge
  int64 stream_offset = 2;
  // Ingest timestamp of the baby segment (nanoseconds since unix epoch)
  int64 ingest_timestamp_nanos = 3;
}

// A best-effort (stream_offset, ingest_time) sample used to pick the ~1h
// "old enough to delete" boundary for a stream's shared blobs.
message OffsetTimestamp {
  // Stream offset of the sample.
  int64 stream_offset = 1;
  // Ingest time of the sample, in nanoseconds (presumably since unix epoch,
  // like the other `*_nanos` timestamps here — confirm).
  int64 ingest_time_nanos = 2;
}

// Table-specific routing information within a stream
message IngestStreamTableInfo {
  // Field numbers 4 and 5 are gaps in this message. They are reserved so a
  // retired number can never be re-assigned with a different meaning (other
  // messages in this API already reserve their retired numbers).
  // NOTE(review): presumably these were removed fields — confirm against
  // schema history and add `reserved "<old_name>"` entries if known.
  reserved 4, 5;

  // Table this entry routes to.
  string table_id = 1;
  // Last merged offset. Nursery should fetch from merged_offset + 1
  int64 merged_offset = 2;
  // Ingest timestamp of the last merged baby segment (nanoseconds since unix epoch)
  // NOTE: Source precision is seconds (from S3 last_modified), sub-second portion is always zero
  // Used by query engine to determine which segments to query
  int64 merged_ingest_time_nanos = 3;
  // Table stream creation time in nanoseconds since unix epoch
  int64 created_at_nanos = 6;
  // S3 location for uploading merged segments: endpoint, bucket, path prefix
  // Path format: tables/<table_id>/segments/
  CloudObjectPrefix cloud_merged_segments_prefix = 7;
  // Checkpoints tracking recent merge points (segment_offset, stream_offset, ingest_timestamp)
  // Pruned to keep only checkpoints from the last 30 minutes
  repeated OffsetCheckpoint offset_checkpoints = 8;
}

// External stream information - matches two-table design (ingest_streams + ingest_stream_tables)
message IngestStreamInfo {
  // Identifier of the stream.
  string stream_id = 1;
  // S3 location for baby segments: endpoint, bucket, path prefix
  // Path format: ingest-stream/<stream_id>/
  CloudObjectPrefix cloud_stream_prefix = 2;
  // If set, stream is stopped (no more baby segments will arrive)
  // Nursery can release stopped streams after processing remaining segments
  optional int64 stopped_at_nanos = 3;
  // Offset of the stop message (only set if stopped_at_nanos is set)
  // A fully-merged stream has merged_offset == stopped_offset
  optional int64 stopped_offset = 4;
  // Table-specific routing information for this stream
  repeated IngestStreamTableInfo tables = 5;
  // Durable completeness frontier (nanos): min over this stream's tables of
  // merged_ingest_time_nanos, or its creation time when it has no tables yet.
  // The nursery's ingest watermark floor for a stream it hasn't fetched.
  int64 known_complete_through_nanos = 6;
  // Highest stream offset whose shared baby-segment blobs are deleted from S3
  // (default -1). Nursery only deletes offsets <= min(merged_offset) over tables.
  int64 deleted_offset = 7;
  // Best-effort ingest time near the deletion frontier (nanoseconds since unix
  // epoch). Observability only — correctness rides on deleted_offset.
  int64 deleted_ingest_time_nanos = 8;
  // Sparse age-ladder samples for picking the deletion boundary.
  repeated OffsetTimestamp offset_timestamps = 9;
}

// Per-token routing: maps each signal type to a target table.
// Keyed by token_id (UUID), matching the key in CombinedSignals.token_signals.
message IngestTokenRoutingInfo {
  // UUID of the ingest token (from the meta database).
  // Matches the key used in CombinedSignals.token_signals.
  string token_id = 1;
  // Table that receives this token's traces.
  string traces_table_id = 2;
  // Table that receives this token's logs.
  string logs_table_id = 3;
  // Table that receives this token's metrics.
  string metrics_table_id = 4;
  // PR 5: token's owning database. Used by nursery to scope
  // `bzrk.table` attribute lookups and reject cross-DB routing.
  string database_id = 5;
}

// Response for NurseryService.GetIngestStreamsForNursery: the active
// streams plus the table and token routing data the nursery needs.
message GetIngestStreamsResponse {
  // The active ingest streams.
  repeated IngestStreamInfo streams = 1;
  // All known tables (name → id mapping for routing resolution).
  // Each `TableInfo.database_id` scopes the table to a single
  // database, which is the only legal target for that table's
  // routing (cross-database routing is impossible by construction).
  repeated TableInfo tables = 2;
  // Monotonic sequence number for staleness detection.
  // Nursery should discard responses with a lower sequence than already seen.
  int64 meta_response_seq = 3;
  // Active ingest token routing for token-based signal routing
  repeated IngestTokenRoutingInfo token_routing = 4;
  // The `streams` list is complete through this time (nanos): any stream not in
  // this response was created strictly after it. The nursery caps its ingest
  // watermark here so a just-created, not-yet-polled stream can't be skipped.
  int64 stream_list_complete_through_nanos = 6;
  // Field 5 was `repeated DatabaseInfo databases`, used by the old
  // qualified-routing path (`bzrk.table=db.foo`). The qualifier was
  // removed in PR 7: every routing key is unqualified and resolves
  // strictly in the token's database, so the nursery never needs a
  // name→id database map.
  reserved 5;
  reserved "databases";
}

// Reason why a stream was stopped
enum StopReason {
  // Default value; no reason given.
  STOP_REASON_UNSPECIFIED = 0;
  // Client (ingester) wrote a stop message
  STOP_REASON_CLIENT = 1;
  // Nursery detected inactivity timeout
  STOP_REASON_TIMEOUT = 2;
}

// Request for NurseryService.MarkStreamStopped.
message MarkStreamStoppedRequest {
  // Stream to mark as stopped.
  string stream_id = 1;
  // When the stream was stopped (nanoseconds since unix epoch)
  int64 stopped_at_nanos = 2;
  // Final processed offset in the stream
  int64 final_offset = 3;
  // Reason for stopping
  StopReason reason = 4;
}

// Response for NurseryService.MarkStreamStopped. Has no fields.
message MarkStreamStoppedResponse {}

// Per-stream offset info for merged segment registration
message StreamOffsetInfo {
  // Stream whose merged_offset advances.
  string stream_id = 1;
  // Table the advance applies to; offsets are tracked per (stream, table).
  string table_id = 2;
  // New merged_offset after this merge (max baby segment offset included)
  int64 new_merged_offset = 3;
  // Ingest timestamp (S3 last_modified) of the highest-offset baby segment
  // NOTE: S3 provides second-level precision, sub-second portion is always zero
  int64 ingest_time_nanos = 4;
  // What merged_offset was when nursery selected segments for merge.
  // Meta validates this matches current value to detect conflicts.
  int64 start_merged_offset = 5;
}

// One merged segment registration: a new segment plus the per-stream offset
// advances for streams that contributed data to it.
message MergeRegistration {
  // The merged segment to register.
  CreateSegment segment = 1;
  // Per-stream offset advance for streams that contributed segments to this merge.
  // Each item's start_merged_offset must equal the current merged_offset for that
  // (stream, table). Within a batch, items are applied in order, so item N+1's
  // start_merged_offset must equal item N's new_merged_offset (for the same stream).
  repeated StreamOffsetInfo stream_offsets = 2;
}

// Register one or more merged segments for a single table, atomically.
//
// All merges in the batch must share the same table_id (server enforced),
// and that table must live in `database_id` (server enforced). `merges`
// are applied in order in one Postgres transaction; on any failure the
// whole transaction rolls back. After all merges, `idle_advances` are
// applied — these advance merged_offset for streams that didn't contribute
// segments to any merge in this batch (e.g. streams whose follower
// processed keep-alives or a stop message). Idle-advance checkpoints
// reuse the last allocated segment version in the batch.
//
// Note: fields are declared out of field-number order (`merges` is 1,
// `idle_advances` is 2, `database_id` is 3). Do not renumber.
message RegisterMergedSegmentsRequest {
  // Database that owns the batch's table. Required. Server rejects with
  // INVALID_ARGUMENT if `database_id` does not own the batch's table_id.
  string database_id = 3;
  // Must be non-empty.
  repeated MergeRegistration merges = 1;
  // Optional. Applied last in the same transaction.
  repeated StreamOffsetInfo idle_advances = 2;
  // Idempotency key: a stable per-batch id, reused across retries of the same
  // batch. Meta uses it to make registration replay-safe — a retry after a
  // committed-but-lost response returns the original result instead of
  // re-applying or wedging on the unique storage-path index.
  string register_request_id = 4;
}

// Result of a RegisterMergedSegmentsRequest batch.
message RegisterMergedSegmentsResponse {
  // segment_versions[i] is the MVCC version assigned to merges[i], i.e. this
  // list is index-aligned with RegisterMergedSegmentsRequest.merges.
  repeated int64 segment_versions = 1;
  // Updated stream info for nursery to cache (avoids separate poll)
  repeated IngestStreamInfo streams = 2;
  // Monotonic sequence number for staleness detection (same counter as GetIngestStreamsResponse)
  int64 meta_response_seq = 3;
}

// Reports a new deletion frontier (deleted_offset) for an ingest stream.
//
// NOTE(review): field number 2 is not used by this message. If it belonged
// to a removed field, confirm the .proto marks it `reserved` so it cannot be
// reused.
message UpdateDeletedOffsetRequest {
  // Stream whose deleted_offset is being updated.
  string stream_id = 1;
  // New deleted_offset for the stream. This is the value correctness depends
  // on. NOTE(review): inclusive/exclusive semantics are not stated here —
  // confirm against the server.
  int64 deleted_offset = 3;
  // Best-effort ingest time near the deletion frontier (nanoseconds since unix
  // epoch). Observability only — correctness rides on deleted_offset.
  int64 deleted_ingest_time_nanos = 4;
}

// Empty response to UpdateDeletedOffsetRequest; carries no data.
message UpdateDeletedOffsetResponse {}

// Request to delete the (stream, table) entry identified by stream_id and
// table_id. The two offsets let meta sanity-check nursery's view of the
// stream before acting.
// NOTE(review): what exactly is removed is decided server-side and is not
// visible here.
message DeleteIngestStreamTableRequest {
  // Ingest stream that contains the table.
  string stream_id = 1;
  // Table to delete from the stream.
  string table_id = 2;
  // The current (highest) offset nursery is processing for this stream.
  // Meta validates this is >= the table's merged_offset as a sanity check.
  int64 current_offset = 3;
  // Nursery's view of the table's merged_offset.
  // Meta validates this matches its own merged_offset and deleted_offset
  // to catch stale nursery state.
  int64 merged_offset = 4;
}

// Empty response to DeleteIngestStreamTableRequest; carries no data.
message DeleteIngestStreamTableResponse {}

MergeTaskService

Coordinates segment merge tasks between the janitor and merge workers.

RPCs

| RPC | Description |
| --- | --- |
| PollForSegmentMergeTask | Worker polls for an available merge task |
| CompleteSegmentMergeTask | Worker reports successful merge with the new segment |
| FailSegmentMergeTask | Worker reports merge failure |

Proto Definition

// Coordinates segment merge tasks between the janitor and merge workers.
service MergeTaskService {
  // Worker polls for an available merge task.
  rpc PollForSegmentMergeTask(PollForSegmentMergeTaskRequest) returns (PollForSegmentMergeTaskResponse);
  // Worker reports a successful merge together with the new segment.
  rpc CompleteSegmentMergeTask(CompleteSegmentMergeTaskRequest) returns (CompleteSegmentMergeTaskResponse);
  // Worker reports that a merge failed.
  rpc FailSegmentMergeTask(FailSegmentMergeTaskRequest) returns (FailSegmentMergeTaskResponse);
}

// Sent by a merge worker to ask for a merge task.
message PollForSegmentMergeTaskRequest {
  // Identifier of the polling worker.
  // NOTE(review): presumably recorded as the task's assignee (cf.
  // MergeTaskSummary.assigned_to_worker) — confirm.
  string worker_id = 1;
}

// Reply to a merge-task poll.
message PollForSegmentMergeTaskResponse {
  // The task handed to the worker. Declared `optional`, so callers can test
  // for presence; presumably left unset when no task is available — confirm.
  optional SegmentMergeTaskInfo task = 1;
}

// Sent by a worker to report a successful merge with the new segment.
message CompleteSegmentMergeTaskRequest {
  // Merge task being completed.
  string task_id = 1;
  // The new segment produced by the merge.
  CreateSegment merged_segment = 2;
  // Relative path of the rewrite journal for this merge.
  // NOTE(review): the base this path is relative to is not stated here —
  // confirm with the merge worker / meta implementation.
  string rewrite_journal_relative_path = 3;
}

// Outcome of CompleteSegmentMergeTask. The outcome is modelled in-band as a
// oneof, so at most one of `success` / `error` is set.
message CompleteSegmentMergeTaskResponse {
  oneof result {
    // Set when the completion was accepted.
    CompleteSegmentMergeTaskSuccess success = 1;
    // Set when the completion was rejected; carries an error message.
    CompleteSegmentMergeTaskError error = 2;
  }
}

// Success branch of CompleteSegmentMergeTaskResponse.result; carries no data.
message CompleteSegmentMergeTaskSuccess {}

// Error branch of CompleteSegmentMergeTaskResponse.result.
message CompleteSegmentMergeTaskError {
  // Error text describing the failure.
  string message = 1;
}

// Sent by a worker to report that a merge task failed.
message FailSegmentMergeTaskRequest {
  // Merge task that failed.
  string task_id = 1;
  // Worker-supplied description of the failure.
  string error_message = 2;
}

// Empty response to FailSegmentMergeTaskRequest; carries no data.
message FailSegmentMergeTaskResponse {}

SegmentDeletionService

Manages cleanup of tombstoned segments from storage.

RPCs

| RPC | Description |
| --- | --- |
| GetTombstonedSegmentsForDeletion | Get segments marked for deletion |
| ConfirmTombstoneDeletion | Confirm segments have been deleted from storage |
| ConfirmRewriteJournalDeletion | Confirm rewrite journals have been deleted from storage |

Proto Definition

// Manages cleanup of tombstoned segments from storage.
service SegmentDeletionService {
  // Get segments marked for deletion.
  rpc GetTombstonedSegmentsForDeletion(GetTombstonedSegmentsForDeletionRequest) returns (GetTombstonedSegmentsForDeletionResponse);
  // Confirm segments have been deleted from storage.
  rpc ConfirmTombstoneDeletion(ConfirmTombstoneDeletionRequest) returns (ConfirmTombstoneDeletionResponse);
  // Takes output segment ids and returns a cleared count.
  // NOTE(review): presumably confirms that the rewrite journals reported via
  // ConfirmTombstoneDeletionSuccess.journals_ready were deleted — confirm.
  rpc ConfirmRewriteJournalDeletion(ConfirmRewriteJournalDeletionRequest) returns (ConfirmRewriteJournalDeletionResponse);
}

// Request for the segments that are marked for deletion. Takes no
// parameters.
message GetTombstonedSegmentsForDeletionRequest {
  // Empty - service uses internal configuration
}

// Segments that are marked for deletion.
message GetTombstonedSegmentsForDeletionResponse {
  // One entry per tombstoned segment to delete from storage.
  repeated TombstonedSegmentInfo segments = 1;
}

// A tombstoned segment and where to find its object in cloud storage.
message TombstonedSegmentInfo {
  // Segment identifier.
  string segment_id = 1;
  // Table the segment belongs to.
  string table_id = 2;
  // Storage location of the segment object.
  CloudObjectKey cloud_object_key = 3;
  // When the segment was tombstoned.
  // NOTE(review): presumably nanoseconds since unix epoch, like the other
  // *_nanos fields in this API — confirm.
  int64 tombstone_time_nanos = 4;
  // NOTE(review): presumably the meta version at which the segment was
  // tombstoned — confirm.
  int64 tombstone_version = 5;
}

// Confirms that segments have been deleted from storage.
message ConfirmTombstoneDeletionRequest {
  // Ids of the segments whose deletion is being confirmed.
  repeated string segment_ids = 1;
}

// Outcome of ConfirmTombstoneDeletion. The outcome is modelled in-band as a
// oneof, so at most one of `success` / `error` is set.
message ConfirmTombstoneDeletionResponse {
  oneof result {
    // Set on success; carries the deleted count and journals ready for deletion.
    ConfirmTombstoneDeletionSuccess success = 1;
    // Set on failure; carries an error message.
    ConfirmTombstoneDeletionError error = 2;
  }
}

// Success branch of ConfirmTombstoneDeletionResponse.result.
message ConfirmTombstoneDeletionSuccess {
  // Number of deletions recorded.
  // NOTE(review): presumably counts the segment rows meta removed for the
  // confirmed ids — confirm.
  int32 deleted_count = 1;
  // Rewrite journals that are now ready for deletion.
  repeated RewriteJournalReadyForDeletion journals_ready = 2;
}

// A rewrite journal that is ready to be deleted from storage.
message RewriteJournalReadyForDeletion {
  // Id of the output segment the journal is associated with.
  // NOTE(review): presumably the id to send back in
  // ConfirmRewriteJournalDeletionRequest.output_segment_ids — confirm.
  string output_segment_id = 1;
  // Storage location of the journal object.
  CloudObjectKey journal_key = 2;
}

// Error branch of ConfirmTombstoneDeletionResponse.result.
message ConfirmTombstoneDeletionError {
  // Error text describing the failure.
  string message = 1;
}

// Request for ConfirmRewriteJournalDeletion.
message ConfirmRewriteJournalDeletionRequest {
  // Output segment ids identifying the rewrite journals being confirmed.
  // NOTE(review): presumably taken from
  // RewriteJournalReadyForDeletion.output_segment_id — confirm.
  repeated string output_segment_ids = 1;
}

// Response for ConfirmRewriteJournalDeletion.
message ConfirmRewriteJournalDeletionResponse {
  // Number of entries cleared.
  // NOTE(review): presumably the number of journal references meta cleared
  // for the given output segments — confirm.
  int32 cleared_count = 1;
}

SegmentLookupService

Discovers segments matching a time range and table filter.

RPCs

| RPC | Description |
| --- | --- |
| FindSegments | Find segments by time range and table names (paginated) |
| GetSegmentsByIds | Get specific segments by their IDs |
| ListWarmCandidates | Returns segments ordered for cache warming priority (newest-first by time_range.end_time, globally across all tables). Paginated. |

Proto Definition

// Discovers segments matching a time range and table filter.
service SegmentLookupService {
  // Find segments by time range and table names. Paginated.
  rpc FindSegments(FindSegmentsRequest) returns (FindSegmentsResponse);
  // Get specific segments by their IDs.
  rpc GetSegmentsByIds(GetSegmentsByIdsRequest) returns (GetSegmentsByIdsResponse);
  // Returns segments ordered for cache warming priority (newest-first by
  // time_range.end_time, globally across all tables). Paginated.
  rpc ListWarmCandidates(ListWarmCandidatesRequest) returns (ListWarmCandidatesResponse);
}

// Finds segments by time range and table names within one database.
// Results are paginated via `limit` / `cursor`.
//
// NOTE(review): field number 2 is not used by this message. If it belonged
// to a removed field, confirm the .proto marks it `reserved` so it cannot be
// reused.
message FindSegmentsRequest {
  // Time range to match segments against (nanoseconds since the unix epoch).
  TimeRange time_range = 1;
  // Names of the tables to search; resolved against `database_id`.
  repeated string table_names = 3;
  // Page size.
  // NOTE(review): presumably the max number of segments per page, as for
  // ListWarmCandidatesRequest.limit — confirm, including how 0 is treated.
  int32 limit = 4;
  // Pagination cursor.
  // NOTE(review): presumably PagedResponse.next_cursor from the previous
  // page, unset on the first call — confirm.
  optional string cursor = 5;
  // UUID of the database to resolve `table_names` against. Required;
  // server rejects empty. Callers (query service, admin tools) are
  // expected to have already resolved any name via `GetDatabase`.
  string database_id = 6;
}

// Pagination info embedded in paginated responses (FindSegmentsResponse,
// ListWarmCandidatesResponse).
message PagedResponse {
  // Cursor to pass in the next request to fetch the following page.
  // NOTE(review): presumably unset when there are no further pages — confirm.
  optional string next_cursor = 1;
}

// One page of FindSegments results plus the version watermark read alongside it.
message FindSegmentsResponse {
  // Segments matching the request on this page.
  repeated SegmentInfo segments = 1;
  // Pagination info for fetching the next page.
  PagedResponse page = 2;
  // Snapshot of meta's monotonic segment-version counter
  // (`counters.version`) read in the same transaction as the
  // segment list. QC pins this as the watermark `V` for the query:
  // the cloud QWS lane sees only segments with `version <= V`; a
  // future nursery lane (Phase 4-5 of docs/dev/query-exec-in-nursery.md)
  // fills the gap with locally-staged segments that are either
  // unregistered or registered at `version > V`. The two sets are
  // disjoint by construction, so the merged result has no double-count.
  uint64 current_version = 3;
}

// Looks up specific segments by their IDs.
message GetSegmentsByIdsRequest {
  // Ids of the segments to fetch.
  repeated string segment_ids = 1;
}

// Result of GetSegmentsByIds.
message GetSegmentsByIdsResponse {
  // The segments found for the requested ids.
  // NOTE(review): ordering and the handling of unknown ids are not stated
  // here — confirm before relying on index alignment with the request.
  repeated SegmentInfo segments = 1;
}

// Requests one page of cache-warming candidates. The listing is global
// across all tables.
message ListWarmCandidatesRequest {
  // Max segments per page (capped server-side).
  int32 limit = 1;
  // Opaque cursor from previous page; empty on first call.
  optional string cursor = 2;
  // Shard-filter fields will be added later as part of the sharding design.
}

// One page of cache-warming candidates.
message ListWarmCandidatesResponse {
  // Ordered newest-first by time_range.end_time, then by id descending.
  repeated SegmentInfo segments = 1;
  // Pagination info for fetching the next page.
  PagedResponse page = 2;
}

JanitorService

Provides cluster statistics and merge task management for maintenance operations.

RPCs

| RPC | Description |
| --- | --- |
| GetJanitorStats | Get segment count, sizes, and outstanding merge tasks |
| GetSegmentSizeStats | Get segment size distribution by tier |
| ListMergeTasks | List active merge tasks |
| GetMergeTask | Get detailed info about a specific merge task |
| UnclaimMergeTask | Release a stale merge task claim |
| CreateMergeTasks | Create new merge tasks for eligible segments |
| RewriteAllSegments | Create one single-segment merge task per existing segment not already in a merge task. This forces every segment through the rewrite pipeline to pick up index fixes. |
| DeleteMergeTask | Delete a merge task by ID |

Proto Definition

// Provides cluster statistics and merge task management for maintenance
// operations.
service JanitorService {
  // Get segment count, sizes, and outstanding merge tasks.
  rpc GetJanitorStats(GetJanitorStatsRequest) returns (GetJanitorStatsResponse);
  // Get segment size distribution by tier.
  rpc GetSegmentSizeStats(GetSegmentSizeStatsRequest) returns (GetSegmentSizeStatsResponse);
  // List active merge tasks.
  rpc ListMergeTasks(ListMergeTasksRequest) returns (ListMergeTasksResponse);
  // Get detailed info about a specific merge task.
  rpc GetMergeTask(GetMergeTaskRequest) returns (GetMergeTaskResponse);
  // Release a stale merge task claim.
  rpc UnclaimMergeTask(UnclaimMergeTaskRequest) returns (UnclaimMergeTaskResponse);
  // Create new merge tasks for eligible segments.
  rpc CreateMergeTasks(CreateMergeTasksRequest) returns (CreateMergeTasksResponse);
  // Create one single-segment merge task per existing segment not already in a merge task.
  // This forces every segment through the rewrite pipeline to pick up index fixes.
  rpc RewriteAllSegments(RewriteAllSegmentsRequest) returns (RewriteAllSegmentsResponse);
  // Delete a merge task by id; the response reports whether it was found
  // and deleted.
  rpc DeleteMergeTask(DeleteMergeTaskRequest) returns (DeleteMergeTaskResponse);
}

// Filters for the segment size distribution report. Every filter is optional.
message GetSegmentSizeStatsRequest {
  // Optional maximum compressed size in bytes (e.g., 10485760 for 10MB).
  // Only segments with compressed_size <= this value are included.
  optional int64 max_compressed_size = 1;
  // Optional minimum age in seconds. Only segments created after now - min_age are included.
  // NOTE(review): "created after now - min_age" selects segments *younger*
  // than min_age, which contradicts the field name and the wording on
  // CreateMergeTasksRequest ("older than now - min_age"). The after/before
  // wording here and on max_age_secs looks swapped — confirm against the
  // server implementation.
  optional int64 min_age_secs = 2;
  // Optional maximum age in seconds. Only segments created before now - max_age are included.
  // NOTE(review): see the note on min_age_secs; by the field name this should
  // select segments created *after* now - max_age — confirm.
  optional int64 max_age_secs = 3;
  // Optional: scope to a specific table by UUID. Pre-PR-4a this took a
  // name; switching to id avoids cross-database collisions when multi-DB
  // arrives.
  optional string table_id = 4;
}

// One bucket ("tier") of the segment size distribution.
// NOTE(review): units and string formats below are not visible in this
// file; the hedged notes mark what to confirm against the server.
message SegmentSizeTier {
  // Tier index.
  int32 tier = 1;
  // Lower bound of the tier, as a string. NOTE(review): format unknown
  // (possibly a human-readable size) — confirm.
  string tier_min = 2;
  // Upper bound of the tier, as a string. NOTE(review): format unknown — confirm.
  string tier_max = 3;
  // Number of segments in this tier.
  int64 segment_count = 4;
  // Sum of segment sizes in this tier. NOTE(review): presumably bytes;
  // whether compressed or uncompressed is not stated — confirm.
  int64 total_size = 5;
  // Size of the smallest segment in this tier (same unit as total_size).
  int64 smallest = 6;
  // Size of the largest segment in this tier (same unit as total_size).
  int64 largest = 7;
  // Average segment size in this tier (same unit as total_size).
  int64 avg_size = 8;
  // Earliest time among segments in this tier, as a string.
  // NOTE(review): timestamp format and which time column — confirm.
  string earliest_time = 9;
  // Latest time among segments in this tier, as a string (same format as
  // earliest_time).
  string latest_time = 10;
}

// Segment size distribution, one entry per tier.
message GetSegmentSizeStatsResponse {
  // Per-tier statistics for the segments that passed the request filters.
  repeated SegmentSizeTier tiers = 1;
}

// Request for janitor statistics: segment count, sizes, and outstanding
// merge tasks.
message GetJanitorStatsRequest {
  // Optional: scope to a specific table by UUID. Pre-PR-4a this took a
  // name; switching to id avoids cross-database collisions.
  optional string table_id = 1;
}

// Janitor statistics: segment count, sizes, and outstanding merge tasks.
message GetJanitorStatsResponse {
  // Number of segments.
  // NOTE(review): presumably excludes tombstoned segments, given the separate
  // segments_incl_tombstoned field — confirm.
  int64 segment_count = 1;
  // Number of outstanding merge tasks.
  int64 outstanding_merge_tasks = 2;
  // Total segment size in bytes.
  int64 total_size_bytes = 3;
  // Total compressed segment size in bytes.
  int64 total_compressed_size_bytes = 4;
  // Number of segments including tombstoned ones.
  int64 segments_incl_tombstoned = 5;
}

// Request to list active merge tasks.
message ListMergeTasksRequest {
  // Optional: scope to a specific table by UUID. Pre-PR-4a this took a
  // name; switching to id avoids cross-database collisions.
  optional string table_id = 1;
}

// Active merge tasks.
message ListMergeTasksResponse {
  // One summary per merge task.
  repeated MergeTaskSummary tasks = 1;
}

// Summary row for one merge task, as returned by ListMergeTasks.
message MergeTaskSummary {
  // Merge task identifier.
  string task_id = 1;
  // Table the task's segments belong to.
  string table_id = 2;
  // Number of segments in the task.
  int32 segment_count = 3;
  // Worker the task is assigned to.
  // NOTE(review): presumably empty while the task is unclaimed — confirm.
  string assigned_to_worker = 4;
  // When the task was created, as a string. NOTE(review): timestamp format
  // is not stated here — confirm.
  string created_at = 5;
  // When the task was assigned, as a string (same format as created_at).
  // NOTE(review): presumably empty while unclaimed — confirm.
  string assigned_at = 6;
}

// Request for detailed info about a specific merge task.
message GetMergeTaskRequest {
  // Merge task to look up.
  string task_id = 1;
}

// Request to release a stale merge task claim.
message UnclaimMergeTaskRequest {
  // Merge task whose claim should be released.
  string task_id = 1;
  // Minimum duration in seconds the task must have been claimed before it can be unclaimed.
  // Defaults to 3600 (1 hour) if not set.
  // NOTE(review): the field is not declared `optional`, so (assuming proto3)
  // an explicit 0 is indistinguishable from unset and presumably also yields
  // the 3600 default — confirm.
  int64 min_claimed_secs = 2;
}

// Result of UnclaimMergeTask. The two failure causes (task not found, claim
// not yet old enough) are not distinguished.
message UnclaimMergeTaskResponse {
  // True if the task was unclaimed, false if it was not found or not yet old enough.
  bool unclaimed = 1;
}

// Filters selecting which segments are eligible for new merge tasks.
// Every filter is optional.
message CreateMergeTasksRequest {
  // Optional maximum compressed size in bytes. Only segments with compressed_size <= this value.
  optional int64 max_compressed_size = 1;
  // Optional minimum age in seconds. Only segments older than now - min_age.
  optional int64 min_age_secs = 2;
  // Optional maximum age in seconds. Only segments newer than now - max_age.
  optional int64 max_age_secs = 3;
  // Optional: scope to a specific table by UUID. Pre-PR-4a this took a
  // name; switching to id avoids cross-database collisions when multi-DB
  // arrives.
  optional string table_id = 4;
}

// Summary of what CreateMergeTasks created.
message CreateMergeTasksResponse {
  // Number of merge tasks created
  int32 tasks_created = 1;
  // Total number of segments included across all tasks
  int64 segments_included = 2;
}

// Detailed info about one merge task. Fields 1-5 and 14 mirror
// MergeTaskSummary; the rest are aggregates over the task's segments.
// NOTE(review): size units (presumably bytes) and the string timestamp
// format are not stated here — confirm.
message GetMergeTaskResponse {
  // Merge task identifier.
  string task_id = 1;
  // Table the task's segments belong to.
  string table_id = 2;
  // Number of segments in the task.
  int32 segment_count = 3;
  // Worker the task is assigned to.
  string assigned_to_worker = 4;
  // When the task was created, as a string.
  string created_at = 5;
  // Size of the smallest segment in the task.
  int64 smallest_segment_size = 6;
  // Size of the largest segment in the task.
  int64 largest_segment_size = 7;
  // Compressed size of the smallest segment in the task.
  int64 smallest_segment_compressed_size = 8;
  // Compressed size of the largest segment in the task.
  int64 largest_segment_compressed_size = 9;
  // Earliest time covered by the task's segments, as a string.
  string earliest_time = 10;
  // Latest time covered by the task's segments, as a string.
  string latest_time = 11;
  // Sum of segment sizes in the task.
  int64 total_size = 12;
  // Sum of compressed segment sizes in the task.
  int64 total_compressed_size = 13;
  // When the task was assigned, as a string.
  string assigned_at = 14;
}

// Request to create one single-segment merge task per existing segment not
// already in a merge task.
message RewriteAllSegmentsRequest {
  // Optional: limit the rewrite to a specific table, identified by UUID.
  // Pre-PR-4a this took a name; switching to id avoids cross-database
  // collisions.
  optional string table_id = 1;
}

// Summary of what RewriteAllSegments scheduled.
message RewriteAllSegmentsResponse {
  // Number of single-segment merge tasks created
  int32 tasks_created = 1;
  // Total number of segments that will be rewritten
  int64 segments_included = 2;
}

// Request to delete a merge task.
message DeleteMergeTaskRequest {
  // Merge task to delete.
  string task_id = 1;
}

// Result of DeleteMergeTask.
message DeleteMergeTaskResponse {
  // True if the task was found and deleted.
  bool deleted = 1;
}

Common Types

Shared message types used across multiple services.

// Tombstone timestamps for a catalog row.
// Both timestamps set or neither; server enforces delete_at >= tombstoned_at.
// Unix nanos (not google.protobuf.Timestamp) to dodge the well-known-types include path.
message Tombstone {
  // When the row was tombstoned (unix nanoseconds).
  int64 tombstoned_at_unix_ns = 1;
  // Deletion time (unix nanoseconds); never earlier than tombstoned_at_unix_ns.
  // NOTE(review): presumably the end of the grace window, after which the
  // row can no longer be resurrected — confirm.
  int64 delete_at_unix_ns = 2;
}

// An object-storage bucket and the base path within it.
message CloudBucket {
  // Storage endpoint of the bucket.
  string endpoint = 1;
  // Bucket name.
  string bucket = 2;
  string region = 3;       // empty = provider default
  string path = 4;         // customer-configured base path within bucket (can be empty)
  string provider = 5;     // "s3" (default) or "gcs"
}

// A key prefix inside a cloud bucket.
message CloudObjectPrefix {
  // Bucket (and base path) the prefix lives in.
  CloudBucket bucket = 1;
  string prefix = 2;       // e.g. "tables/ds-123/segments"
}

// Location of a single object: a prefix plus the object's key under it.
message CloudObjectKey {
  // Prefix (including bucket) that contains the object.
  CloudObjectPrefix prefix = 1;
  string key = 2;           // e.g. "seg-456.ttseg"
}

// A time interval expressed in nanoseconds since the unix epoch.
// NOTE(review): whether end_time is inclusive or exclusive is not stated
// here — confirm before doing boundary comparisons.
message TimeRange {
  int64 start_time = 1; // in nanoseconds since the unix epoch
  int64 end_time = 2; // in nanoseconds since the unix epoch
}

// Description of a new segment to register. Used for merged segments in
// MergeRegistration.segment and CompleteSegmentMergeTaskRequest.merged_segment.
// Unlike SegmentInfo it carries no segment id.
message CreateSegment {
  // Table the segment belongs to.
  string table_id = 1;
  // Time range of the segment (nanoseconds since the unix epoch).
  // NOTE(review): presumably event time, given the separate
  // ingest_time_range field — confirm.
  TimeRange time_range = 2;
  // Segment size in bytes.
  int64 size_bytes = 3;
  // Compressed segment size in bytes.
  int64 compressed_size_bytes = 4;
  // Number of events in the segment. Being int32, this cannot represent
  // more than 2^31 - 1 events.
  int32 number_of_events = 5;
  // Storage location of the segment object.
  CloudObjectKey cloud_object_key = 6;
  // AUDT digest of the segment file's audit chunk (32 bytes). Empty
  // for legacy segments without an AUDT chunk. New merger outputs
  // populate it.
  bytes audt_hash = 7;
  // Hash algorithm used for `audt_hash`. Wire codes match
  // libs/segment_files/src/audt.rs::AudtAlgo
  // (0 = sha256, 1 = blake3-256, 2 = sha3-256). Ignored when
  // `audt_hash` is empty.
  uint32 audt_algo = 8;
  // Ingest-time min/max across rows in the segment, taken from the
  // merger's verifier (MIMX on `ingest_time`). Drives ingest-time slicing
  // in the coordinator and the freeze watermark for the map-reduce-state
  // cache. The invariant `event_time <= ingest_time` is enforced at write.
  TimeRange ingest_time_range = 9;
}

// A registered segment as returned by lookups and merge tasks.
// Note: field numbers differ from CreateSegment for the shared fields
// (e.g. table_id is 6 here, 1 there), so the two are not wire-interchangeable.
message SegmentInfo {
  // Segment identifier.
  string id = 1;
  // Time range of the segment (nanoseconds since the unix epoch).
  TimeRange time_range = 2;
  // Segment size in bytes.
  int64 size_bytes = 3;
  // Compressed segment size in bytes.
  int64 compressed_size_bytes = 4;
  // Number of events in the segment.
  int32 number_of_events = 5;
  // Table the segment belongs to.
  string table_id = 6;
  // Storage location of the segment object.
  CloudObjectKey cloud_object_key = 7;
  // AUDT digest of the segment file's audit chunk (32 bytes). Empty
  // for pre-existing v0 rows that have NULL `audt_hash` in meta; a
  // query worker opening the segment uses this to verify the file
  // against tampering at load time when present.
  bytes audt_hash = 8;
  // Hash algorithm wire code for `audt_hash` (see AudtAlgo). Ignored
  // when `audt_hash` is empty.
  uint32 audt_algo = 9;
  // Ingest-time min/max across rows in the segment, populated from the
  // merger's verifier. Seeded from `time_range` for rows older than
  // migration 010 (safe under the `event_time <= ingest_time` invariant).
  TimeRange ingest_time_range = 10;
}

// One sharding field of a table, with its weight.
message ShardingField {
  // Name of the field used for sharding.
  string field_name = 1;
  // Weight of this field.
  // NOTE(review): valid range and how weights combine across fields are not
  // stated here — confirm.
  double weight = 2;
}

// A merge task as handed to a merge worker by PollForSegmentMergeTask.
//
// NOTE(review): field number 3 is not used by this message. If it belonged
// to a removed field, confirm the .proto marks it `reserved` so it cannot be
// reused.
message SegmentMergeTaskInfo {
  // Merge task identifier; echoed back in Complete/FailSegmentMergeTaskRequest.
  string task_id = 1;
  // Table the segments belong to.
  string table_id = 2;
  // Input segments to merge.
  repeated SegmentInfo segments = 4;
  // Storage prefix associated with the task.
  // NOTE(review): presumably where the worker writes the merged output —
  // confirm.
  CloudObjectPrefix cloud_object_prefix = 5;
  // Sharding fields that accompany the task.
  // NOTE(review): presumably the table's sharding configuration — confirm.
  repeated ShardingField sharding_fields = 6;
}

// Identity and storage location of a table.
message TableInfo {
  // Table identifier.
  string id = 1;
  // Table name. Name lookups are scoped by `database_id`.
  string name = 2;
  // Storage prefix for the table's objects.
  CloudObjectPrefix cloud_object_prefix = 3;
  // PR 5: which database this table belongs to. Carries the
  // `tables.database_id` column added by migration 026. Required for
  // nursery routing to scope name lookups by database.
  string database_id = 4;
}

On this page