feat: Add Type API updates needed to support structured keys in materialized views

Google APIs · copybara-github · commit 6d1dca2b8e3d · 2025-09-09T13:04:59.000-07:00
feat: Add encodings for STRUCT and the Timestamp type

PiperOrigin-RevId: 805031861
diff --git a/google/bigtable/v2/types.proto b/google/bigtable/v2/types.proto
@@ -31,36 +31,41 @@ option ruby_package = "Google::Cloud::Bigtable::V2";
 // familiarity and consistency across products and features.
 //
 // For compatibility with Bigtable's existing untyped APIs, each `Type` includes
-// an `Encoding` which describes how to convert to/from the underlying data.
+// an `Encoding` which describes how to convert to or from the underlying data.
 //
-// Each encoding also defines the following properties:
+// Each encoding can operate in one of two modes:
 //
-//  * Order-preserving: Does the encoded value sort consistently with the
-//    original typed value? Note that Bigtable will always sort data based on
-//    the raw encoded value, *not* the decoded type.
-//     - Example: BYTES values sort in the same order as their raw encodings.
-//     - Counterexample: Encoding INT64 as a fixed-width decimal string does
-//       *not* preserve sort order when dealing with negative numbers.
-//       `INT64(1) > INT64(-1)`, but `STRING("-00001") > STRING("00001)`.
-//  * Self-delimiting: If we concatenate two encoded values, can we always tell
-//    where the first one ends and the second one begins?
-//     - Example: If we encode INT64s to fixed-width STRINGs, the first value
-//       will always contain exactly N digits, possibly preceded by a sign.
-//     - Counterexample: If we concatenate two UTF-8 encoded STRINGs, we have
-//       no way to tell where the first one ends.
-//  * Compatibility: Which other systems have matching encoding schemes? For
-//    example, does this encoding have a GoogleSQL equivalent? HBase? Java?
+//  - Sorted: In this mode, Bigtable guarantees that `Encode(X) <= Encode(Y)`
+//    if and only if `X <= Y`. This is useful anywhere sort order is important,
+//    for example when encoding keys.
+//  - Distinct: In this mode, Bigtable guarantees that if `X != Y` then
+//    `Encode(X) != Encode(Y)`. However, the converse is not guaranteed. For
+//    example, both `{"foo": "1", "bar": "2"}` and `{"bar": "2", "foo": "1"}`
+//    are valid encodings of the same JSON value.
+//
+// The API clearly documents which mode is used wherever an encoding can be
+// configured. Each encoding also documents which values are supported in which
+// modes. For example, when encoding INT64 as a numeric STRING, negative numbers
+// cannot be encoded in sorted mode. This is because `INT64(-1) > INT64(-2)`,
+// but `STRING("-00001") < STRING("-00002")`.
 message Type {
   // Bytes
   // Values of type `Bytes` are stored in `Value.bytes_value`.
   message Bytes {
-    // Rules used to convert to/from lower level types.
+    // Rules used to convert to or from lower level types.
     message Encoding {
-      // Leaves the value "as-is"
-      // * Order-preserving? Yes
-      // * Self-delimiting? No
-      // * Compatibility? N/A
-      message Raw {}
+      // Leaves the value as-is.
+      //
+      // Sorted mode: all values are supported.
+      //
+      // Distinct mode: all values are supported.
+      message Raw {
+        // If set, allows NULL values to be encoded as the empty string "".
+        //
+        // The actual empty string, or any value which only contains the
+        // null byte `0x00`, has one more null byte appended.
+        bool escape_nulls = 1;
+      }
 
       // Which encoding to use.
       oneof encoding {
@@ -69,28 +74,47 @@ message Type {
       }
     }
 
-    // The encoding to use when converting to/from lower level types.
+    // The encoding to use when converting to or from lower level types.
     Encoding encoding = 1;
   }
 
   // String
   // Values of type `String` are stored in `Value.string_value`.
   message String {
-    // Rules used to convert to/from lower level types.
+    // Rules used to convert to or from lower level types.
     message Encoding {
       // Deprecated: prefer the equivalent `Utf8Bytes`.
       message Utf8Raw {
         option deprecated = true;
       }
 
-      // UTF-8 encoding
-      // * Order-preserving? Yes (code point order)
-      // * Self-delimiting? No
-      // * Compatibility?
-      //    - BigQuery Federation `TEXT` encoding
-      //    - HBase `Bytes.toBytes`
-      //    - Java `String#getBytes(StandardCharsets.UTF_8)`
-      message Utf8Bytes {}
+      // UTF-8 encoding.
+      //
+      // Sorted mode:
+      //  - All values are supported.
+      //  - Code point order is preserved.
+      //
+      // Distinct mode: all values are supported.
+      //
+      // Compatible with:
+      //
+      //  - BigQuery `TEXT` encoding
+      //  - HBase `Bytes.toBytes`
+      //  - Java `String#getBytes(StandardCharsets.UTF_8)`
+      message Utf8Bytes {
+        // Single-character escape sequence used to support NULL values.
+        //
+        // If set, allows NULL values to be encoded as the empty string "".
+        //
+        // The actual empty string, or any value where every character equals
+        // `null_escape_char`, has one more `null_escape_char` appended.
+        //
+        // Sorted mode:
+        //
+        // If `null_escape_char` is set and does not equal the ASCII null
+        // character `0x00`, then the encoding will not support sorted mode.
+        string null_escape_char = 1;
+      }
 
       // Which encoding to use.
       oneof encoding {
@@ -102,36 +126,50 @@ message Type {
       }
     }
 
-    // The encoding to use when converting to/from lower level types.
+    // The encoding to use when converting to or from lower level types.
     Encoding encoding = 1;
   }
 
   // Int64
   // Values of type `Int64` are stored in `Value.int_value`.
   message Int64 {
-    // Rules used to convert to/from lower level types.
+    // Rules used to convert to or from lower level types.
     message Encoding {
-      // Encodes the value as an 8-byte big endian twos complement `Bytes`
-      // value.
-      // * Order-preserving? No (positive values only)
-      // * Self-delimiting? Yes
-      // * Compatibility?
-      //    - BigQuery Federation `BINARY` encoding
-      //    - HBase `Bytes.toBytes`
-      //    - Java `ByteBuffer.putLong()` with `ByteOrder.BIG_ENDIAN`
+      // Encodes the value as an 8-byte big-endian two's complement value.
+      //
+      // Sorted mode: non-negative values are supported.
+      //
+      // Distinct mode: all values are supported.
+      //
+      // Compatible with:
+      //
+      //  - BigQuery `BINARY` encoding
+      //  - HBase `Bytes.toBytes`
+      //  - Java `ByteBuffer.putLong()` with `ByteOrder.BIG_ENDIAN`
       message BigEndianBytes {
         // Deprecated: ignored if set.
-        Bytes bytes_type = 1;
+        Bytes bytes_type = 1 [deprecated = true];
       }
 
+      // Encodes the value in a variable-length binary format of up to 10 bytes.
+      // Values that are closer to zero use fewer bytes.
+      //
+      // Sorted mode: all values are supported.
+      //
+      // Distinct mode: all values are supported.
+      message OrderedCodeBytes {}
+
       // Which encoding to use.
       oneof encoding {
         // Use `BigEndianBytes` encoding.
         BigEndianBytes big_endian_bytes = 1;
+
+        // Use `OrderedCodeBytes` encoding.
+        OrderedCodeBytes ordered_code_bytes = 2;
       }
     }
 
-    // The encoding to use when converting to/from lower level types.
+    // The encoding to use when converting to or from lower level types.
     Encoding encoding = 1;
   }
 
@@ -149,7 +187,24 @@ message Type {
 
   // Timestamp
   // Values of type `Timestamp` are stored in `Value.timestamp_value`.
-  message Timestamp {}
+  message Timestamp {
+    // Rules used to convert to or from lower level types.
+    message Encoding {
+      // Which encoding to use.
+      oneof encoding {
+        // Encodes the number of microseconds since the Unix epoch using the
+        // given `Int64` encoding. Values must be microsecond-aligned.
+        //
+        // Compatible with:
+        //
+        //  - Java `Instant.truncatedTo()` with `ChronoUnit.MICROS`
+        Int64.Encoding unix_micros_int64 = 1;
+      }
+    }
+
+    // The encoding to use when converting to or from lower level types.
+    Encoding encoding = 1;
+  }
 
   // Date
   // Values of type `Date` are stored in `Value.date_value`.
@@ -170,8 +225,97 @@ message Type {
       Type type = 2;
     }
 
+    // Rules used to convert to or from lower level types.
+    message Encoding {
+      // Uses the encoding of `fields[0].type` as-is.
+      // Only valid if `fields.size == 1`.
+      message Singleton {}
+
+      // Fields are encoded independently and concatenated with a configurable
+      // `delimiter` in between.
+      //
+      // A struct with no fields defined is encoded as a single `delimiter`.
+      //
+      // Sorted mode:
+      //
+      //  - Fields are encoded in sorted mode.
+      //  - Encoded field values must not contain any bytes <= `delimiter[0]`.
+      //  - Element-wise order is preserved: `A < B` if `A[0] < B[0]`, or if
+      //    `A[0] == B[0] && A[1] < B[1]`, etc. Strict prefixes sort first.
+      //
+      // Distinct mode:
+      //
+      //  - Fields are encoded in distinct mode.
+      //  - Encoded field values must not contain `delimiter[0]`.
+      message DelimitedBytes {
+        // Byte sequence used to delimit concatenated fields. The delimiter must
+        // contain at least 1 byte and at most 50 bytes.
+        bytes delimiter = 1;
+      }
+
+      // Fields are encoded independently and concatenated with the fixed byte
+      // pair `{0x00, 0x01}` in between.
+      //
+      // Any null (`0x00`) byte in an encoded field is replaced by the fixed
+      // byte pair `{0x00, 0xFF}`.
+      //
+      // Fields that encode to the empty string "" have special handling:
+      //
+      //  - If *every* field encodes to "", or if the STRUCT has no fields
+      //    defined, then the STRUCT is encoded as the fixed byte pair
+      //    `{0x00, 0x00}`.
+      //  - Otherwise, the STRUCT only encodes until the last non-empty field,
+      //    omitting any trailing empty fields. Any empty fields that aren't
+      //    omitted are replaced with the fixed byte pair `{0x00, 0x00}`.
+      //
+      // Examples:
+      //
+      // ```
+      //  - STRUCT()             -> "\00\00"
+      //  - STRUCT("")           -> "\00\00"
+      //  - STRUCT("", "")       -> "\00\00"
+      //  - STRUCT("", "B")      -> "\00\00" + "\00\01" + "B"
+      //  - STRUCT("A", "")      -> "A"
+      //  - STRUCT("", "B", "")  -> "\00\00" + "\00\01" + "B"
+      //  - STRUCT("A", "", "C") -> "A" + "\00\01" + "\00\00" + "\00\01" + "C"
+      // ```
+      //
+      //
+      // Since null bytes are always escaped, this encoding can cause size
+      // blowup for encodings like `Int64.BigEndianBytes` that are likely to
+      // produce many such bytes.
+      //
+      // Sorted mode:
+      //
+      //  - Fields are encoded in sorted mode.
+      //  - All values supported by the field encodings are allowed.
+      //  - Element-wise order is preserved: `A < B` if `A[0] < B[0]`, or if
+      //    `A[0] == B[0] && A[1] < B[1]`, etc. Strict prefixes sort first.
+      //
+      // Distinct mode:
+      //
+      //  - Fields are encoded in distinct mode.
+      //  - All values supported by the field encodings are allowed.
+      message OrderedCodeBytes {}
+
+      // Which encoding to use.
+      oneof encoding {
+        // Use `Singleton` encoding.
+        Singleton singleton = 1;
+
+        // Use `DelimitedBytes` encoding.
+        DelimitedBytes delimited_bytes = 2;
+
+        // Use `OrderedCodeBytes` encoding.
+        OrderedCodeBytes ordered_code_bytes = 3;
+      }
+    }
+
     // The names and types of the fields in this struct.
     repeated Field fields = 1;
+
+    // The encoding to use when converting to or from lower level types.
+    Encoding encoding = 2;
   }
 
   // A protobuf message type.
@@ -221,9 +365,9 @@ message Type {
 
   // A value that combines incremental updates into a summarized value.
   //
-  // Data is never directly written or read using type `Aggregate`. Writes will
-  // provide either the `input_type` or `state_type`, and reads will always
-  // return the `state_type` .
+  // Data is never directly written or read using type `Aggregate`. Writes
+  // provide either the `input_type` or `state_type`, and reads always return
+  // the `state_type`.
   message Aggregate {
     // Computes the sum of the input values.
     // Allowed input: `Int64`
@@ -249,14 +393,13 @@ message Type {
     // Special state conversions: `Int64` (the unique count estimate)
     message HyperLogLogPlusPlusUniqueCount {}
 
-    // Type of the inputs that are accumulated by this `Aggregate`, which must
-    // specify a full encoding.
+    // Type of the inputs that are accumulated by this `Aggregate`.
     // Use `AddInput` mutations to accumulate new inputs.
     Type input_type = 1;
 
     // Output only. Type that holds the internal accumulator state for the
     // `Aggregate`. This is a function of the `input_type` and `aggregator`
-    // chosen, and will always specify a full encoding.
+    // chosen.
     Type state_type = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
 
     // Which aggregator function to use. The configured types must match.