@@ -31,36 +31,41 @@ option ruby_package = "Google::Cloud::Bigtable::V2";
3131// familiarity and consistency across products and features.
3232//
3333// For compatibility with Bigtable's existing untyped APIs, each `Type` includes
34- // an `Encoding` which describes how to convert to/ from the underlying data.
34+ // an `Encoding` which describes how to convert to or from the underlying data.
3535//
36- // Each encoding also defines the following properties :
36+ // Each encoding can operate in one of two modes:
3737//
38- // * Order-preserving: Does the encoded value sort consistently with the
39- // original typed value? Note that Bigtable will always sort data based on
40- // the raw encoded value, *not* the decoded type.
41- // - Example: BYTES values sort in the same order as their raw encodings.
42- // - Counterexample: Encoding INT64 as a fixed-width decimal string does
43- // *not* preserve sort order when dealing with negative numbers.
44- // `INT64(1) > INT64(-1)`, but `STRING("-00001") > STRING("00001)`.
45- // * Self-delimiting: If we concatenate two encoded values, can we always tell
46- // where the first one ends and the second one begins?
47- // - Example: If we encode INT64s to fixed-width STRINGs, the first value
48- // will always contain exactly N digits, possibly preceded by a sign.
49- // - Counterexample: If we concatenate two UTF-8 encoded STRINGs, we have
50- // no way to tell where the first one ends.
51- // * Compatibility: Which other systems have matching encoding schemes? For
52- // example, does this encoding have a GoogleSQL equivalent? HBase? Java?
38+ // - Sorted: In this mode, Bigtable guarantees that `Encode(X) <= Encode(Y)`
39+ // if and only if `X <= Y`. This is useful anywhere sort order is important,
40+ // for example when encoding keys.
41+ // - Distinct: In this mode, Bigtable guarantees that if `X != Y` then
42+ // `Encode(X) != Encode(Y)`. However, the converse is not guaranteed. For
43+ // example, both `{'foo': '1', 'bar': '2'}` and `{'bar': '2', 'foo': '1'}`
44+ // are valid encodings of the same JSON value.
45+ //
46+ // The API clearly documents which mode is used wherever an encoding can be
47+ // configured. Each encoding also documents which values are supported in which
48+ // modes. For example, when encoding INT64 as a numeric STRING, negative numbers
49+ // cannot be encoded in sorted mode. This is because `INT64(-1) > INT64(-2)`, but
50+ // `STRING("-00001") < STRING("-00002")`.
5351message Type {
5452 // Bytes
5553 // Values of type `Bytes` are stored in `Value.bytes_value`.
5654 message Bytes {
57- // Rules used to convert to/ from lower level types.
55+ // Rules used to convert to or from lower level types.
5856 message Encoding {
59- // Leaves the value "as-is"
60- // * Order-preserving? Yes
61- // * Self-delimiting? No
62- // * Compatibility? N/A
63- message Raw {}
57+ // Leaves the value as-is.
58+ //
59+ // Sorted mode: all values are supported.
60+ //
61+ // Distinct mode: all values are supported.
62+ message Raw {
63+ // If set, allows NULL values to be encoded as the empty string "".
64+ //
65+ // The actual empty string, or any value which only contains the
66+ // null byte `0x00`, has one more null byte appended.
67+ bool escape_nulls = 1 ;
68+ }
6469
6570 // Which encoding to use.
6671 oneof encoding {
@@ -69,28 +74,47 @@ message Type {
6974 }
7075 }
7176
72- // The encoding to use when converting to/ from lower level types.
77+ // The encoding to use when converting to or from lower level types.
7378 Encoding encoding = 1 ;
7479 }
7580
7681 // String
7782 // Values of type `String` are stored in `Value.string_value`.
7883 message String {
79- // Rules used to convert to/ from lower level types.
84+ // Rules used to convert to or from lower level types.
8085 message Encoding {
8186 // Deprecated: prefer the equivalent `Utf8Bytes`.
8287 message Utf8Raw {
8388 option deprecated = true ;
8489 }
8590
86- // UTF-8 encoding
87- // * Order-preserving? Yes (code point order)
88- // * Self-delimiting? No
89- // * Compatibility?
90- // - BigQuery Federation `TEXT` encoding
91- // - HBase `Bytes.toBytes`
92- // - Java `String#getBytes(StandardCharsets.UTF_8)`
93- message Utf8Bytes {}
91+ // UTF-8 encoding.
92+ //
93+ // Sorted mode:
94+ // - All values are supported.
95+ // - Code point order is preserved.
96+ //
97+ // Distinct mode: all values are supported.
98+ //
99+ // Compatible with:
100+ //
101+ // - BigQuery `TEXT` encoding
102+ // - HBase `Bytes.toBytes`
103+ // - Java `String#getBytes(StandardCharsets.UTF_8)`
104+ message Utf8Bytes {
105+ // Single-character escape sequence used to support NULL values.
106+ //
107+ // If set, allows NULL values to be encoded as the empty string "".
108+ //
109+ // The actual empty string, or any value where every character equals
110+ // `null_escape_char`, has one more `null_escape_char` appended.
111+ //
112+ // If `null_escape_char` is set and does not equal the ASCII null
113+ // character `0x00`, then the encoding will not support sorted mode.
114+ //
115+ // .
116+ string null_escape_char = 1 ;
117+ }
94118
95119 // Which encoding to use.
96120 oneof encoding {
@@ -102,36 +126,50 @@ message Type {
102126 }
103127 }
104128
105- // The encoding to use when converting to/ from lower level types.
129+ // The encoding to use when converting to or from lower level types.
106130 Encoding encoding = 1 ;
107131 }
108132
109133 // Int64
110134 // Values of type `Int64` are stored in `Value.int_value`.
111135 message Int64 {
112- // Rules used to convert to/ from lower level types.
136+ // Rules used to convert to or from lower level types.
113137 message Encoding {
114- // Encodes the value as an 8-byte big endian twos complement `Bytes`
115- // value.
116- // * Order-preserving? No (positive values only)
117- // * Self-delimiting? Yes
118- // * Compatibility?
119- // - BigQuery Federation `BINARY` encoding
120- // - HBase `Bytes.toBytes`
121- // - Java `ByteBuffer.putLong()` with `ByteOrder.BIG_ENDIAN`
138+ // Encodes the value as an 8-byte big-endian two's complement value.
139+ //
140+ // Sorted mode: non-negative values are supported.
141+ //
142+ // Distinct mode: all values are supported.
143+ //
144+ // Compatible with:
145+ //
146+ // - BigQuery `BINARY` encoding
147+ // - HBase `Bytes.toBytes`
148+ // - Java `ByteBuffer.putLong()` with `ByteOrder.BIG_ENDIAN`
122149 message BigEndianBytes {
123150 // Deprecated: ignored if set.
124- Bytes bytes_type = 1 ;
151+ Bytes bytes_type = 1 [ deprecated = true ] ;
125152 }
126153
154+ // Encodes the value in a variable length binary format of up to 10 bytes.
155+ // Values that are closer to zero use fewer bytes.
156+ //
157+ // Sorted mode: all values are supported.
158+ //
159+ // Distinct mode: all values are supported.
160+ message OrderedCodeBytes {}
161+
127162 // Which encoding to use.
128163 oneof encoding {
129164 // Use `BigEndianBytes` encoding.
130165 BigEndianBytes big_endian_bytes = 1 ;
166+
167+ // Use `OrderedCodeBytes` encoding.
168+ OrderedCodeBytes ordered_code_bytes = 2 ;
131169 }
132170 }
133171
134- // The encoding to use when converting to/ from lower level types.
172+ // The encoding to use when converting to or from lower level types.
135173 Encoding encoding = 1 ;
136174 }
137175
@@ -149,7 +187,24 @@ message Type {
149187
150188 // Timestamp
151189 // Values of type `Timestamp` are stored in `Value.timestamp_value`.
152- message Timestamp {}
190+ message Timestamp {
191+ // Rules used to convert to or from lower level types.
192+ message Encoding {
193+ // Which encoding to use.
194+ oneof encoding {
195+ // Encodes the number of microseconds since the Unix epoch using the
196+ // given `Int64` encoding. Values must be microsecond-aligned.
197+ //
198+ // Compatible with:
199+ //
200+ // - Java `Instant.truncatedTo()` with `ChronoUnit.MICROS`
201+ Int64.Encoding unix_micros_int64 = 1 ;
202+ }
203+ }
204+
205+ // The encoding to use when converting to or from lower level types.
206+ Encoding encoding = 1 ;
207+ }
153208
154209 // Date
155210 // Values of type `Date` are stored in `Value.date_value`.
@@ -170,8 +225,97 @@ message Type {
170225 Type type = 2 ;
171226 }
172227
228+ // Rules used to convert to or from lower level types.
229+ message Encoding {
230+ // Uses the encoding of `fields[0].type` as-is.
231+ // Only valid if `fields.size == 1`.
232+ message Singleton {}
233+
234+ // Fields are encoded independently and concatenated with a configurable
235+ // `delimiter` in between.
236+ //
237+ // A struct with no fields defined is encoded as a single `delimiter`.
238+ //
239+ // Sorted mode:
240+ //
241+ // - Fields are encoded in sorted mode.
242+ // - Encoded field values must not contain any bytes <= `delimiter[0]`.
243+ // - Element-wise order is preserved: `A < B` if `A[0] < B[0]`, or if
244+ // `A[0] == B[0] && A[1] < B[1]`, etc. Strict prefixes sort first.
245+ //
246+ // Distinct mode:
247+ //
248+ // - Fields are encoded in distinct mode.
249+ // - Encoded field values must not contain `delimiter[0]`.
250+ message DelimitedBytes {
251+ // Byte sequence used to delimit concatenated fields. The delimiter must
252+ // contain at least 1 byte and at most 50 bytes.
253+ bytes delimiter = 1 ;
254+ }
255+
256+ // Fields are encoded independently and concatenated with the fixed byte
257+ // pair `{0x00, 0x01}` in between.
258+ //
259+ // Any null `(0x00)` byte in an encoded field is replaced by the fixed
260+ // byte pair `{0x00, 0xFF}`.
261+ //
262+ // Fields that encode to the empty string "" have special handling:
263+ //
264+ // - If *every* field encodes to "", or if the STRUCT has no fields
265+ // defined, then the STRUCT is encoded as the fixed byte pair
266+ // `{0x00, 0x00}`.
267+ // - Otherwise, the STRUCT only encodes until the last non-empty field,
268+ // omitting any trailing empty fields. Any empty fields that aren't
269+ // omitted are replaced with the fixed byte pair `{0x00, 0x00}`.
270+ //
271+ // Examples:
272+ //
273+ // ```
274+ // - STRUCT() -> "\00\00"
275+ // - STRUCT("") -> "\00\00"
276+ // - STRUCT("", "") -> "\00\00"
277+ // - STRUCT("", "B") -> "\00\00" + "\00\01" + "B"
278+ // - STRUCT("A", "") -> "A"
279+ // - STRUCT("", "B", "") -> "\00\00" + "\00\01" + "B"
280+ // - STRUCT("A", "", "C") -> "A" + "\00\01" + "\00\00" + "\00\01" + "C"
281+ // ```
282+ //
283+ //
284+ // Since null bytes are always escaped, this encoding can cause size
285+ // blowup for encodings like `Int64.BigEndianBytes` that are likely to
286+ // produce many such bytes.
287+ //
288+ // Sorted mode:
289+ //
290+ // - Fields are encoded in sorted mode.
291+ // - All values supported by the field encodings are allowed.
292+ // - Element-wise order is preserved: `A < B` if `A[0] < B[0]`, or if
293+ // `A[0] == B[0] && A[1] < B[1]`, etc. Strict prefixes sort first.
294+ //
295+ // Distinct mode:
296+ //
297+ // - Fields are encoded in distinct mode.
298+ // - All values supported by the field encodings are allowed.
299+ message OrderedCodeBytes {}
300+
301+ // Which encoding to use.
302+ oneof encoding {
303+ // Use `Singleton` encoding.
304+ Singleton singleton = 1 ;
305+
306+ // Use `DelimitedBytes` encoding.
307+ DelimitedBytes delimited_bytes = 2 ;
308+
309+ // Use `OrderedCodeBytes` encoding.
310+ OrderedCodeBytes ordered_code_bytes = 3 ;
311+ }
312+ }
313+
173314 // The names and types of the fields in this struct.
174315 repeated Field fields = 1 ;
316+
317+ // The encoding to use when converting to or from lower level types.
318+ Encoding encoding = 2 ;
175319 }
176320
177321 // A protobuf message type.
@@ -221,9 +365,9 @@ message Type {
221365
222366 // A value that combines incremental updates into a summarized value.
223367 //
224- // Data is never directly written or read using type `Aggregate`. Writes will
225- // provide either the `input_type` or `state_type`, and reads will always
226- // return the `state_type` .
368+ // Data is never directly written or read using type `Aggregate`. Writes
369+ // provide either the `input_type` or `state_type`, and reads always return
370+ // the `state_type`.
227371 message Aggregate {
228372 // Computes the sum of the input values.
229373 // Allowed input: `Int64`
@@ -249,14 +393,13 @@ message Type {
249393 // Special state conversions: `Int64` (the unique count estimate)
250394 message HyperLogLogPlusPlusUniqueCount {}
251395
252- // Type of the inputs that are accumulated by this `Aggregate`, which must
253- // specify a full encoding.
396+ // Type of the inputs that are accumulated by this `Aggregate`.
254397 // Use `AddInput` mutations to accumulate new inputs.
255398 Type input_type = 1 ;
256399
257400 // Output only. Type that holds the internal accumulator state for the
258401 // `Aggregate`. This is a function of the `input_type` and `aggregator`
259- // chosen, and will always specify a full encoding .
402+ // chosen.
260403 Type state_type = 2 [(google.api.field_behavior ) = OUTPUT_ONLY ];
261404
262405 // Which aggregator function to use. The configured types must match.
0 commit comments