这是indexloc提供的服务,不要输入任何密码
Skip to content

Commit 6d1dca2

Browse files
Google APIs, copybara-github
authored and committed
feat: Add Type API updates needed to support structured keys in materialized views
feat: Add encodings for STRUCT and the Timestamp type PiperOrigin-RevId: 805031861
1 parent 8727b5b commit 6d1dca2

File tree

1 file changed

+195
-52
lines changed

1 file changed

+195
-52
lines changed

google/bigtable/v2/types.proto

Lines changed: 195 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -31,36 +31,41 @@ option ruby_package = "Google::Cloud::Bigtable::V2";
3131
// familiarity and consistency across products and features.
3232
//
3333
// For compatibility with Bigtable's existing untyped APIs, each `Type` includes
34-
// an `Encoding` which describes how to convert to/from the underlying data.
34+
// an `Encoding` which describes how to convert to or from the underlying data.
3535
//
36-
// Each encoding also defines the following properties:
36+
// Each encoding can operate in one of two modes:
3737
//
38-
// * Order-preserving: Does the encoded value sort consistently with the
39-
// original typed value? Note that Bigtable will always sort data based on
40-
// the raw encoded value, *not* the decoded type.
41-
// - Example: BYTES values sort in the same order as their raw encodings.
42-
// - Counterexample: Encoding INT64 as a fixed-width decimal string does
43-
// *not* preserve sort order when dealing with negative numbers.
44-
// `INT64(1) > INT64(-1)`, but `STRING("-00001") > STRING("00001)`.
45-
// * Self-delimiting: If we concatenate two encoded values, can we always tell
46-
// where the first one ends and the second one begins?
47-
// - Example: If we encode INT64s to fixed-width STRINGs, the first value
48-
// will always contain exactly N digits, possibly preceded by a sign.
49-
// - Counterexample: If we concatenate two UTF-8 encoded STRINGs, we have
50-
// no way to tell where the first one ends.
51-
// * Compatibility: Which other systems have matching encoding schemes? For
52-
// example, does this encoding have a GoogleSQL equivalent? HBase? Java?
38+
// - Sorted: In this mode, Bigtable guarantees that `Encode(X) <= Encode(Y)`
39+
// if and only if `X <= Y`. This is useful anywhere sort order is important,
40+
// for example when encoding keys.
41+
// - Distinct: In this mode, Bigtable guarantees that if `X != Y` then
42+
// `Encode(X) != Encode(Y)`. However, the converse is not guaranteed. For
43+
// example, both `{'foo': '1', 'bar': '2'}` and `{'bar': '2', 'foo': '1'}`
44+
// are valid encodings of the same JSON value.
45+
//
46+
// The API clearly documents which mode is used wherever an encoding can be
47+
// configured. Each encoding also documents which values are supported in which
48+
// modes. For example, when encoding INT64 as a numeric STRING, negative numbers
49+
// cannot be encoded in sorted mode. This is because `INT64(-1) > INT64(-2)`, but
50+
// `STRING("-00001") < STRING("-00002")`.
5351
message Type {
5452
// Bytes
5553
// Values of type `Bytes` are stored in `Value.bytes_value`.
5654
message Bytes {
57-
// Rules used to convert to/from lower level types.
55+
// Rules used to convert to or from lower level types.
5856
message Encoding {
59-
// Leaves the value "as-is"
60-
// * Order-preserving? Yes
61-
// * Self-delimiting? No
62-
// * Compatibility? N/A
63-
message Raw {}
57+
// Leaves the value as-is.
58+
//
59+
// Sorted mode: all values are supported.
60+
//
61+
// Distinct mode: all values are supported.
62+
message Raw {
63+
// If set, allows NULL values to be encoded as the empty string "".
64+
//
65+
// The actual empty string, or any value which only contains the
66+
// null byte `0x00`, has one more null byte appended.
67+
bool escape_nulls = 1;
68+
}
6469

6570
// Which encoding to use.
6671
oneof encoding {
@@ -69,28 +74,47 @@ message Type {
6974
}
7075
}
7176

72-
// The encoding to use when converting to/from lower level types.
77+
// The encoding to use when converting to or from lower level types.
7378
Encoding encoding = 1;
7479
}
7580

7681
// String
7782
// Values of type `String` are stored in `Value.string_value`.
7883
message String {
79-
// Rules used to convert to/from lower level types.
84+
// Rules used to convert to or from lower level types.
8085
message Encoding {
8186
// Deprecated: prefer the equivalent `Utf8Bytes`.
8287
message Utf8Raw {
8388
option deprecated = true;
8489
}
8590

86-
// UTF-8 encoding
87-
// * Order-preserving? Yes (code point order)
88-
// * Self-delimiting? No
89-
// * Compatibility?
90-
// - BigQuery Federation `TEXT` encoding
91-
// - HBase `Bytes.toBytes`
92-
// - Java `String#getBytes(StandardCharsets.UTF_8)`
93-
message Utf8Bytes {}
91+
// UTF-8 encoding.
92+
//
93+
// Sorted mode:
94+
// - All values are supported.
95+
// - Code point order is preserved.
96+
//
97+
// Distinct mode: all values are supported.
98+
//
99+
// Compatible with:
100+
//
101+
// - BigQuery `TEXT` encoding
102+
// - HBase `Bytes.toBytes`
103+
// - Java `String#getBytes(StandardCharsets.UTF_8)`
104+
message Utf8Bytes {
105+
// Single-character escape sequence used to support NULL values.
106+
//
107+
// If set, allows NULL values to be encoded as the empty string "".
108+
//
109+
// The actual empty string, or any value where every character equals
110+
// `null_escape_char`, has one more `null_escape_char` appended.
111+
//
112+
// If `null_escape_char` is set and does not equal the ASCII null
113+
// character `0x00`, then the encoding will not support sorted mode.
114+
//
115+
//
116+
string null_escape_char = 1;
117+
}
94118

95119
// Which encoding to use.
96120
oneof encoding {
@@ -102,36 +126,50 @@ message Type {
102126
}
103127
}
104128

105-
// The encoding to use when converting to/from lower level types.
129+
// The encoding to use when converting to or from lower level types.
106130
Encoding encoding = 1;
107131
}
108132

109133
// Int64
110134
// Values of type `Int64` are stored in `Value.int_value`.
111135
message Int64 {
112-
// Rules used to convert to/from lower level types.
136+
// Rules used to convert to or from lower level types.
113137
message Encoding {
114-
// Encodes the value as an 8-byte big endian twos complement `Bytes`
115-
// value.
116-
// * Order-preserving? No (positive values only)
117-
// * Self-delimiting? Yes
118-
// * Compatibility?
119-
// - BigQuery Federation `BINARY` encoding
120-
// - HBase `Bytes.toBytes`
121-
// - Java `ByteBuffer.putLong()` with `ByteOrder.BIG_ENDIAN`
138+
// Encodes the value as an 8-byte big-endian two's complement value.
139+
//
140+
// Sorted mode: non-negative values are supported.
141+
//
142+
// Distinct mode: all values are supported.
143+
//
144+
// Compatible with:
145+
//
146+
// - BigQuery `BINARY` encoding
147+
// - HBase `Bytes.toBytes`
148+
// - Java `ByteBuffer.putLong()` with `ByteOrder.BIG_ENDIAN`
122149
message BigEndianBytes {
123150
// Deprecated: ignored if set.
124-
Bytes bytes_type = 1;
151+
Bytes bytes_type = 1 [deprecated = true];
125152
}
126153

154+
// Encodes the value in a variable-length binary format of up to 10 bytes.
155+
// Values that are closer to zero use fewer bytes.
156+
//
157+
// Sorted mode: all values are supported.
158+
//
159+
// Distinct mode: all values are supported.
160+
message OrderedCodeBytes {}
161+
127162
// Which encoding to use.
128163
oneof encoding {
129164
// Use `BigEndianBytes` encoding.
130165
BigEndianBytes big_endian_bytes = 1;
166+
167+
// Use `OrderedCodeBytes` encoding.
168+
OrderedCodeBytes ordered_code_bytes = 2;
131169
}
132170
}
133171

134-
// The encoding to use when converting to/from lower level types.
172+
// The encoding to use when converting to or from lower level types.
135173
Encoding encoding = 1;
136174
}
137175

@@ -149,7 +187,24 @@ message Type {
149187

150188
// Timestamp
151189
// Values of type `Timestamp` are stored in `Value.timestamp_value`.
152-
message Timestamp {}
190+
message Timestamp {
191+
// Rules used to convert to or from lower level types.
192+
message Encoding {
193+
// Which encoding to use.
194+
oneof encoding {
195+
// Encodes the number of microseconds since the Unix epoch using the
196+
// given `Int64` encoding. Values must be microsecond-aligned.
197+
//
198+
// Compatible with:
199+
//
200+
// - Java `Instant.truncatedTo()` with `ChronoUnit.MICROS`
201+
Int64.Encoding unix_micros_int64 = 1;
202+
}
203+
}
204+
205+
// The encoding to use when converting to or from lower level types.
206+
Encoding encoding = 1;
207+
}
153208

154209
// Date
155210
// Values of type `Date` are stored in `Value.date_value`.
@@ -170,8 +225,97 @@ message Type {
170225
Type type = 2;
171226
}
172227

228+
// Rules used to convert to or from lower level types.
229+
message Encoding {
230+
// Uses the encoding of `fields[0].type` as-is.
231+
// Only valid if `fields.size == 1`.
232+
message Singleton {}
233+
234+
// Fields are encoded independently and concatenated with a configurable
235+
// `delimiter` in between.
236+
//
237+
// A struct with no fields defined is encoded as a single `delimiter`.
238+
//
239+
// Sorted mode:
240+
//
241+
// - Fields are encoded in sorted mode.
242+
// - Encoded field values must not contain any bytes <= `delimiter[0]`.
243+
// - Element-wise order is preserved: `A < B` if `A[0] < B[0]`, or if
244+
// `A[0] == B[0] && A[1] < B[1]`, etc. Strict prefixes sort first.
245+
//
246+
// Distinct mode:
247+
//
248+
// - Fields are encoded in distinct mode.
249+
// - Encoded field values must not contain `delimiter[0]`.
250+
message DelimitedBytes {
251+
// Byte sequence used to delimit concatenated fields. The delimiter must
252+
// contain at least 1 byte and at most 50 bytes.
253+
bytes delimiter = 1;
254+
}
255+
256+
// Fields are encoded independently and concatenated with the fixed byte
257+
// pair `{0x00, 0x01}` in between.
258+
//
259+
// Any null `(0x00)` byte in an encoded field is replaced by the fixed
260+
// byte pair `{0x00, 0xFF}`.
261+
//
262+
// Fields that encode to the empty string "" have special handling:
263+
//
264+
// - If *every* field encodes to "", or if the STRUCT has no fields
265+
// defined, then the STRUCT is encoded as the fixed byte pair
266+
// `{0x00, 0x00}`.
267+
// - Otherwise, the STRUCT only encodes until the last non-empty field,
268+
// omitting any trailing empty fields. Any empty fields that aren't
269+
// omitted are replaced with the fixed byte pair `{0x00, 0x00}`.
270+
//
271+
// Examples:
272+
//
273+
// ```
274+
// - STRUCT() -> "\00\00"
275+
// - STRUCT("") -> "\00\00"
276+
// - STRUCT("", "") -> "\00\00"
277+
// - STRUCT("", "B") -> "\00\00" + "\00\01" + "B"
278+
// - STRUCT("A", "") -> "A"
279+
// - STRUCT("", "B", "") -> "\00\00" + "\00\01" + "B"
280+
// - STRUCT("A", "", "C") -> "A" + "\00\01" + "\00\00" + "\00\01" + "C"
281+
// ```
282+
//
283+
//
284+
// Since null bytes are always escaped, this encoding can cause size
285+
// blowup for encodings like `Int64.BigEndianBytes` that are likely to
286+
// produce many such bytes.
287+
//
288+
// Sorted mode:
289+
//
290+
// - Fields are encoded in sorted mode.
291+
// - All values supported by the field encodings are allowed.
292+
// - Element-wise order is preserved: `A < B` if `A[0] < B[0]`, or if
293+
// `A[0] == B[0] && A[1] < B[1]`, etc. Strict prefixes sort first.
294+
//
295+
// Distinct mode:
296+
//
297+
// - Fields are encoded in distinct mode.
298+
// - All values supported by the field encodings are allowed.
299+
message OrderedCodeBytes {}
300+
301+
// Which encoding to use.
302+
oneof encoding {
303+
// Use `Singleton` encoding.
304+
Singleton singleton = 1;
305+
306+
// Use `DelimitedBytes` encoding.
307+
DelimitedBytes delimited_bytes = 2;
308+
309+
// Use `OrderedCodeBytes` encoding.
310+
OrderedCodeBytes ordered_code_bytes = 3;
311+
}
312+
}
313+
173314
// The names and types of the fields in this struct.
174315
repeated Field fields = 1;
316+
317+
// The encoding to use when converting to or from lower level types.
318+
Encoding encoding = 2;
175319
}
176320

177321
// A protobuf message type.
@@ -221,9 +365,9 @@ message Type {
221365

222366
// A value that combines incremental updates into a summarized value.
223367
//
224-
// Data is never directly written or read using type `Aggregate`. Writes will
225-
// provide either the `input_type` or `state_type`, and reads will always
226-
// return the `state_type` .
368+
// Data is never directly written or read using type `Aggregate`. Writes
369+
// provide either the `input_type` or `state_type`, and reads always return
370+
// the `state_type`.
227371
message Aggregate {
228372
// Computes the sum of the input values.
229373
// Allowed input: `Int64`
@@ -249,14 +393,13 @@ message Type {
249393
// Special state conversions: `Int64` (the unique count estimate)
250394
message HyperLogLogPlusPlusUniqueCount {}
251395

252-
// Type of the inputs that are accumulated by this `Aggregate`, which must
253-
// specify a full encoding.
396+
// Type of the inputs that are accumulated by this `Aggregate`.
254397
// Use `AddInput` mutations to accumulate new inputs.
255398
Type input_type = 1;
256399

257400
// Output only. Type that holds the internal accumulator state for the
258401
// `Aggregate`. This is a function of the `input_type` and `aggregator`
259-
// chosen, and will always specify a full encoding.
402+
// chosen.
260403
Type state_type = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
261404

262405
// Which aggregator function to use. The configured types must match.

0 commit comments

Comments
 (0)