diff --git a/.cargo/audit.toml b/.cargo/audit.toml
new file mode 100644
index 00000000..6ca240cb
--- /dev/null
+++ b/.cargo/audit.toml
@@ -0,0 +1,4 @@
+[advisories]
+ignore = [
+  "RUSTSEC-2024-0437" # in protobuf via prometheus, but we're not using proto so it shouldn't be an issue
+]
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 834776ce..3583317e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -30,11 +30,24 @@ jobs:
       - name: run linter checks with clippy 🔨
         run: nix build .#checks.x86_64-linux.lint --print-build-logs
 
-      - name: update rust-sec advisory db before scanning for vulnerabilities
-        run: nix flake lock --update-input advisory-db
-
-      - name: audit for reported security problems 🔨
-        run: nix build .#checks.x86_64-linux.audit --print-build-logs
-
       - name: run integration tests 📋
         run: nix develop --command just test-mongodb-versions
+
+  audit:
+    name: Security Audit
+    runs-on: ubuntu-24.04
+    steps:
+      - name: Checkout 🛎️
+        uses: actions/checkout@v3
+
+      - name: Install Nix ❄
+        uses: DeterminateSystems/nix-installer-action@v4
+
+      - name: Link Cachix 🔌
+        uses: cachix/cachix-action@v12
+        with:
+          name: '${{ vars.CACHIX_CACHE_NAME }}'
+          authToken: '${{ secrets.CACHIX_CACHE_AUTH_TOKEN }}'
+
+      - name: audit for reported security problems 🔨
+        run: nix develop --command cargo audit
diff --git a/.gitignore b/.gitignore
index 9bbaa564..bd97b4fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,9 @@
 debug/
 target/
 
+.cargo/*
+!.cargo/audit.toml
+
 # These are backup files generated by rustfmt
 **/*.rs.bk
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 27a2ae7b..e3495bca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,10 +8,31 @@ This changelog documents the changes between release versions.
 
 - Add uuid scalar type ([#148](https://github.com/hasura/ndc-mongodb/pull/148))
 
+### Changed
+
+- On database introspection, newly-added collection fields will be added to existing schema configurations ([#152](https://github.com/hasura/ndc-mongodb/pull/152))
+
 ### Fixed
 
 - Update dependencies to get fixes for reported security vulnerabilities ([#149](https://github.com/hasura/ndc-mongodb/pull/149))
 
+#### Changes to database introspection
+
+Previously, running introspection would not update existing schema definitions; it would only add definitions for
+newly-added collections. This release changes that behavior to make conservative changes to existing definitions:
+
+- added fields, either top-level or nested, will be added to existing schema definitions
+- types for fields that are already configured will **not** be changed automatically
+- fields that appear to have been removed from collections will **not** be removed from configurations
+
+We take this conservative approach to schema configuration changes because we want to avoid accidental breaking API
+changes, and because schema configurations can be edited by hand we don't want to accidentally reverse those
+edits.
+
+If you want to make type changes to fields that are already configured, or if you want to remove fields from schema
+configuration, you can either make those edits to schema configurations by hand, or you can delete schema files before
+running introspection.
+
 #### UUID scalar type
 
 Previously UUID values would show up in GraphQL as `BinData`. BinData is a generalized BSON type for binary data.
 It
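Editorial note: to make the three bullet points above concrete, here is a sketch — not part of this PR — of a unit test that could sit inside the `keep_backward_compatible_changes` module added further down in this diff. The types and the helper come from this diff; the test body itself is illustrative:

```rust
#[cfg(test)]
mod example {
    use super::*;
    use configuration::schema::{Collection, CollectionSchema, ObjectField, ObjectType, Type};
    use mongodb_support::BsonScalarType as S;

    fn field(t: Type) -> ObjectField {
        ObjectField { description: None, r#type: t }
    }

    #[test]
    fn merges_conservatively() {
        // Existing configuration: posts has a single String field, "title".
        let existing = CollectionSchema {
            collection: Collection { r#type: "posts".to_string().into(), description: None },
            object_types: [(
                "posts".to_string().into(),
                ObjectType {
                    description: None,
                    fields: [("title".to_string().into(), field(Type::Scalar(S::String)))].into(),
                },
            )]
            .into(),
        };
        // A fresh sample claims "title" is now an Int, and that a new field "body" exists.
        let sampled: ObjectTypes = [(
            "posts".to_string().into(),
            ObjectType {
                description: None,
                fields: [
                    ("title".to_string().into(), field(Type::Scalar(S::Int))),
                    ("body".to_string().into(), field(Type::Scalar(S::String))),
                ]
                .into(),
            },
        )]
        .into();

        let merged = keep_backward_compatible_changes(existing, sampled);
        let fields = &merged.object_types.get("posts").unwrap().fields;
        // The configured type wins over the sampled type change...
        assert_eq!(fields.get("title").unwrap().r#type, Type::Scalar(S::String));
        // ...but the newly-added field is picked up.
        assert_eq!(fields.get("body").unwrap().r#type, Type::Scalar(S::String));
    }
}
```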
diff --git a/Cargo.lock b/Cargo.lock
index 328c27b2..821d9243 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -462,7 +462,7 @@ dependencies = [
  "anyhow",
  "async-tempfile",
  "futures",
- "googletest",
+ "googletest 0.12.0",
  "itertools",
  "mongodb",
  "mongodb-support",
@@ -684,6 +684,12 @@ version = "0.1.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
 
+[[package]]
+name = "difflib"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8"
+
 [[package]]
 name = "digest"
 version = "0.10.7"
@@ -973,7 +979,19 @@ version = "0.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "22e38fa267f4db1a2fa51795ea4234eaadc3617a97486a9f158de9256672260e"
 dependencies = [
- "googletest_macro",
+ "googletest_macro 0.12.0",
  "num-traits",
  "regex",
  "rustversion",
 ]
 
+[[package]]
+name = "googletest"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dce026f84cdd339bf71be01b24fe67470ee634282f68c1c4b563d00a9f002b05"
+dependencies = [
+ "googletest_macro 0.13.0",
+ "num-traits",
+ "regex",
+ "rustversion",
+]
+
@@ -989,6 +1007,17 @@ dependencies = [
  "proc-macro2",
  "quote",
  "syn 2.0.66",
 ]
 
+[[package]]
+name = "googletest_macro"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f5070fa86976044fe2b004d874c10af5d1aed6d8f6a72ff93a6eb29cc87048bc"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.66",
+]
+
 [[package]]
 name = "h2"
 version = "0.3.26"
@@ -1589,6 +1618,17 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "json-structural-diff"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e878e36a8a44c158505c2c818abdc1350413ad83dcb774a0459f6a7ef2b65cbf"
+dependencies = [
+ "difflib",
+ "regex",
+ "serde_json",
+]
+
 [[package]]
 name = "lazy_static"
 version = "1.4.0"
@@ -1903,13 +1943,15 @@ dependencies = [
  "configuration",
  "enum-iterator",
  "futures-util",
- "googletest",
+ "googletest 0.13.0",
  "indexmap 2.2.6",
  "itertools",
+ "json-structural-diff",
  "mongodb",
  "mongodb-agent-common",
  "mongodb-support",
  "ndc-models",
+ "ndc-test-helpers",
  "nom",
  "nonempty",
  "pretty",
diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml
index 1ecc27c3..bbe736ce 100644
--- a/crates/cli/Cargo.toml
+++ b/crates/cli/Cargo.toml
@@ -19,6 +19,7 @@ enum-iterator = "^2.0.0"
 futures-util = "0.3.28"
 indexmap = { workspace = true }
 itertools = { workspace = true }
+json-structural-diff = "^0.2.0"
 ndc-models = { workspace = true }
 nom = { version = "^7.1.3", optional = true }
 nonempty = "^0.10.0"
@@ -35,7 +36,8 @@ tokio = { version = "1.36.0", features = ["full"] }
 mongodb-agent-common = { path = "../mongodb-agent-common", features = ["test-helpers"] }
 
 async-tempfile = "^0.6.0"
-googletest = "^0.12.0"
+googletest = "^0.13.0"
 pretty_assertions = "1"
 proptest = "1"
+ndc-test-helpers = { path = "../ndc-test-helpers" }
 test-helpers = { path = "../test-helpers" }
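Editorial note: `json-structural-diff` is the new dependency used to report ignored introspection changes. A minimal sketch of the one API this PR relies on, `JsonDiff::diff_string` (the JSON values here are made up):

```rust
use json_structural_diff::JsonDiff;
use serde_json::json;

fn main() {
    let written = json!({ "title": { "type": { "scalar": "string" } } });
    let sampled = json!({ "title": { "type": { "scalar": "int" } } });
    // Returns None when the two values are structurally equal; the third
    // argument matches the `false` passed in sampling.rs below.
    if let Some(diff) = JsonDiff::diff_string(&written, &sampled, false) {
        eprintln!("{diff}");
    }
}
```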
diff --git a/crates/cli/src/introspection/sampling.rs b/crates/cli/src/introspection/sampling.rs
index c0809fe9..4018f48c 100644
--- a/crates/cli/src/introspection/sampling.rs
+++ b/crates/cli/src/introspection/sampling.rs
@@ -1,23 +1,76 @@
-use std::collections::{BTreeMap, HashSet};
+mod keep_backward_compatible_changes;
+
+use std::collections::BTreeMap;
 
 use crate::log_warning;
 use super::type_unification::{make_nullable_field, unify_object_types, unify_type};
 use configuration::{
-    schema::{self, Type},
+    schema::{self, Collection, CollectionSchema, ObjectTypes, Type},
     Schema, WithName,
 };
 use futures_util::TryStreamExt;
+use json_structural_diff::JsonDiff;
 use mongodb::bson::{doc, spec::BinarySubtype, Binary, Bson, Document};
 use mongodb_agent_common::mongodb::{CollectionTrait as _, DatabaseTrait};
 use mongodb_support::{
     aggregate::{Pipeline, Stage},
-    BsonScalarType::{self, *},
+    BsonScalarType::{self, self as S},
 };
+use ndc_models::{CollectionName, ObjectTypeName};
+
+use self::keep_backward_compatible_changes::keep_backward_compatible_changes;
 
 type ObjectField = WithName<ndc_models::FieldName, schema::ObjectField>;
 type ObjectType = WithName<ndc_models::ObjectTypeName, schema::ObjectType>;
 
+#[derive(Default)]
+pub struct SampledSchema {
+    pub schemas: BTreeMap<String, Schema>,
+
+    /// Updates to existing schema configurations are made conservatively. These diffs show the
+    /// difference between each new configuration to be written to disk on the left, and the
+    /// schema that would have been written if starting from scratch on the right.
+    pub ignored_changes: BTreeMap<String, String>,
+}
+
+impl SampledSchema {
+    pub fn insert_collection(
+        &mut self,
+        name: impl std::fmt::Display,
+        collection: CollectionSchema,
+    ) {
+        self.schemas.insert(
+            name.to_string(),
+            Self::schema_from_collection(name, collection),
+        );
+    }
+
+    pub fn record_ignored_collection_changes(
+        &mut self,
+        name: impl std::fmt::Display,
+        before: &CollectionSchema,
+        after: &CollectionSchema,
+    ) -> Result<(), serde_json::error::Error> {
+        let a = serde_json::to_value(Self::schema_from_collection(&name, before.clone()))?;
+        let b = serde_json::to_value(Self::schema_from_collection(&name, after.clone()))?;
+        if let Some(diff) = JsonDiff::diff_string(&a, &b, false) {
+            self.ignored_changes.insert(name.to_string(), diff);
+        }
+        Ok(())
+    }
+
+    fn schema_from_collection(
+        name: impl std::fmt::Display,
+        collection: CollectionSchema,
+    ) -> Schema {
+        Schema {
+            collections: [(name.to_string().into(), collection.collection)].into(),
+            object_types: collection.object_types,
+        }
+    }
+}
+
 /// Sample from all collections in the database and return a Schema.
 /// Return an error if there are any errors accessing the database
 /// or if the types derived from the sample documents for a collection
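Editorial note: a minimal sketch of how the `SampledSchema` accumulator above is meant to be driven (the collection name and values are hypothetical; compare `sample_schema_from_db` below):

```rust
// `merged` is the conservative schema that will be written to disk;
// `fresh` is what sampling alone would have produced.
fn accumulate(merged: CollectionSchema, fresh: &CollectionSchema) -> SampledSchema {
    let mut sampled_schema = SampledSchema::default();
    // Serialization errors are ignored here, as the caller below does with `let _ =`.
    let _ = sampled_schema.record_ignored_collection_changes("posts", &merged, fresh);
    sampled_schema.insert_collection("posts", merged);
    sampled_schema
}
```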
@@ -25,39 +78,76 @@ type ObjectType = WithName<ndc_models::ObjectTypeName, schema::ObjectType>;
 pub async fn sample_schema_from_db(
     sample_size: u32,
     all_schema_nullable: bool,
-    config_file_changed: bool,
     db: &impl DatabaseTrait,
-    existing_schemas: &HashSet<String>,
-) -> anyhow::Result<BTreeMap<String, Schema>> {
-    let mut schemas = BTreeMap::new();
+    mut previously_defined_collections: BTreeMap<CollectionName, CollectionSchema>,
+) -> anyhow::Result<SampledSchema> {
+    let mut sampled_schema: SampledSchema = Default::default();
     let mut collections_cursor = db.list_collections().await?;
 
     while let Some(collection_spec) = collections_cursor.try_next().await? {
         let collection_name = collection_spec.name;
-        if !existing_schemas.contains(&collection_name) || config_file_changed {
-            let collection_schema = sample_schema_from_collection(
-                &collection_name,
-                sample_size,
-                all_schema_nullable,
-                db,
-            )
-            .await?;
-            if let Some(collection_schema) = collection_schema {
-                schemas.insert(collection_name, collection_schema);
-            } else {
-                log_warning!("could not find any documents to sample from collection, {collection_name} - skipping");
-            }
-        }
+
+        let previously_defined_collection =
+            previously_defined_collections.remove(collection_name.as_str());
+
+        // Use the previously-defined type name in case the user has customized it
+        let collection_type_name = previously_defined_collection
+            .as_ref()
+            .map(|c| c.collection.r#type.clone())
+            .unwrap_or_else(|| collection_name.clone().into());
+
+        let Some(collection_schema) = sample_schema_from_collection(
+            &collection_name,
+            collection_type_name.clone(),
+            sample_size,
+            all_schema_nullable,
+            db,
+        )
+        .await?
+        else {
+            log_warning!("could not find any documents to sample from collection {collection_name} - skipping");
+            continue;
+        };
+
+        let collection_schema = match previously_defined_collection {
+            Some(previously_defined_collection) => {
+                let backward_compatible_schema = keep_backward_compatible_changes(
+                    previously_defined_collection,
+                    collection_schema.object_types.clone(),
+                );
+                let _ = sampled_schema.record_ignored_collection_changes(
+                    &collection_name,
+                    &backward_compatible_schema,
+                    &collection_schema,
+                );
+                let updated_collection = Collection {
+                    r#type: collection_type_name,
+                    description: collection_schema
+                        .collection
+                        .description
+                        .or(backward_compatible_schema.collection.description),
+                };
+                CollectionSchema {
+                    collection: updated_collection,
+                    object_types: backward_compatible_schema.object_types,
+                }
+            }
+            None => collection_schema,
+        };
+
+        sampled_schema.insert_collection(collection_name, collection_schema);
     }
-    Ok(schemas)
+
+    Ok(sampled_schema)
 }
 
 async fn sample_schema_from_collection(
     collection_name: &str,
+    collection_type_name: ObjectTypeName,
     sample_size: u32,
     all_schema_nullable: bool,
     db: &impl DatabaseTrait,
-) -> anyhow::Result<Option<Schema>> {
+) -> anyhow::Result<Option<CollectionSchema>> {
     let options = None;
     let mut cursor = db
         .collection(collection_name)
@@ -72,7 +162,7 @@ async fn sample_schema_from_collection(
     let is_collection_type = true;
     while let Some(document) = cursor.try_next().await? {
         let object_types = make_object_type(
-            &collection_name.into(),
+            &collection_type_name,
             &document,
             is_collection_type,
             all_schema_nullable,
@@ -86,15 +176,12 @@
     if collected_object_types.is_empty() {
         Ok(None)
     } else {
-        let collection_info = WithName::named(
-            collection_name.into(),
-            schema::Collection {
-                description: None,
-                r#type: collection_name.into(),
-            },
-        );
-        Ok(Some(Schema {
-            collections: WithName::into_map([collection_info]),
+        let collection_info = schema::Collection {
+            description: None,
+            r#type: collection_type_name,
+        };
+        Ok(Some(CollectionSchema {
+            collection: collection_info,
             object_types: WithName::into_map(collected_object_types),
         }))
     }
@@ -184,12 +271,12 @@ fn make_field_type(
         (vec![], Type::Scalar(t))
     }
     match field_value {
-        Bson::Double(_) => scalar(Double),
-        Bson::String(_) => scalar(String),
+        Bson::Double(_) => scalar(S::Double),
+        Bson::String(_) => scalar(S::String),
         Bson::Array(arr) => {
             // Examine all elements of the array and take the union of the resulting types.
            let mut collected_otds = vec![];
-            let mut result_type = Type::Scalar(Undefined);
+            let mut result_type = Type::Scalar(S::Undefined);
             for elem in arr {
                 let (elem_collected_otds, elem_type) =
                     make_field_type(object_type_name, elem, all_schema_nullable);
@@ -212,29 +299,29 @@
             );
             (collected_otds, Type::Object(object_type_name.to_owned()))
         }
-        Bson::Boolean(_) => scalar(Bool),
-        Bson::Null => scalar(Null),
-        Bson::RegularExpression(_) => scalar(Regex),
-        Bson::JavaScriptCode(_) => scalar(Javascript),
-        Bson::JavaScriptCodeWithScope(_) => scalar(JavascriptWithScope),
-        Bson::Int32(_) => scalar(Int),
-        Bson::Int64(_) => scalar(Long),
-        Bson::Timestamp(_) => scalar(Timestamp),
+        Bson::Boolean(_) => scalar(S::Bool),
+        Bson::Null => scalar(S::Null),
+        Bson::RegularExpression(_) => scalar(S::Regex),
+        Bson::JavaScriptCode(_) => scalar(S::Javascript),
+        Bson::JavaScriptCodeWithScope(_) => scalar(S::JavascriptWithScope),
+        Bson::Int32(_) => scalar(S::Int),
+        Bson::Int64(_) => scalar(S::Long),
+        Bson::Timestamp(_) => scalar(S::Timestamp),
         Bson::Binary(Binary { subtype, .. }) => {
             if *subtype == BinarySubtype::Uuid {
-                scalar(UUID)
+                scalar(S::UUID)
             } else {
-                scalar(BinData)
+                scalar(S::BinData)
             }
         }
-        Bson::ObjectId(_) => scalar(ObjectId),
-        Bson::DateTime(_) => scalar(Date),
-        Bson::Symbol(_) => scalar(Symbol),
-        Bson::Decimal128(_) => scalar(Decimal),
-        Bson::Undefined => scalar(Undefined),
-        Bson::MaxKey => scalar(MaxKey),
-        Bson::MinKey => scalar(MinKey),
-        Bson::DbPointer(_) => scalar(DbPointer),
+        Bson::ObjectId(_) => scalar(S::ObjectId),
+        Bson::DateTime(_) => scalar(S::Date),
+        Bson::Symbol(_) => scalar(S::Symbol),
+        Bson::Decimal128(_) => scalar(S::Decimal),
+        Bson::Undefined => scalar(S::Undefined),
+        Bson::MaxKey => scalar(S::MaxKey),
+        Bson::MinKey => scalar(S::MinKey),
+        Bson::DbPointer(_) => scalar(S::DbPointer),
     }
 }
diff --git a/crates/cli/src/introspection/sampling/keep_backward_compatible_changes.rs b/crates/cli/src/introspection/sampling/keep_backward_compatible_changes.rs
new file mode 100644
index 00000000..6f710cad
--- /dev/null
+++ b/crates/cli/src/introspection/sampling/keep_backward_compatible_changes.rs
@@ -0,0 +1,156 @@
+use std::collections::BTreeMap;
+
+use configuration::schema::{CollectionSchema, ObjectField, ObjectType, Type};
+use itertools::Itertools as _;
+use ndc_models::ObjectTypeName;
+
+use super::ObjectTypes;
+
+pub fn keep_backward_compatible_changes(
+    existing_collection: CollectionSchema,
+    mut updated_object_types: ObjectTypes,
+) -> CollectionSchema {
+    let mut accumulated_new_object_types = Default::default();
+    let CollectionSchema {
+        collection,
+        object_types: mut previously_defined_object_types,
+    } = existing_collection;
+    backward_compatible_helper(
+        &mut previously_defined_object_types,
+        &mut updated_object_types,
+        &mut accumulated_new_object_types,
+        collection.r#type.clone(),
+    );
+    CollectionSchema {
+        collection,
+        object_types: accumulated_new_object_types,
+    }
+}
+
+fn backward_compatible_helper(
+    previously_defined_object_types: &mut ObjectTypes,
+    updated_object_types: &mut ObjectTypes,
+    accumulated_new_object_types: &mut ObjectTypes,
+    type_name: ObjectTypeName,
+) {
+    if accumulated_new_object_types.contains_key(&type_name) {
+        return;
+    }
+    let existing = previously_defined_object_types.remove(&type_name);
+    let updated = updated_object_types.remove(&type_name);
+    match (existing, updated) {
+        (Some(existing), Some(updated)) => {
+            let object_type = backward_compatible_object_type(
+                previously_defined_object_types,
+                updated_object_types,
+                accumulated_new_object_types,
+                existing,
+                updated,
+            );
+            accumulated_new_object_types.insert(type_name, object_type);
+        }
+        (Some(existing), None) => {
+            accumulated_new_object_types.insert(type_name, existing);
+        }
+        (None, Some(updated)) => {
+            accumulated_new_object_types.insert(type_name, updated);
+        }
+        // shouldn't be reachable
+        (None, None) => (),
+    }
+}
+
+fn backward_compatible_object_type(
+    previously_defined_object_types: &mut ObjectTypes,
+    updated_object_types: &mut ObjectTypes,
+    accumulated_new_object_types: &mut ObjectTypes,
+    existing: ObjectType,
+    mut updated: ObjectType,
+) -> ObjectType {
+    let field_names = updated
+        .fields
+        .keys()
+        .chain(existing.fields.keys())
+        .unique()
+        .cloned()
+        .collect_vec();
+    let fields = field_names
+        .into_iter()
+        .map(|name| {
+            let existing_field = existing.fields.get(&name);
+            let updated_field = updated.fields.remove(&name);
+            let field = match (existing_field, updated_field) {
+                (Some(existing_field), Some(updated_field)) => {
+                    let r#type = reconcile_types(
+                        previously_defined_object_types,
+                        updated_object_types,
+                        accumulated_new_object_types,
+                        existing_field.r#type.clone(),
+                        updated_field.r#type,
+                    );
+                    ObjectField {
+                        description: existing_field.description.clone().or(updated_field.description),
+                        r#type,
+                    }
+                }
+                (Some(existing_field), None) => existing_field.clone(),
+                (None, Some(updated_field)) => updated_field,
+                (None, None) => unreachable!(),
+            };
+            (name.clone(), field)
+        })
+        .collect();
+    ObjectType {
+        description: existing.description.clone().or(updated.description),
+        fields,
+    }
+}
+
+fn reconcile_types(
+    previously_defined_object_types: &mut BTreeMap<ObjectTypeName, ObjectType>,
+    updated_object_types: &mut BTreeMap<ObjectTypeName, ObjectType>,
+    accumulated_new_object_types: &mut BTreeMap<ObjectTypeName, ObjectType>,
+    existing_type: Type,
+    updated_type: Type,
+) -> Type {
+    match (existing_type, updated_type) {
+        (Type::Nullable(a), Type::Nullable(b)) => Type::Nullable(Box::new(reconcile_types(
+            previously_defined_object_types,
+            updated_object_types,
+            accumulated_new_object_types,
+            *a,
+            *b,
+        ))),
+        (Type::Nullable(a), b) => Type::Nullable(Box::new(reconcile_types(
+            previously_defined_object_types,
+            updated_object_types,
+            accumulated_new_object_types,
+            *a,
+            b,
+        ))),
+        (a, Type::Nullable(b)) => reconcile_types(
+            previously_defined_object_types,
+            updated_object_types,
+            accumulated_new_object_types,
+            a,
+            *b,
+        ),
+        (Type::ArrayOf(a), Type::ArrayOf(b)) => Type::ArrayOf(Box::new(reconcile_types(
+            previously_defined_object_types,
+            updated_object_types,
+            accumulated_new_object_types,
+            *a,
+            *b,
+        ))),
+        (Type::Object(_), Type::Object(b)) => {
+            backward_compatible_helper(
+                previously_defined_object_types,
+                updated_object_types,
+                accumulated_new_object_types,
+                b.clone().into(),
+            );
+            Type::Object(b)
+        }
+        (a, _) => a,
+    }
+}
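Editorial note: a compact summary of what `reconcile_types` produces for a few concrete inputs, assuming the helpers were called with empty object-type maps; `S` abbreviates `BsonScalarType` as in sampling.rs, and the `Type::Scalar(...)` wrappers are omitted for brevity:

```rust
// existing type           sampled type             reconciled result
// ----------------------  -----------------------  ------------------------
// Nullable(S::String)     Nullable(S::Int)         Nullable(S::String)    // existing scalar wins
// Nullable(S::String)     S::String                Nullable(S::String)    // configured nullability kept
// S::String               Nullable(S::String)      S::String              // sampled nullability ignored
// ArrayOf(S::String)      ArrayOf(S::Int)          ArrayOf(S::String)     // element types reconciled
// Object("posts_nested")  Object("posts_nested")   Object("posts_nested") // recurses into the named type
// S::String               S::Int                   S::String              // fallback: keep the existing type
```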
diff --git a/crates/cli/src/lib.rs b/crates/cli/src/lib.rs
index 57bae3d1..95f90e13 100644
--- a/crates/cli/src/lib.rs
+++ b/crates/cli/src/lib.rs
@@ -13,6 +13,8 @@ use std::path::PathBuf;
 
 use clap::{Parser, Subcommand};
+use configuration::SCHEMA_DIRNAME;
+use introspection::sampling::SampledSchema;
 // Exported for use in tests
 pub use introspection::type_from_bson;
 use mongodb_agent_common::{mongodb::DatabaseTrait, state::try_init_state_from_uri};
@@ -91,7 +93,6 @@ async fn update(
             .all_schema_nullable
         }
     };
-    let config_file_changed = configuration::get_config_file_changed(&context.path).await?;
 
     if !no_validator_schema {
         let schemas_from_json_validation =
@@ -99,14 +100,35 @@ async fn update(
         configuration::write_schema_directory(&context.path, schemas_from_json_validation).await?;
     }
 
-    let existing_schemas = configuration::list_existing_schemas(&context.path).await?;
-    let schemas_from_sampling = introspection::sample_schema_from_db(
+    let existing_schemas = configuration::read_existing_schemas(&context.path).await?;
+    let SampledSchema {
+        schemas: schemas_from_sampling,
+        ignored_changes,
+    } = introspection::sample_schema_from_db(
         sample_size,
         all_schema_nullable,
-        config_file_changed,
         database,
-        &existing_schemas,
+        existing_schemas,
     )
     .await?;
-    configuration::write_schema_directory(&context.path, schemas_from_sampling).await
+    configuration::write_schema_directory(&context.path, schemas_from_sampling).await?;
+
+    if !ignored_changes.is_empty() {
+        eprintln!("Warning: introspection detected some changes to the database that were **not** applied to existing
+schema configurations. To avoid accidental breaking changes, the introspection system is
+conservative about what changes are applied automatically.");
+        eprintln!();
+        eprintln!("To apply changes, delete the schema configuration files you want updated and run introspection
+again; or edit the files directly.");
+        eprintln!();
+        eprintln!("These database changes were **not** applied:");
+    }
+    for (collection_name, changes) in ignored_changes {
+        let mut config_path = context.path.join(SCHEMA_DIRNAME).join(collection_name);
+        config_path.set_extension("json");
+        eprintln!();
+        eprintln!("{}:", config_path.to_string_lossy());
+        eprintln!("{}", changes)
+    }
+    Ok(())
 }
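Editorial note: with the warning above in place, a re-introspection run that ignored a type change on a `posts` collection would print something like the following. This is hypothetical output; the message text comes from the code above, and the diff body would come from `json-structural-diff`:

```
Warning: introspection detected some changes to the database that were **not** applied to existing
schema configurations. To avoid accidental breaking changes, the introspection system is
conservative about what changes are applied automatically.

To apply changes, delete the schema configuration files you want updated and run introspection
again; or edit the files directly.

These database changes were **not** applied:

schema/posts.json:
<structural diff between the configuration kept on disk and the freshly-sampled schema>
```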
diff --git a/crates/cli/src/tests.rs b/crates/cli/src/tests.rs
index b41ef57e..a18e80ab 100644
--- a/crates/cli/src/tests.rs
+++ b/crates/cli/src/tests.rs
@@ -1,8 +1,18 @@
+use std::path::Path;
+
 use async_tempfile::TempDir;
-use configuration::read_directory;
-use mongodb::bson::{self, doc, from_document};
-use mongodb_agent_common::mongodb::{test_helpers::mock_stream, MockDatabaseTrait};
+use configuration::{read_directory, Configuration};
+use googletest::prelude::*;
+use itertools::Itertools as _;
+use mongodb::{
+    bson::{self, doc, from_document, Bson},
+    options::AggregateOptions,
+};
+use mongodb_agent_common::mongodb::{
+    test_helpers::mock_stream, MockCollectionTrait, MockDatabaseTrait,
+};
 use ndc_models::{CollectionName, FieldName, ObjectField, ObjectType, Type};
+use ndc_test_helpers::{array_of, named_type, nullable, object_type};
 use pretty_assertions::assert_eq;
 
 use crate::{update, Context, UpdateArgs};
@@ -80,6 +90,211 @@ async fn validator_object_with_no_properties_becomes_extended_json_object() -> anyhow::Result<()> {
     Ok(())
 }
 
+#[gtest]
+#[tokio::test]
+async fn adds_new_fields_on_re_introspection() -> anyhow::Result<()> {
+    let config_dir = TempDir::new().await?;
+    schema_from_sampling(
+        &config_dir,
+        vec![doc! { "title": "First post!", "author": "Alice" }],
+    )
+    .await?;
+
+    // re-introspect after database changes
+    let configuration = schema_from_sampling(
+        &config_dir,
+        vec![doc! { "title": "First post!", "author": "Alice", "body": "Hello, world!" }],
+    )
+    .await?;
+
+    let updated_type = configuration
+        .object_types
+        .get("posts")
+        .expect("got posts collection type");
+
+    expect_that!(
+        updated_type.fields,
+        unordered_elements_are![
+            (
+                displays_as(eq("title")),
+                field!(ObjectField.r#type, eq(&named_type("String")))
+            ),
+            (
+                displays_as(eq("author")),
+                field!(ObjectField.r#type, eq(&named_type("String")))
+            ),
+            (
+                displays_as(eq("body")),
+                field!(ObjectField.r#type, eq(&named_type("String")))
+            ),
+        ]
+    );
+    Ok(())
+}
+
+#[gtest]
+#[tokio::test]
+async fn changes_from_re_introspection_are_additive_only() -> anyhow::Result<()> {
+    let config_dir = TempDir::new().await?;
+    schema_from_sampling(
+        &config_dir,
+        vec![
+            doc! {
+                "created_at": "2025-07-03T02:31Z",
+                "removed_field": true,
+                "author": "Alice",
+                "nested": {
+                    "scalar_type_changed": 1,
+                    "removed": 1,
+                    "made_nullable": 1,
+                },
+                "nested_array": [{
+                    "scalar_type_changed": 1,
+                    "removed": 1,
+                    "made_nullable": 1,
+                }],
+                "nested_nullable": {
+                    "scalar_type_changed": 1,
+                    "removed": 1,
+                    "made_nullable": 1,
+                }
+            },
+            doc! {
+                "created_at": "2025-07-03T02:31Z",
+                "removed_field": true,
+                "author": "Alice",
+                "nested": {
+                    "scalar_type_changed": 1,
+                    "removed": 1,
+                    "made_nullable": 1,
+                },
+                "nested_array": [{
+                    "scalar_type_changed": 1,
+                    "removed": 1,
+                    "made_nullable": 1,
+                }],
+                "nested_nullable": null,
+            },
+        ],
+    )
+    .await?;
+
+    // re-introspect after database changes
+    let configuration = schema_from_sampling(
+        &config_dir,
+        vec![
+            doc! {
+                "created_at": Bson::DateTime(bson::DateTime::from_millis(1741372252881)),
+                "author": "Alice",
+                "nested": {
+                    "scalar_type_changed": true,
+                    "made_nullable": 1,
+                },
+                "nested_array": [{
+                    "scalar_type_changed": true,
+                    "made_nullable": 1,
+                }],
+                "nested_nullable": {
+                    "scalar_type_changed": true,
+                    "made_nullable": 1,
+                }
+            },
+            doc! {
+                "created_at": Bson::DateTime(bson::DateTime::from_millis(1741372252881)),
+                "author": null,
+                "nested": {
+                    "scalar_type_changed": true,
+                    "made_nullable": null,
+                },
+                "nested_array": [{
+                    "scalar_type_changed": true,
+                    "made_nullable": null,
+                }],
+                "nested_nullable": null,
+            },
+        ],
+    )
+    .await?;
+
+    let updated_type = configuration
+        .object_types
+        .get("posts")
+        .expect("got posts collection type");
+
+    expect_that!(
+        updated_type.fields,
+        unordered_elements_are![
+            (
+                displays_as(eq("created_at")),
+                field!(ObjectField.r#type, eq(&named_type("String")))
+            ),
+            (
+                displays_as(eq("removed_field")),
+                field!(ObjectField.r#type, eq(&named_type("Bool")))
+            ),
+            (
+                displays_as(eq("author")),
+                field!(ObjectField.r#type, eq(&named_type("String")))
+            ),
+            (
+                displays_as(eq("nested")),
+                field!(ObjectField.r#type, eq(&named_type("posts_nested")))
+            ),
+            (
+                displays_as(eq("nested_array")),
+                field!(
+                    ObjectField.r#type,
+                    eq(&array_of(named_type("posts_nested_array")))
+                )
+            ),
+            (
+                displays_as(eq("nested_nullable")),
+                field!(
+                    ObjectField.r#type,
+                    eq(&nullable(named_type("posts_nested_nullable")))
+                )
+            ),
+        ]
+    );
+    expect_that!(
+        configuration.object_types,
+        contains_each![
+            (
+                displays_as(eq("posts_nested")),
+                eq(&object_type([
+                    ("scalar_type_changed", named_type("Int")),
+                    ("removed", named_type("Int")),
+                    ("made_nullable", named_type("Int")),
+                ]))
+            ),
+            (
+                displays_as(eq("posts_nested_array")),
+                eq(&object_type([
+                    ("scalar_type_changed", named_type("Int")),
+                    ("removed", named_type("Int")),
+                    ("made_nullable", named_type("Int")),
+                ]))
+            ),
+            (
+                displays_as(eq("posts_nested_nullable")),
+                eq(&object_type([
+                    ("scalar_type_changed", named_type("Int")),
+                    ("removed", named_type("Int")),
+                    ("made_nullable", named_type("Int")),
+                ]))
+            ),
+        ]
+    );
+    Ok(())
+}
+
 async fn collection_schema_from_validator(validator: bson::Document) -> anyhow::Result<ObjectType> {
     let mut db = MockDatabaseTrait::new();
     let config_dir = TempDir::new().await?;
@@ -112,6 +327,14 @@ async fn collection_schema_from_validator(validator: bson::Document) -> anyhow::Result<ObjectType> {
         )]))
     });
 
+    db.expect_collection().returning(|_collection_name| {
+        let mut collection = MockCollectionTrait::new();
+        collection
+            .expect_aggregate()
+            .returning(|_pipeline, _options: Option<AggregateOptions>| Ok(mock_stream(vec![])));
+        collection
+    });
+
     update(&context, &args, &db).await?;
 
     let configuration = read_directory(config_dir).await?;
@@ -127,3 +350,54 @@ async fn collection_schema_from_validator(validator: bson::Document) -> anyhow::Result<ObjectType> {
 
     Ok(collection_object_type.clone())
 }
+
+async fn schema_from_sampling(
+    config_dir: &Path,
+    sampled_documents: Vec<bson::Document>,
+) -> anyhow::Result<Configuration> {
+    let mut db = MockDatabaseTrait::new();
+
+    let context = Context {
+        path: config_dir.to_path_buf(),
+        connection_uri: None,
+        display_color: false,
+    };
+
+    let args = UpdateArgs {
+        sample_size: Some(100),
+        no_validator_schema: None,
+        all_schema_nullable: Some(false),
+    };
+
+    db.expect_list_collections().returning(move || {
+        let collection_spec = doc! {
+            "name": "posts",
+            "type": "collection",
+            "options": {},
+            "info": { "readOnly": false },
+        };
+        Ok(mock_stream(vec![Ok(
+            from_document(collection_spec).unwrap()
+        )]))
+    });
+
+    db.expect_collection().returning(move |_collection_name| {
+        let mut collection = MockCollectionTrait::new();
+        let sample_results = sampled_documents
+            .iter()
+            .cloned()
+            .map(Ok::<_, mongodb::error::Error>)
+            .collect_vec();
+        collection.expect_aggregate().returning(
+            move |_pipeline, _options: Option<AggregateOptions>| {
+                Ok(mock_stream(sample_results.clone()))
+            },
+        );
+        collection
+    });
+
+    update(&context, &args, &db).await?;
+
+    let configuration = read_directory(config_dir).await?;
+    Ok(configuration)
+}
diff --git a/crates/configuration/src/directory.rs b/crates/configuration/src/directory.rs
index 262d5f6d..0bff4130 100644
--- a/crates/configuration/src/directory.rs
+++ b/crates/configuration/src/directory.rs
@@ -1,18 +1,18 @@
 use anyhow::{anyhow, Context as _};
 use futures::stream::TryStreamExt as _;
 use itertools::Itertools as _;
-use ndc_models::FunctionName;
+use ndc_models::{CollectionName, FunctionName};
 use serde::{Deserialize, Serialize};
 use std::{
-    collections::{BTreeMap, HashSet},
-    fs::Metadata,
+    collections::BTreeMap,
     path::{Path, PathBuf},
 };
-use tokio::{fs, io::AsyncWriteExt};
+use tokio::fs;
 use tokio_stream::wrappers::ReadDirStream;
 
 use crate::{
     configuration::ConfigurationOptions,
+    schema::CollectionSchema,
     serialized::{NativeQuery, Schema},
     with_name::WithName,
     Configuration,
@@ -22,7 +22,6 @@ pub const SCHEMA_DIRNAME: &str = "schema";
 pub const NATIVE_MUTATIONS_DIRNAME: &str = "native_mutations";
 pub const NATIVE_QUERIES_DIRNAME: &str = "native_queries";
 pub const CONFIGURATION_OPTIONS_BASENAME: &str = "configuration";
-pub const CONFIGURATION_OPTIONS_METADATA: &str = ".configuration_metadata";
 
 // Deprecated: Discussion came out that we standardize names and the decision
 // was to use `native_mutations`. We should leave this in for a few releases
@@ -207,7 +206,6 @@ pub async fn parse_configuration_options_file(dir: &Path) -> anyhow::Result<ConfigurationOptions>
 
-pub async fn list_existing_schemas(
+pub async fn read_existing_schemas(
     configuration_dir: impl AsRef<Path>,
-) -> anyhow::Result<HashSet<String>> {
+) -> anyhow::Result<BTreeMap<CollectionName, CollectionSchema>> {
     let dir = configuration_dir.as_ref();
-    // TODO: we don't really need to read and parse all the schema files here, just get their names.
-    let schemas = read_subdir_configs::<_, Schema>(&dir.join(SCHEMA_DIRNAME), &[])
+    let schemas = read_subdir_configs::<CollectionName, Schema>(&dir.join(SCHEMA_DIRNAME), &[])
         .await?
         .unwrap_or_default();
-    Ok(schemas.into_keys().collect())
-}
-
-// Metadata file is just a dot filed used for the purposes of know if the user has updated their config to force refresh
-// of the schema introspection.
-async fn write_config_metadata_file(configuration_dir: impl AsRef<Path>) {
-    let dir = configuration_dir.as_ref();
-    let file_result = fs::OpenOptions::new()
-        .write(true)
-        .truncate(true)
-        .create(true)
-        .open(dir.join(CONFIGURATION_OPTIONS_METADATA))
-        .await;
-
-    if let Ok(mut file) = file_result {
-        let _ = file.write_all(b"").await;
-    };
-}
-
-pub async fn get_config_file_changed(dir: impl AsRef<Path>) -> anyhow::Result<bool> {
-    let path = dir.as_ref();
-    let dot_metadata: Result<Metadata, _> =
-        fs::metadata(&path.join(CONFIGURATION_OPTIONS_METADATA)).await;
-    let json_metadata =
-        fs::metadata(&path.join(CONFIGURATION_OPTIONS_BASENAME.to_owned() + ".json")).await;
-    let yaml_metadata =
-        fs::metadata(&path.join(CONFIGURATION_OPTIONS_BASENAME.to_owned() + ".yaml")).await;
-
-    let compare = |dot_date, config_date| async move {
-        if dot_date < config_date {
-            let _ = write_config_metadata_file(path).await;
-            Ok(true)
-        } else {
-            Ok(false)
-        }
-    };
+    // Get a single collection schema out of each file
+    let schemas = schemas
+        .into_iter()
+        .flat_map(|(name, schema)| {
+            let mut collections = schema.collections.into_iter().collect_vec();
+            let (collection_name, collection) = collections.pop()?;
+            if !collections.is_empty() {
+                return Some(Err(anyhow!("found schemas for multiple collections in {SCHEMA_DIRNAME}/{name}.json - please limit schema configurations to one collection per file")));
+            }
+            Some(Ok((collection_name, CollectionSchema {
+                collection,
+                object_types: schema.object_types,
+            })))
+        })
+        .collect::<anyhow::Result<BTreeMap<_, _>>>()?;
 
-    match (dot_metadata, json_metadata, yaml_metadata) {
-        (Ok(dot), Ok(json), _) => compare(dot.modified()?, json.modified()?).await,
-        (Ok(dot), _, Ok(yaml)) => compare(dot.modified()?, yaml.modified()?).await,
-        _ => Ok(true),
-    }
+    Ok(schemas)
 }
 
 #[cfg(test)]
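Editorial note: a small sketch of how the new `read_existing_schemas` function is consumed — the CLI's `update` above does exactly this before sampling; the printing here is illustrative:

```rust
use configuration::read_existing_schemas;

async fn show_existing(path: &std::path::Path) -> anyhow::Result<()> {
    // One CollectionSchema per schema/*.json file, keyed by collection name.
    let schemas = read_existing_schemas(path).await?;
    for (collection_name, collection_schema) in &schemas {
        println!("{}: {} object types", collection_name, collection_schema.object_types.len());
    }
    Ok(())
}
```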
diff --git a/crates/configuration/src/lib.rs b/crates/configuration/src/lib.rs
index 798f232c..9e0402a2 100644
--- a/crates/configuration/src/lib.rs
+++ b/crates/configuration/src/lib.rs
@@ -8,16 +8,15 @@ pub mod serialized;
 mod with_name;
 
 pub use crate::configuration::Configuration;
-pub use crate::directory::get_config_file_changed;
-pub use crate::directory::list_existing_schemas;
 pub use crate::directory::parse_configuration_options_file;
+pub use crate::directory::read_existing_schemas;
 pub use crate::directory::write_schema_directory;
 pub use crate::directory::{
     read_directory, read_directory_with_ignored_configs, read_native_query_directory,
 };
 pub use crate::directory::{
-    CONFIGURATION_OPTIONS_BASENAME, CONFIGURATION_OPTIONS_METADATA, NATIVE_MUTATIONS_DIRNAME,
-    NATIVE_QUERIES_DIRNAME, SCHEMA_DIRNAME,
+    CONFIGURATION_OPTIONS_BASENAME, NATIVE_MUTATIONS_DIRNAME, NATIVE_QUERIES_DIRNAME,
+    SCHEMA_DIRNAME,
 };
 pub use crate::mongo_scalar_type::MongoScalarType;
 pub use crate::serialized::Schema;
diff --git a/crates/configuration/src/schema/mod.rs b/crates/configuration/src/schema/mod.rs
index 3b43e173..cba2a589 100644
--- a/crates/configuration/src/schema/mod.rs
+++ b/crates/configuration/src/schema/mod.rs
@@ -18,6 +18,13 @@ pub struct Collection {
     pub description: Option<String>,
 }
 
+/// Schema for a single collection, as opposed to [Schema] which can describe multiple collections.
+#[derive(Clone, Debug)]
+pub struct CollectionSchema {
+    pub collection: Collection,
+    pub object_types: BTreeMap<ndc_models::ObjectTypeName, ObjectType>,
+}
+
 /// The type of values that a column, field, or argument may take.
 #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)]
 #[serde(rename_all = "camelCase")]
@@ -202,6 +209,8 @@ impl From<ndc_models::ObjectType> for ObjectType {
     }
 }
 
+pub type ObjectTypes = BTreeMap<ndc_models::ObjectTypeName, ObjectType>;
+
 /// Information about an object type field.
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
 #[serde(rename_all = "camelCase")]