From dd4672bcf274be06aaf4af5973b4f45fb620d248 Mon Sep 17 00:00:00 2001 From: dasm Date: Fri, 12 Sep 2025 09:06:04 -0700 Subject: [PATCH 01/25] Add the dependency matrix (#55) * Add dependency matrix nox sessions. * Drop support for pyspark <3.5 * Add the dependency matrix to the release pipeline * Set hostname to localhost * Use new setup action. * Reduce the size of the matrix. --- .github/workflows/release.yml | 35 +++++++++++++++++++++++++---- CHANGELOG.rst | 2 ++ noxfile.py | 42 +++++++++++++---------------------- pyproject.toml | 4 +--- uv.lock | 8 +++---- 5 files changed, 52 insertions(+), 39 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0759991b..12db548e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -34,7 +34,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - name: Set up runner - uses: opendp/tumult-tools/actions/setup@0f3d49599e5824a9f407a6e2063990c1e0d4c2e8 + uses: opendp/tumult-tools/actions/setup@eabe1054863f0916a0087ad180fd83719049c094 - run: uv run nox -s build - name: Archive packaged library uses: actions/upload-artifact@v4 @@ -50,7 +50,7 @@ jobs: - name: Checkout code repository uses: actions/checkout@v4 - name: Set up runner - uses: opendp/tumult-tools/actions/setup@0f3d49599e5824a9f407a6e2063990c1e0d4c2e8 + uses: opendp/tumult-tools/actions/setup@eabe1054863f0916a0087ad180fd83719049c094 - name: Download dist uses: actions/download-artifact@v4 with: @@ -66,7 +66,7 @@ jobs: - name: Checkout code repository uses: actions/checkout@v4 - name: Set up runner - uses: opendp/tumult-tools/actions/setup@0f3d49599e5824a9f407a6e2063990c1e0d4c2e8 + uses: opendp/tumult-tools/actions/setup@eabe1054863f0916a0087ad180fd83719049c094 - name: Download dist uses: actions/download-artifact@v4 with: @@ -74,12 +74,38 @@ jobs: path: dist - run: uv run nox -t benchmark + Dependency-Matrix: + if: github.repository == 'opendp/tumult-analytics' + strategy: + fail-fast: false + matrix: + # Note: mac runners are rather expensive (10x multiplier) so we don't use them here. 
+ os: [ubuntu-latest] + dependencies: [oldest, newest] + python: ["3.9", "3.12"] + runs-on: ${{ matrix.os }} + needs: Package + steps: + - name: Checkout code repository + uses: actions/checkout@v4 + - name: Set up runner + uses: opendp/tumult-tools/actions/setup@eabe1054863f0916a0087ad180fd83719049c094 + - name: Download dist + uses: actions/download-artifact@v4 + with: + name: dist + path: dist + - run: uv run nox -s "test_dependency_matrix(${{matrix.python}}-${{matrix.dependencies}})" + env: + SPARK_LOCAL_HOSTNAME: localhost + Publish-To-PyPI: if: github.repository == 'opendp/tumult-analytics' runs-on: ubuntu-latest needs: - Test-Slow - Benchmark + - Dependency-Matrix environment: name: pypi url: https://pypi.org/p/tmlt-analytics @@ -106,7 +132,7 @@ jobs: - name: Checkout code repository uses: actions/checkout@v4 - name: Set up runner - uses: opendp/tumult-tools/actions/setup@0f3d49599e5824a9f407a6e2063990c1e0d4c2e8 + uses: opendp/tumult-tools/actions/setup@eabe1054863f0916a0087ad180fd83719049c094 - name: Download dist uses: actions/download-artifact@v4 with: @@ -124,3 +150,4 @@ jobs: docs-repository-token: ${{ secrets.DOCS_REPO_PAT }} docs-path: docs/analytics version: ${{ format('v{0}.{1}', env.MAJOR_VERSION, env.MINOR_VERSION) }} + diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a5490a30..b65bf933 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,6 +12,8 @@ Unreleased Changed ~~~~~~~ +- Dropped support for pyspark <3.5.0 after discovering that it does not work on Macs (this may not work for older versions as well). + .. _v0.20.2: diff --git a/noxfile.py b/noxfile.py index 661b8246..adbb6427 100644 --- a/noxfile.py +++ b/noxfile.py @@ -5,11 +5,10 @@ API reference. """ -import os from pathlib import Path import nox -from tmlt.nox_utils import SessionManager +from tmlt.nox_utils import DependencyConfiguration, SessionManager nox.options.default_venv_backend = "uv|virtualenv" @@ -29,31 +28,18 @@ """For test suites where we track coverage (i.e. 
the fast tests and the full test suite), fail if test coverage falls below this percentage.""" -DEPENDENCY_MATRIX = { - name: { - # The Python minor version to run with - "python": python, - # All other entries take PEP440 version specifiers for the package named in - # the key -- see https://peps.python.org/pep-0440/#version-specifiers - "pyspark[sql]": pyspark, - "sympy": sympy, - "pandas": pandas, - "tmlt.core": core, - } - for (name, python, pyspark, sympy, pandas, core) in [ - # fmt: off - ("3.9-oldest", "3.9", "==3.3.1", "==1.8", "==1.4.0", "==0.18.0"), - ("3.9-pyspark3.4", "3.9", "==3.4.0", "==1.9", "==1.5.3", ">=0.18.0"), - ("3.9-newest", "3.9", "==3.5.1", "==1.9", "==1.5.3", ">=0.18.0"), - ("3.10-oldest", "3.10", "==3.3.1", "==1.8", "==1.4.0", "==0.18.0"), - ("3.10-newest", "3.10", "==3.5.1", "==1.9", "==1.5.3", ">=0.18.0"), - ("3.11-oldest", "3.11", "==3.4.0", "==1.8", "==1.5.0", "==0.18.0"), - ("3.11-newest", "3.11", "==3.5.1", "==1.9", "==1.5.3", ">=0.18.0"), - ("3.12-oldest", "3.12", "==3.5.0", "==1.8", "==2.2.0", "==0.18.0"), - ("3.12-newest", "3.12", "==3.5.1", "==1.9", "==2.2.3", ">=0.18.0"), - # fmt: on - ] -} +DEPENDENCY_MATRIX = [ + #fmt: off + DependencyConfiguration(id="3.9-oldest", python="3.9", packages={"pyspark[sql]": "==3.5.0", "sympy": "==1.8", "pandas": "==1.4.0", "tmlt.core": "==0.18.0"}), + DependencyConfiguration(id="3.9-newest", python="3.9", packages={"pyspark[sql]": "==3.5.6", "sympy": "==1.9", "pandas": "==1.5.3", "tmlt.core": ">=0.18.0"}), + DependencyConfiguration(id="3.10-oldest", python="3.10", packages={"pyspark[sql]": "==3.5.0", "sympy": "==1.8", "pandas": "==1.4.0", "tmlt.core": "==0.18.0"}), + DependencyConfiguration(id="3.10-newest", python="3.10", packages={"pyspark[sql]": "==3.5.6", "sympy": "==1.9", "pandas": "==1.5.3", "tmlt.core": ">=0.18.0"}), + DependencyConfiguration(id="3.11-oldest", python="3.11", packages={"pyspark[sql]": "==3.5.0", "sympy": "==1.8", "pandas": "==1.5.0", "tmlt.core": "==0.18.0"}), + DependencyConfiguration(id="3.11-newest", python="3.11", packages={"pyspark[sql]": "==3.5.6", "sympy": "==1.9", "pandas": "==1.5.3", "tmlt.core": ">=0.18.0"}), + DependencyConfiguration(id="3.12-oldest", python="3.12", packages={"pyspark[sql]": "==3.5.0", "sympy": "==1.8", "pandas": "==2.2.0", "tmlt.core": "==0.18.0"}), + DependencyConfiguration(id="3.12-newest", python="3.12", packages={"pyspark[sql]": "==3.5.6", "sympy": "==1.9", "pandas": "==2.2.3", "tmlt.core": ">=0.18.0"}), + #fmt: on +] AUDIT_VERSIONS = ["3.9", "3.10", "3.11", "3.12"] AUDIT_SUPPRESSIONS = [ @@ -120,3 +106,5 @@ sm.audit() sm.make_release() + +sm.test_dependency_matrix(DEPENDENCY_MATRIX) diff --git a/pyproject.toml b/pyproject.toml index 985fc212..f23a756f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,9 +35,7 @@ dependencies = [ "pandas >=1.4.0,<2 ; python_version < '3.11'", "pandas >=1.5.0,<2 ; python_version == '3.11'", "pandas >=2.2.0,<3 ; python_version >= '3.12'", - "pyspark[sql] >=3.3.1,<3.6 ; python_version < '3.11'", - "pyspark[sql] >=3.4.0,<3.6 ; python_version == '3.11'", - "pyspark[sql] >=3.5.0,<3.6 ; python_version >= '3.12'", + "pyspark[sql] >=3.5.0,<3.6", "sympy >=1.8,<1.13", "typeguard >=4.0.0,<5", "typing-extensions >=4.1.0,<5", diff --git a/uv.lock b/uv.lock index 40ba552f..63e47920 100644 --- a/uv.lock +++ b/uv.lock @@ -2453,9 +2453,7 @@ requires-dist = [ { name = "pandas", marker = "python_full_version < '3.11'", specifier = ">=1.4.0,<2" }, { name = "pandas", marker = "python_full_version == '3.11.*'", specifier = ">=1.5.0,<2" }, { name = 
"pandas", marker = "python_full_version >= '3.12'", specifier = ">=2.2.0,<3" }, - { name = "pyspark", extras = ["sql"], marker = "python_full_version < '3.11'", specifier = ">=3.3.1,<3.6" }, - { name = "pyspark", extras = ["sql"], marker = "python_full_version == '3.11.*'", specifier = ">=3.4.0,<3.6" }, - { name = "pyspark", extras = ["sql"], marker = "python_full_version >= '3.12'", specifier = ">=3.5.0,<3.6" }, + { name = "pyspark", extras = ["sql"], specifier = ">=3.5.0,<3.6" }, { name = "sympy", specifier = ">=1.8,<1.13" }, { name = "tabulate", specifier = ">=0.8.9,<0.9" }, { name = "tmlt-core", specifier = ">=0.18.0,<0.19" }, @@ -2538,8 +2536,8 @@ wheels = [ [[package]] name = "tmlt-nox-utils" -version = "0.0.0.post15+60b02cab" -source = { git = "https://github.com/opendp/tumult-tools.git?subdirectory=nox-utils#60b02cabf3b5bd0d20fa51c4598c92622f2c65e9" } +version = "0.0.0.post23+df6aa1c0" +source = { git = "https://github.com/opendp/tumult-tools.git?subdirectory=nox-utils#df6aa1c070f189fc6aad44fdde812781951300a9" } dependencies = [ { name = "gitpython" }, { name = "nox" }, From bf6b72f2ebe9aec761693c77dca0a656eae7a92a Mon Sep 17 00:00:00 2001 From: dasm Date: Fri, 19 Sep 2025 12:55:03 -0700 Subject: [PATCH 02/25] Switch to docs.tmlt.dev for versions json (#39) --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index 13f9d00e..91fa7f5f 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -122,7 +122,7 @@ # Remove this after intersphinx can use core nitpick_ignore_regex = [(r"py:.*", r"tmlt.core.*")] -json_url = "https://tmlt.dev/analytics/versions.json" +json_url = "https://docs.tmlt.dev/analytics/versions.json" # Theme settings templates_path = ["_templates", "_templates/autosummary"] From 021b293cef973f4236e75cd1662f5f178951b197 Mon Sep 17 00:00:00 2001 From: dasm Date: Fri, 26 Sep 2025 17:07:51 -0700 Subject: [PATCH 03/25] Re-add older pyspark versions on linux (#56) * Re-enable older pyspark on linux * Add older pyspark versions to noxfile. --- CHANGELOG.rst | 3 +- noxfile.py | 99 ++++++++++++++++++++++++++++++++++++++++++------ pyproject.toml | 5 ++- uv.lock | 101 +++++++++++++++++++++++++++++++++---------------- 4 files changed, 161 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b65bf933..33bd5aa5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,8 +12,7 @@ Unreleased Changed ~~~~~~~ -- Dropped support for pyspark <3.5.0 after discovering that it does not work on Macs (this may not work for older versions as well). - +- Dropped support for pyspark <3.5.0 on Macs after discovering that these configurations frequently crash. Older versions of the library may also be affected. .. _v0.20.2: diff --git a/noxfile.py b/noxfile.py index adbb6427..990a6e00 100644 --- a/noxfile.py +++ b/noxfile.py @@ -5,6 +5,7 @@ API reference. """ +import sys from pathlib import Path import nox @@ -28,17 +29,93 @@ """For test suites where we track coverage (i.e. 
the fast tests and the full test suite), fail if test coverage falls below this percentage.""" + +def is_mac(): + """Returns true if the current system is a mac.""" + return sys.platform == "darwin" + + DEPENDENCY_MATRIX = [ - #fmt: off - DependencyConfiguration(id="3.9-oldest", python="3.9", packages={"pyspark[sql]": "==3.5.0", "sympy": "==1.8", "pandas": "==1.4.0", "tmlt.core": "==0.18.0"}), - DependencyConfiguration(id="3.9-newest", python="3.9", packages={"pyspark[sql]": "==3.5.6", "sympy": "==1.9", "pandas": "==1.5.3", "tmlt.core": ">=0.18.0"}), - DependencyConfiguration(id="3.10-oldest", python="3.10", packages={"pyspark[sql]": "==3.5.0", "sympy": "==1.8", "pandas": "==1.4.0", "tmlt.core": "==0.18.0"}), - DependencyConfiguration(id="3.10-newest", python="3.10", packages={"pyspark[sql]": "==3.5.6", "sympy": "==1.9", "pandas": "==1.5.3", "tmlt.core": ">=0.18.0"}), - DependencyConfiguration(id="3.11-oldest", python="3.11", packages={"pyspark[sql]": "==3.5.0", "sympy": "==1.8", "pandas": "==1.5.0", "tmlt.core": "==0.18.0"}), - DependencyConfiguration(id="3.11-newest", python="3.11", packages={"pyspark[sql]": "==3.5.6", "sympy": "==1.9", "pandas": "==1.5.3", "tmlt.core": ">=0.18.0"}), - DependencyConfiguration(id="3.12-oldest", python="3.12", packages={"pyspark[sql]": "==3.5.0", "sympy": "==1.8", "pandas": "==2.2.0", "tmlt.core": "==0.18.0"}), - DependencyConfiguration(id="3.12-newest", python="3.12", packages={"pyspark[sql]": "==3.5.6", "sympy": "==1.9", "pandas": "==2.2.3", "tmlt.core": ">=0.18.0"}), - #fmt: on + DependencyConfiguration( + id="3.9-oldest", + python="3.9", + packages={ + "pyspark[sql]": "==3.3.1" if not is_mac() else "==3.5.0", + "sympy": "==1.8", + "pandas": "==1.4.0", + "tmlt.core": "==0.18.0", + }, + ), + DependencyConfiguration( + id="3.9-newest", + python="3.9", + packages={ + "pyspark[sql]": "==3.5.6", + "sympy": "==1.9", + "pandas": "==1.5.3", + "tmlt.core": ">=0.18.0", + }, + ), + DependencyConfiguration( + id="3.10-oldest", + python="3.10", + packages={ + "pyspark[sql]": "==3.3.1" if not is_mac() else "==3.5.0", + "sympy": "==1.8", + "pandas": "==1.4.0", + "tmlt.core": "==0.18.0", + }, + ), + DependencyConfiguration( + id="3.10-newest", + python="3.10", + packages={ + "pyspark[sql]": "==3.5.6", + "sympy": "==1.9", + "pandas": "==1.5.3", + "tmlt.core": ">=0.18.0", + }, + ), + DependencyConfiguration( + id="3.11-oldest", + python="3.11", + packages={ + "pyspark[sql]": "==3.4.0" if not is_mac() else "==3.5.0", + "sympy": "==1.8", + "pandas": "==1.5.0", + "tmlt.core": "==0.18.0", + }, + ), + DependencyConfiguration( + id="3.11-newest", + python="3.11", + packages={ + "pyspark[sql]": "==3.5.6", + "sympy": "==1.9", + "pandas": "==1.5.3", + "tmlt.core": ">=0.18.0", + }, + ), + DependencyConfiguration( + id="3.12-oldest", + python="3.12", + packages={ + "pyspark[sql]": "==3.5.0", + "sympy": "==1.8", + "pandas": "==2.2.0", + "tmlt.core": "==0.18.0", + }, + ), + DependencyConfiguration( + id="3.12-newest", + python="3.12", + packages={ + "pyspark[sql]": "==3.5.6", + "sympy": "==1.9", + "pandas": "==2.2.3", + "tmlt.core": ">=0.18.0", + }, + ), ] AUDIT_VERSIONS = ["3.9", "3.10", "3.11", "3.12"] @@ -101,7 +178,7 @@ sm.docs() for benchmark_name, timeout in BENCHMARK_TO_TIMEOUT.items(): - sm.benchmark(CWD / benchmark_name, timeout*60) + sm.benchmark(CWD / benchmark_name, timeout * 60) sm.audit() diff --git a/pyproject.toml b/pyproject.toml index f23a756f..296df0f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,10 @@ dependencies = [ "pandas >=1.4.0,<2 ; 
python_version < '3.11'", "pandas >=1.5.0,<2 ; python_version == '3.11'", "pandas >=2.2.0,<3 ; python_version >= '3.12'", - "pyspark[sql] >=3.5.0,<3.6", + "pyspark[sql] >=3.3.1,<3.6 ; python_version < '3.11' and sys_platform != 'darwin'", + "pyspark[sql] >=3.4.0,<3.6 ; python_version == '3.11' and sys_platform != 'darwin'", + "pyspark[sql] >=3.5.0,<3.6 ; python_version >= '3.12' and sys_platform != 'darwin'", + "pyspark[sql] >=3.5.0,<3.6 ; sys_platform == 'darwin'", "sympy >=1.8,<1.13", "typeguard >=4.0.0,<5", "typing-extensions >=4.1.0,<5", diff --git a/uv.lock b/uv.lock index 63e47920..cb74f831 100644 --- a/uv.lock +++ b/uv.lock @@ -2,10 +2,14 @@ version = 1 revision = 2 requires-python = ">=3.9, <3.13" resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version < '3.10'", + "python_full_version >= '3.12' and sys_platform != 'darwin'", + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and sys_platform != 'darwin'", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version == '3.10.*' and sys_platform != 'darwin'", + "python_full_version == '3.10.*' and sys_platform == 'darwin'", + "python_full_version < '3.10' and sys_platform != 'darwin'", + "python_full_version < '3.10' and sys_platform == 'darwin'", ] [[package]] @@ -227,7 +231,8 @@ name = "click" version = "8.1.8" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.10'", + "python_full_version < '3.10' and sys_platform != 'darwin'", + "python_full_version < '3.10' and sys_platform == 'darwin'", ] dependencies = [ { name = "colorama", marker = "python_full_version < '3.10' and sys_platform == 'win32'" }, @@ -242,9 +247,12 @@ name = "click" version = "8.2.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", + "python_full_version >= '3.12' and sys_platform != 'darwin'", + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and sys_platform != 'darwin'", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version == '3.10.*' and sys_platform != 'darwin'", + "python_full_version == '3.10.*' and sys_platform == 'darwin'", ] dependencies = [ { name = "colorama", marker = "python_full_version >= '3.10' and sys_platform == 'win32'" }, @@ -280,7 +288,8 @@ name = "contourpy" version = "1.3.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.10'", + "python_full_version < '3.10' and sys_platform != 'darwin'", + "python_full_version < '3.10' and sys_platform == 'darwin'", ] dependencies = [ { name = "numpy", marker = "python_full_version < '3.10'" }, @@ -340,9 +349,12 @@ name = "contourpy" version = "1.3.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", + "python_full_version >= '3.12' and sys_platform != 'darwin'", + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and sys_platform != 'darwin'", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version == '3.10.*' and sys_platform != 'darwin'", + "python_full_version == '3.10.*' and sys_platform == 'darwin'", ] 
dependencies = [ { name = "numpy", marker = "python_full_version >= '3.10'" }, @@ -726,7 +738,8 @@ name = "kiwisolver" version = "1.4.7" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.10'", + "python_full_version < '3.10' and sys_platform != 'darwin'", + "python_full_version < '3.10' and sys_platform == 'darwin'", ] sdist = { url = "https://files.pythonhosted.org/packages/85/4d/2255e1c76304cbd60b48cee302b66d1dde4468dc5b1160e4b7cb43778f2a/kiwisolver-1.4.7.tar.gz", hash = "sha256:9893ff81bd7107f7b685d3017cc6583daadb4fc26e4a888350df530e41980a60", size = 97286, upload-time = "2024-09-04T09:39:44.302Z" } wheels = [ @@ -813,9 +826,12 @@ name = "kiwisolver" version = "1.4.8" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", + "python_full_version >= '3.12' and sys_platform != 'darwin'", + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and sys_platform != 'darwin'", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version == '3.10.*' and sys_platform != 'darwin'", + "python_full_version == '3.10.*' and sys_platform == 'darwin'", ] sdist = { url = "https://files.pythonhosted.org/packages/82/59/7c91426a8ac292e1cdd53a63b6d9439abd573c875c3f92c146767dd33faf/kiwisolver-1.4.8.tar.gz", hash = "sha256:23d5f023bdc8c7e54eb65f03ca5d5bb25b601eac4d7f1a042888a1f45237987e", size = 97538, upload-time = "2024-12-24T18:30:51.519Z" } wheels = [ @@ -958,7 +974,8 @@ name = "matplotlib" version = "3.9.4" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.10'", + "python_full_version < '3.10' and sys_platform != 'darwin'", + "python_full_version < '3.10' and sys_platform == 'darwin'", ] dependencies = [ { name = "contourpy", version = "1.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, @@ -1009,9 +1026,12 @@ name = "matplotlib" version = "3.10.3" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", + "python_full_version >= '3.12' and sys_platform != 'darwin'", + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and sys_platform != 'darwin'", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version == '3.10.*' and sys_platform != 'darwin'", + "python_full_version == '3.10.*' and sys_platform == 'darwin'", ] dependencies = [ { name = "contourpy", version = "1.3.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, @@ -1256,9 +1276,12 @@ name = "pandas" version = "1.5.3" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version < '3.10'", + "python_full_version == '3.11.*' and sys_platform != 'darwin'", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version == '3.10.*' and sys_platform != 'darwin'", + "python_full_version == '3.10.*' and sys_platform == 'darwin'", + "python_full_version < '3.10' and sys_platform != 'darwin'", + "python_full_version < '3.10' and sys_platform == 'darwin'", ] dependencies = [ { name = "numpy", marker = "python_full_version < '3.12'" }, @@ 
-1293,7 +1316,8 @@ name = "pandas" version = "2.3.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.12'", + "python_full_version >= '3.12' and sys_platform != 'darwin'", + "python_full_version >= '3.12' and sys_platform == 'darwin'", ] dependencies = [ { name = "numpy", marker = "python_full_version >= '3.12'" }, @@ -1883,8 +1907,10 @@ name = "randomgen" version = "1.26.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*'", - "python_full_version < '3.10'", + "python_full_version == '3.10.*' and sys_platform != 'darwin'", + "python_full_version == '3.10.*' and sys_platform == 'darwin'", + "python_full_version < '3.10' and sys_platform != 'darwin'", + "python_full_version < '3.10' and sys_platform == 'darwin'", ] dependencies = [ { name = "numpy", marker = "python_full_version < '3.11'" }, @@ -1921,8 +1947,10 @@ name = "randomgen" version = "1.26.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", + "python_full_version >= '3.12' and sys_platform != 'darwin'", + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and sys_platform != 'darwin'", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", ] dependencies = [ { name = "numpy", marker = "python_full_version >= '3.11'" }, @@ -1988,7 +2016,8 @@ name = "scipy" version = "1.13.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.10'", + "python_full_version < '3.10' and sys_platform != 'darwin'", + "python_full_version < '3.10' and sys_platform == 'darwin'", ] dependencies = [ { name = "numpy", marker = "python_full_version < '3.10'" }, @@ -2026,7 +2055,8 @@ name = "scipy" version = "1.15.3" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*'", + "python_full_version == '3.10.*' and sys_platform != 'darwin'", + "python_full_version == '3.10.*' and sys_platform == 'darwin'", ] dependencies = [ { name = "numpy", marker = "python_full_version == '3.10.*'" }, @@ -2067,8 +2097,10 @@ name = "scipy" version = "1.16.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", + "python_full_version >= '3.12' and sys_platform != 'darwin'", + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and sys_platform != 'darwin'", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", ] dependencies = [ { name = "numpy", marker = "python_full_version >= '3.11'" }, @@ -2453,7 +2485,10 @@ requires-dist = [ { name = "pandas", marker = "python_full_version < '3.11'", specifier = ">=1.4.0,<2" }, { name = "pandas", marker = "python_full_version == '3.11.*'", specifier = ">=1.5.0,<2" }, { name = "pandas", marker = "python_full_version >= '3.12'", specifier = ">=2.2.0,<3" }, - { name = "pyspark", extras = ["sql"], specifier = ">=3.5.0,<3.6" }, + { name = "pyspark", extras = ["sql"], marker = "sys_platform == 'darwin'", specifier = ">=3.5.0,<3.6" }, + { name = "pyspark", extras = ["sql"], marker = "python_full_version < '3.11' and sys_platform != 'darwin'", specifier = ">=3.3.1,<3.6" }, + { name = "pyspark", extras = ["sql"], marker = "python_full_version == '3.11.*' and sys_platform != 'darwin'", specifier = ">=3.4.0,<3.6" }, + { name = "pyspark", extras = 
["sql"], marker = "python_full_version >= '3.12' and sys_platform != 'darwin'", specifier = ">=3.5.0,<3.6" }, { name = "sympy", specifier = ">=1.8,<1.13" }, { name = "tabulate", specifier = ">=0.8.9,<0.9" }, { name = "tmlt-core", specifier = ">=0.18.0,<0.19" }, From e09e4c0b71269fd17c8ee41f3e6faae1af1ea95b Mon Sep 17 00:00:00 2001 From: Bayard Carlson <65914476+cbaycity@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:51:27 -0400 Subject: [PATCH 04/25] Removes python 3.9 (#57) * Removes python 3.9 * Update CHANGELOG.rst Co-authored-by: Tom Magerlein --------- Co-authored-by: Tom Magerlein --- .ci/common.yml | 2 +- .github/workflows/release.yml | 2 +- .gitlab-ci.yml | 12 +- .python-version | 2 +- CHANGELOG.rst | 1 + CONTRIBUTING.md | 2 +- doc/installation.rst | 2 +- noxfile.py | 23 +- pyproject.toml | 3 +- uv.lock | 556 ++-------------------------------- 10 files changed, 41 insertions(+), 564 deletions(-) diff --git a/.ci/common.yml b/.ci/common.yml index e43e5056..3986035b 100644 --- a/.ci/common.yml +++ b/.ci/common.yml @@ -10,7 +10,7 @@ variables: FF_USE_FASTZIP: "true" .base: - image: registry.gitlab.com/tumult-labs/ops/ci/linux:python3.9 + image: registry.gitlab.com/tumult-labs/ops/ci/linux:python3.10 before_script: - java -version - python --version diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 12db548e..72c51f06 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -82,7 +82,7 @@ jobs: # Note: mac runners are rather expensive (10x multiplier) so we don't use them here. os: [ubuntu-latest] dependencies: [oldest, newest] - python: ["3.9", "3.12"] + python: ["3.10", "3.12"] runs-on: ${{ matrix.os }} needs: Package steps: diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 033ecaa8..121efb16 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -37,7 +37,7 @@ package: extends: .base stage: setup needs: [] - image: registry.gitlab.com/tumult-labs/ops/ci/linux:python3.9 + image: registry.gitlab.com/tumult-labs/ops/ci/linux:python3.10 script: - nox -s build artifacts: @@ -134,9 +134,9 @@ test_dep_matrix: needs: ["package"] parallel: matrix: - - PYTHON_VERSION: ["3.9", "3.10", "3.11", "3.12"] + - PYTHON_VERSION: ["3.10", "3.11", "3.12"] TEST_TYPE: ["oldest", "newest"] - - PYTHON_VERSION: ["3.9"] + - PYTHON_VERSION: ["3.10"] TEST_TYPE: ["pyspark3.4"] image: registry.gitlab.com/tumult-labs/ops/ci/linux:python${PYTHON_VERSION} rules: @@ -182,7 +182,7 @@ benchmark_dependency_matrix: needs: ["package"] parallel: matrix: - - PYTHON_VERSION: ["3.9", "3.10", "3.11", "3.12"] + - PYTHON_VERSION: ["3.10", "3.11", "3.12"] TEST_TYPE: ["oldest", "newest"] image: registry.gitlab.com/tumult-labs/ops/ci/linux:python${PYTHON_VERSION} script: @@ -205,7 +205,7 @@ test_release_linux: needs: ["package"] parallel: matrix: # 3.10, 3.11, and 3.12 run out of memory on a c6a.xlarge. 
- - PYTHON_VERSION: ["3.9"] + - PYTHON_VERSION: ["3.10"] INSTANCE: aws-c6a.xlarge - PYTHON_VERSION: ["3.10", "3.11", "3.12"] INSTANCE: aws-m6a.2xlarge @@ -236,7 +236,7 @@ audit: needs: ["package"] parallel: matrix: - - PYTHON_VERSION: ["3.9", "3.10", "3.11", "3.12"] + - PYTHON_VERSION: ["3.10", "3.11", "3.12"] image: registry.gitlab.com/tumult-labs/ops/ci/linux:python${PYTHON_VERSION} script: - nox -s "audit(python=\"${PYTHON_VERSION}\")" diff --git a/.python-version b/.python-version index a02597f4..7c7a975f 100644 --- a/.python-version +++ b/.python-version @@ -1 +1 @@ -3.9 \ No newline at end of file +3.10 \ No newline at end of file diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 33bd5aa5..8be73d2b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,6 +12,7 @@ Unreleased Changed ~~~~~~~ +- Dropped support for Python 3.9, as it has reached end-of-life. - Dropped support for pyspark <3.5.0 on Macs after discovering that these configurations frequently crash. Older versions of the library may also be affected. .. _v0.20.2: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a6d2b909..0d1bde04 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,7 +19,7 @@ Once you have agreement on the feature or bug, anyone can send us a Pull Request We use [`uv`](https://docs.astral.sh/uv/) for dependency management during development. To set up your environment, install `uv` by following its [installation instructions](https://docs.astral.sh/uv/getting-started/installation/), then install the prerequisites listed in the [Tumult Analytics installation instructions](https://opendp.github.io/tumult-docs/analytics/latest/installation.html#prerequisites), and finally install our dev dependencies by running `uv sync` from the root of this repository. -To minimize compatibility issues, doing development on the oldest supported Python minor version (currently 3.9) is strongly recommended. +To minimize compatibility issues, doing development on the oldest supported Python minor version (currently 3.10) is strongly recommended. If you are using `uv` to manage your Python installations, running `uv sync` without an existing virtual environment should automatically install and use an appropriate Python version. ### Basic usage diff --git a/doc/installation.rst b/doc/installation.rst index 57c7a322..695bb333 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -12,7 +12,7 @@ Prerequisites ^^^^^^^^^^^^^ |project| is built in `Python `__, so a Python installation is required to use it. -It is compatible with Python 3.9 through 3.11, and has experimental support for Python 3.12. +It is compatible with Python 3.10 through 3.11, and has experimental support for Python 3.12. Because Tumult Analytics uses PySpark for computation, it also `requires Java 8 or 11 `__, or Java 17 if PySpark 3.4 or later is used. Java 21 is not currently supported. 
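The platform-conditional pyspark pins in the patches above rely on PEP 508 environment markers (for example, "sys_platform != 'darwin'" in pyproject.toml and uv.lock). A minimal sketch of how such a marker evaluates, assuming the third-party "packaging" library (not a dependency introduced by these patches):

    from packaging.markers import Marker

    # The same marker syntax used in the pyproject.toml dependency specifiers.
    marker = Marker("python_version >= '3.12' and sys_platform != 'darwin'")

    # Pass an explicit environment dict to check both platforms,
    # instead of evaluating against the current interpreter:
    print(marker.evaluate({"python_version": "3.12", "sys_platform": "linux"}))   # True
    print(marker.evaluate({"python_version": "3.12", "sys_platform": "darwin"}))  # False

Installers such as pip and uv apply this same evaluation when deciding which of the pyspark requirements applies on a given interpreter and operating system, which is why the lock file above carries separate resolution markers for darwin and non-darwin platforms.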
diff --git a/noxfile.py b/noxfile.py index 990a6e00..7da191f2 100644 --- a/noxfile.py +++ b/noxfile.py @@ -36,26 +36,6 @@ def is_mac(): DEPENDENCY_MATRIX = [ - DependencyConfiguration( - id="3.9-oldest", - python="3.9", - packages={ - "pyspark[sql]": "==3.3.1" if not is_mac() else "==3.5.0", - "sympy": "==1.8", - "pandas": "==1.4.0", - "tmlt.core": "==0.18.0", - }, - ), - DependencyConfiguration( - id="3.9-newest", - python="3.9", - packages={ - "pyspark[sql]": "==3.5.6", - "sympy": "==1.9", - "pandas": "==1.5.3", - "tmlt.core": ">=0.18.0", - }, - ), DependencyConfiguration( id="3.10-oldest", python="3.10", @@ -118,7 +98,7 @@ def is_mac(): ), ] -AUDIT_VERSIONS = ["3.9", "3.10", "3.11", "3.12"] +AUDIT_VERSIONS = ["3.10", "3.11", "3.12"] AUDIT_SUPPRESSIONS = [ "PYSEC-2023-228", # Affects: pip<23.3 @@ -151,6 +131,7 @@ def is_mac(): package=PACKAGE_NAME, package_github=PACKAGE_GITHUB, directory=CWD, + default_python_version="3.10", smoketest_script=SMOKETEST_SCRIPT, parallel_tests=False, min_coverage=MIN_COVERAGE, diff --git a/pyproject.toml b/pyproject.toml index 296df0f1..7322cfed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,6 @@ classifiers = [ "Topic :: Security", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -27,7 +26,7 @@ keywords = [ "differential privacy", ] -requires-python = ">=3.9,<3.13" +requires-python = ">=3.10,<3.13" dependencies = [ # When updating Core, PySpark, Pandas, or SymPy, remember to update the # dependency matrix in the noxfile. diff --git a/uv.lock b/uv.lock index cb74f831..2180c3e0 100644 --- a/uv.lock +++ b/uv.lock @@ -1,15 +1,13 @@ version = 1 -revision = 2 -requires-python = ">=3.9, <3.13" +revision = 3 +requires-python = ">=3.10, <3.13" resolution-markers = [ "python_full_version >= '3.12' and sys_platform != 'darwin'", "python_full_version >= '3.12' and sys_platform == 'darwin'", "python_full_version == '3.11.*' and sys_platform != 'darwin'", "python_full_version == '3.11.*' and sys_platform == 'darwin'", - "python_full_version == '3.10.*' and sys_platform != 'darwin'", - "python_full_version == '3.10.*' and sys_platform == 'darwin'", - "python_full_version < '3.10' and sys_platform != 'darwin'", - "python_full_version < '3.10' and sys_platform == 'darwin'", + "python_full_version < '3.11' and sys_platform != 'darwin'", + "python_full_version < '3.11' and sys_platform == 'darwin'", ] [[package]] @@ -99,8 +97,7 @@ name = "black" version = "23.12.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "click" }, { name = "mypy-extensions" }, { name = "packaging" }, { name = "pathspec" }, @@ -122,10 +119,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/1e/30f5eafcc41b8378890ba39b693fa111f7dca8a2620ba5162075d95ffe46/black-23.12.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2d9e13db441c509a3763a7a3d9a49ccc1b4e974a47be4e08ade2a228876500ec", size = 1398647, upload-time = "2023-12-22T23:19:57.225Z" }, { url = 
"https://files.pythonhosted.org/packages/99/de/ddb45cc044256431d96d846ce03164d149d81ca606b5172224d1872e0b58/black-23.12.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d1bd9c210f8b109b1762ec9fd36592fdd528485aadb3f5849b2740ef17e674e", size = 1720450, upload-time = "2023-12-22T23:08:52.675Z" }, { url = "https://files.pythonhosted.org/packages/98/2b/54e5dbe9be5a10cbea2259517206ff7b6a452bb34e07508c7e1395950833/black-23.12.1-cp312-cp312-win_amd64.whl", hash = "sha256:ae76c22bde5cbb6bfd211ec343ded2163bba7883c7bc77f6b756a1049436fbb9", size = 1351070, upload-time = "2023-12-22T23:09:32.762Z" }, - { url = "https://files.pythonhosted.org/packages/85/97/f5c6b46fa6f47263e6e27d6feef967e3e99f4e1aedaaf93fd98f904580e2/black-23.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3e1b38b3135fd4c025c28c55ddfc236b05af657828a8a6abe5deec419a0b7055", size = 1560093, upload-time = "2023-12-22T23:23:40.554Z" }, - { url = "https://files.pythonhosted.org/packages/ef/54/41aec3623ac8c610ea9eabc2092c7c73aab293ef2858fb3b66904debe78c/black-23.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4f0031eaa7b921db76decd73636ef3a12c942ed367d8c3841a0739412b260a54", size = 1403728, upload-time = "2023-12-22T23:22:52.826Z" }, - { url = "https://files.pythonhosted.org/packages/e4/24/afa2005a508768228b88ee04e647022be9852e675c8d7237fb1e73e4607d/black-23.12.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97e56155c6b737854e60a9ab1c598ff2533d57e7506d97af5481141671abf3ea", size = 1710054, upload-time = "2023-12-22T23:08:47.238Z" }, - { url = "https://files.pythonhosted.org/packages/cb/61/111749529f766170a6cbe4cce5209a94ddba4bad0dda3793a6af641515b3/black-23.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:dd15245c8b68fe2b6bd0f32c1556509d11bb33aec9b5d0866dd8e2ed3dba09c2", size = 1332558, upload-time = "2023-12-22T23:09:08.454Z" }, { url = "https://files.pythonhosted.org/packages/7b/14/4da7b12a9abc43a601c215cb5a3d176734578da109f0dbf0a832ed78be09/black-23.12.1-py3-none-any.whl", hash = "sha256:78baad24af0f033958cad29731e27363183e140962595def56423e626f4bee3e", size = 194363, upload-time = "2023-12-22T23:06:14.278Z" }, ] @@ -210,52 +203,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/77/1a/5eefc0ce04affb98af07bc05f3bac9094513c0e23b0562d64af46a06aae4/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:dedb8adb91d11846ee08bec4c8236c8549ac721c245678282dcb06b221aab59f", size = 149565, upload-time = "2025-05-02T08:32:51.404Z" }, { url = "https://files.pythonhosted.org/packages/37/a0/2410e5e6032a174c95e0806b1a6585eb21e12f445ebe239fac441995226a/charset_normalizer-3.4.2-cp312-cp312-win32.whl", hash = "sha256:db4c7bf0e07fc3b7d89ac2a5880a6a8062056801b83ff56d8464b70f65482b6c", size = 98357, upload-time = "2025-05-02T08:32:53.079Z" }, { url = "https://files.pythonhosted.org/packages/6c/4f/c02d5c493967af3eda9c771ad4d2bbc8df6f99ddbeb37ceea6e8716a32bc/charset_normalizer-3.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:5a9979887252a82fefd3d3ed2a8e3b937a7a809f65dcb1e068b090e165bbe99e", size = 105776, upload-time = "2025-05-02T08:32:54.573Z" }, - { url = "https://files.pythonhosted.org/packages/28/f8/dfb01ff6cc9af38552c69c9027501ff5a5117c4cc18dcd27cb5259fa1888/charset_normalizer-3.4.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:005fa3432484527f9732ebd315da8da8001593e2cf46a3d817669f062c3d9ed4", size = 201671, upload-time = "2025-05-02T08:34:12.696Z" }, - { url = 
"https://files.pythonhosted.org/packages/32/fb/74e26ee556a9dbfe3bd264289b67be1e6d616329403036f6507bb9f3f29c/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e92fca20c46e9f5e1bb485887d074918b13543b1c2a1185e69bb8d17ab6236a7", size = 144744, upload-time = "2025-05-02T08:34:14.665Z" }, - { url = "https://files.pythonhosted.org/packages/ad/06/8499ee5aa7addc6f6d72e068691826ff093329fe59891e83b092ae4c851c/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:50bf98d5e563b83cc29471fa114366e6806bc06bc7a25fd59641e41445327836", size = 154993, upload-time = "2025-05-02T08:34:17.134Z" }, - { url = "https://files.pythonhosted.org/packages/f1/a2/5e4c187680728219254ef107a6949c60ee0e9a916a5dadb148c7ae82459c/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:721c76e84fe669be19c5791da68232ca2e05ba5185575086e384352e2c309597", size = 147382, upload-time = "2025-05-02T08:34:19.081Z" }, - { url = "https://files.pythonhosted.org/packages/4c/fe/56aca740dda674f0cc1ba1418c4d84534be51f639b5f98f538b332dc9a95/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82d8fd25b7f4675d0c47cf95b594d4e7b158aca33b76aa63d07186e13c0e0ab7", size = 149536, upload-time = "2025-05-02T08:34:21.073Z" }, - { url = "https://files.pythonhosted.org/packages/53/13/db2e7779f892386b589173dd689c1b1e304621c5792046edd8a978cbf9e0/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3daeac64d5b371dea99714f08ffc2c208522ec6b06fbc7866a450dd446f5c0f", size = 151349, upload-time = "2025-05-02T08:34:23.193Z" }, - { url = "https://files.pythonhosted.org/packages/69/35/e52ab9a276186f729bce7a0638585d2982f50402046e4b0faa5d2c3ef2da/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dccab8d5fa1ef9bfba0590ecf4d46df048d18ffe3eec01eeb73a42e0d9e7a8ba", size = 146365, upload-time = "2025-05-02T08:34:25.187Z" }, - { url = "https://files.pythonhosted.org/packages/a6/d8/af7333f732fc2e7635867d56cb7c349c28c7094910c72267586947561b4b/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:aaf27faa992bfee0264dc1f03f4c75e9fcdda66a519db6b957a3f826e285cf12", size = 154499, upload-time = "2025-05-02T08:34:27.359Z" }, - { url = "https://files.pythonhosted.org/packages/7a/3d/a5b2e48acef264d71e036ff30bcc49e51bde80219bb628ba3e00cf59baac/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:eb30abc20df9ab0814b5a2524f23d75dcf83cde762c161917a2b4b7b55b1e518", size = 157735, upload-time = "2025-05-02T08:34:29.798Z" }, - { url = "https://files.pythonhosted.org/packages/85/d8/23e2c112532a29f3eef374375a8684a4f3b8e784f62b01da931186f43494/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c72fbbe68c6f32f251bdc08b8611c7b3060612236e960ef848e0a517ddbe76c5", size = 154786, upload-time = "2025-05-02T08:34:31.858Z" }, - { url = "https://files.pythonhosted.org/packages/c7/57/93e0169f08ecc20fe82d12254a200dfaceddc1c12a4077bf454ecc597e33/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:982bb1e8b4ffda883b3d0a521e23abcd6fd17418f6d2c4118d257a10199c0ce3", size = 150203, upload-time = "2025-05-02T08:34:33.88Z" }, - { url = "https://files.pythonhosted.org/packages/2c/9d/9bf2b005138e7e060d7ebdec7503d0ef3240141587651f4b445bdf7286c2/charset_normalizer-3.4.2-cp39-cp39-win32.whl", hash = 
"sha256:43e0933a0eff183ee85833f341ec567c0980dae57c464d8a508e1b2ceb336471", size = 98436, upload-time = "2025-05-02T08:34:35.907Z" }, - { url = "https://files.pythonhosted.org/packages/6d/24/5849d46cf4311bbf21b424c443b09b459f5b436b1558c04e45dbb7cc478b/charset_normalizer-3.4.2-cp39-cp39-win_amd64.whl", hash = "sha256:d11b54acf878eef558599658b0ffca78138c8c3655cf4f3a4a673c437e67732e", size = 105772, upload-time = "2025-05-02T08:34:37.935Z" }, { url = "https://files.pythonhosted.org/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload-time = "2025-05-02T08:34:40.053Z" }, ] -[[package]] -name = "click" -version = "8.1.8" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10' and sys_platform != 'darwin'", - "python_full_version < '3.10' and sys_platform == 'darwin'", -] -dependencies = [ - { name = "colorama", marker = "python_full_version < '3.10' and sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593, upload-time = "2024-12-21T18:38:44.339Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188, upload-time = "2024-12-21T18:38:41.666Z" }, -] - [[package]] name = "click" version = "8.2.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12' and sys_platform != 'darwin'", - "python_full_version >= '3.12' and sys_platform == 'darwin'", - "python_full_version == '3.11.*' and sys_platform != 'darwin'", - "python_full_version == '3.11.*' and sys_platform == 'darwin'", - "python_full_version == '3.10.*' and sys_platform != 'darwin'", - "python_full_version == '3.10.*' and sys_platform == 'darwin'", -] dependencies = [ - { name = "colorama", marker = "python_full_version >= '3.10' and sys_platform == 'win32'" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/60/6c/8ca2efa64cf75a977a0d7fac081354553ebe483345c734fb6b6515d96bbc/click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202", size = 286342, upload-time = "2025-05-20T23:19:49.832Z" } wheels = [ @@ -283,81 +239,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e3/51/9b208e85196941db2f0654ad0357ca6388ab3ed67efdbfc799f35d1f83aa/colorlog-6.9.0-py3-none-any.whl", hash = "sha256:5906e71acd67cb07a71e779c47c4bcb45fb8c2993eebe9e5adcd6a6f1b283eff", size = 11424, upload-time = "2024-10-29T18:34:49.815Z" }, ] -[[package]] -name = "contourpy" -version = "1.3.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10' and sys_platform != 'darwin'", - "python_full_version < '3.10' and sys_platform == 'darwin'", -] -dependencies = [ - { name = "numpy", marker = "python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f5/f6/31a8f28b4a2a4fa0e01085e542f3081ab0588eff8e589d39d775172c9792/contourpy-1.3.0.tar.gz", hash = 
"sha256:7ffa0db17717a8ffb127efd0c95a4362d996b892c2904db72428d5b52e1938a4", size = 13464370, upload-time = "2024-08-27T21:00:03.328Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6c/e0/be8dcc796cfdd96708933e0e2da99ba4bb8f9b2caa9d560a50f3f09a65f3/contourpy-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:880ea32e5c774634f9fcd46504bf9f080a41ad855f4fef54f5380f5133d343c7", size = 265366, upload-time = "2024-08-27T20:50:09.947Z" }, - { url = "https://files.pythonhosted.org/packages/50/d6/c953b400219443535d412fcbbc42e7a5e823291236bc0bb88936e3cc9317/contourpy-1.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:76c905ef940a4474a6289c71d53122a4f77766eef23c03cd57016ce19d0f7b42", size = 249226, upload-time = "2024-08-27T20:50:16.1Z" }, - { url = "https://files.pythonhosted.org/packages/6f/b4/6fffdf213ffccc28483c524b9dad46bb78332851133b36ad354b856ddc7c/contourpy-1.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92f8557cbb07415a4d6fa191f20fd9d2d9eb9c0b61d1b2f52a8926e43c6e9af7", size = 308460, upload-time = "2024-08-27T20:50:22.536Z" }, - { url = "https://files.pythonhosted.org/packages/cf/6c/118fc917b4050f0afe07179a6dcbe4f3f4ec69b94f36c9e128c4af480fb8/contourpy-1.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36f965570cff02b874773c49bfe85562b47030805d7d8360748f3eca570f4cab", size = 347623, upload-time = "2024-08-27T20:50:28.806Z" }, - { url = "https://files.pythonhosted.org/packages/f9/a4/30ff110a81bfe3abf7b9673284d21ddce8cc1278f6f77393c91199da4c90/contourpy-1.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cacd81e2d4b6f89c9f8a5b69b86490152ff39afc58a95af002a398273e5ce589", size = 317761, upload-time = "2024-08-27T20:50:35.126Z" }, - { url = "https://files.pythonhosted.org/packages/99/e6/d11966962b1aa515f5586d3907ad019f4b812c04e4546cc19ebf62b5178e/contourpy-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69375194457ad0fad3a839b9e29aa0b0ed53bb54db1bfb6c3ae43d111c31ce41", size = 322015, upload-time = "2024-08-27T20:50:40.318Z" }, - { url = "https://files.pythonhosted.org/packages/4d/e3/182383743751d22b7b59c3c753277b6aee3637049197624f333dac5b4c80/contourpy-1.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7a52040312b1a858b5e31ef28c2e865376a386c60c0e248370bbea2d3f3b760d", size = 1262672, upload-time = "2024-08-27T20:50:55.643Z" }, - { url = "https://files.pythonhosted.org/packages/78/53/974400c815b2e605f252c8fb9297e2204347d1755a5374354ee77b1ea259/contourpy-1.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3faeb2998e4fcb256542e8a926d08da08977f7f5e62cf733f3c211c2a5586223", size = 1321688, upload-time = "2024-08-27T20:51:11.293Z" }, - { url = "https://files.pythonhosted.org/packages/52/29/99f849faed5593b2926a68a31882af98afbeac39c7fdf7de491d9c85ec6a/contourpy-1.3.0-cp310-cp310-win32.whl", hash = "sha256:36e0cff201bcb17a0a8ecc7f454fe078437fa6bda730e695a92f2d9932bd507f", size = 171145, upload-time = "2024-08-27T20:51:15.2Z" }, - { url = "https://files.pythonhosted.org/packages/a9/97/3f89bba79ff6ff2b07a3cbc40aa693c360d5efa90d66e914f0ff03b95ec7/contourpy-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:87ddffef1dbe5e669b5c2440b643d3fdd8622a348fe1983fad7a0f0ccb1cd67b", size = 216019, upload-time = "2024-08-27T20:51:19.365Z" }, - { url = "https://files.pythonhosted.org/packages/b3/1f/9375917786cb39270b0ee6634536c0e22abf225825602688990d8f5c6c19/contourpy-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:0fa4c02abe6c446ba70d96ece336e621efa4aecae43eaa9b030ae5fb92b309ad", size = 266356, upload-time = "2024-08-27T20:51:24.146Z" }, - { url = "https://files.pythonhosted.org/packages/05/46/9256dd162ea52790c127cb58cfc3b9e3413a6e3478917d1f811d420772ec/contourpy-1.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:834e0cfe17ba12f79963861e0f908556b2cedd52e1f75e6578801febcc6a9f49", size = 250915, upload-time = "2024-08-27T20:51:28.683Z" }, - { url = "https://files.pythonhosted.org/packages/e1/5d/3056c167fa4486900dfbd7e26a2fdc2338dc58eee36d490a0ed3ddda5ded/contourpy-1.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dbc4c3217eee163fa3984fd1567632b48d6dfd29216da3ded3d7b844a8014a66", size = 310443, upload-time = "2024-08-27T20:51:33.675Z" }, - { url = "https://files.pythonhosted.org/packages/ca/c2/1a612e475492e07f11c8e267ea5ec1ce0d89971be496c195e27afa97e14a/contourpy-1.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4865cd1d419e0c7a7bf6de1777b185eebdc51470800a9f42b9e9decf17762081", size = 348548, upload-time = "2024-08-27T20:51:39.322Z" }, - { url = "https://files.pythonhosted.org/packages/45/cf/2c2fc6bb5874158277b4faf136847f0689e1b1a1f640a36d76d52e78907c/contourpy-1.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:303c252947ab4b14c08afeb52375b26781ccd6a5ccd81abcdfc1fafd14cf93c1", size = 319118, upload-time = "2024-08-27T20:51:44.717Z" }, - { url = "https://files.pythonhosted.org/packages/03/33/003065374f38894cdf1040cef474ad0546368eea7e3a51d48b8a423961f8/contourpy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637f674226be46f6ba372fd29d9523dd977a291f66ab2a74fbeb5530bb3f445d", size = 323162, upload-time = "2024-08-27T20:51:49.683Z" }, - { url = "https://files.pythonhosted.org/packages/42/80/e637326e85e4105a802e42959f56cff2cd39a6b5ef68d5d9aee3ea5f0e4c/contourpy-1.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:76a896b2f195b57db25d6b44e7e03f221d32fe318d03ede41f8b4d9ba1bff53c", size = 1265396, upload-time = "2024-08-27T20:52:04.926Z" }, - { url = "https://files.pythonhosted.org/packages/7c/3b/8cbd6416ca1bbc0202b50f9c13b2e0b922b64be888f9d9ee88e6cfabfb51/contourpy-1.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e1fd23e9d01591bab45546c089ae89d926917a66dceb3abcf01f6105d927e2cb", size = 1324297, upload-time = "2024-08-27T20:52:21.843Z" }, - { url = "https://files.pythonhosted.org/packages/4d/2c/021a7afaa52fe891f25535506cc861c30c3c4e5a1c1ce94215e04b293e72/contourpy-1.3.0-cp311-cp311-win32.whl", hash = "sha256:d402880b84df3bec6eab53cd0cf802cae6a2ef9537e70cf75e91618a3801c20c", size = 171808, upload-time = "2024-08-27T20:52:25.163Z" }, - { url = "https://files.pythonhosted.org/packages/8d/2f/804f02ff30a7fae21f98198828d0857439ec4c91a96e20cf2d6c49372966/contourpy-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:6cb6cc968059db9c62cb35fbf70248f40994dfcd7aa10444bbf8b3faeb7c2d67", size = 217181, upload-time = "2024-08-27T20:52:29.13Z" }, - { url = "https://files.pythonhosted.org/packages/c9/92/8e0bbfe6b70c0e2d3d81272b58c98ac69ff1a4329f18c73bd64824d8b12e/contourpy-1.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:570ef7cf892f0afbe5b2ee410c507ce12e15a5fa91017a0009f79f7d93a1268f", size = 267838, upload-time = "2024-08-27T20:52:33.911Z" }, - { url = "https://files.pythonhosted.org/packages/e3/04/33351c5d5108460a8ce6d512307690b023f0cfcad5899499f5c83b9d63b1/contourpy-1.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:da84c537cb8b97d153e9fb208c221c45605f73147bd4cadd23bdae915042aad6", size = 251549, upload-time = "2024-08-27T20:52:39.179Z" }, - { url = "https://files.pythonhosted.org/packages/51/3d/aa0fe6ae67e3ef9f178389e4caaaa68daf2f9024092aa3c6032e3d174670/contourpy-1.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0be4d8425bfa755e0fd76ee1e019636ccc7c29f77a7c86b4328a9eb6a26d0639", size = 303177, upload-time = "2024-08-27T20:52:44.789Z" }, - { url = "https://files.pythonhosted.org/packages/56/c3/c85a7e3e0cab635575d3b657f9535443a6f5d20fac1a1911eaa4bbe1aceb/contourpy-1.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9c0da700bf58f6e0b65312d0a5e695179a71d0163957fa381bb3c1f72972537c", size = 341735, upload-time = "2024-08-27T20:52:51.05Z" }, - { url = "https://files.pythonhosted.org/packages/dd/8d/20f7a211a7be966a53f474bc90b1a8202e9844b3f1ef85f3ae45a77151ee/contourpy-1.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eb8b141bb00fa977d9122636b16aa67d37fd40a3d8b52dd837e536d64b9a4d06", size = 314679, upload-time = "2024-08-27T20:52:58.473Z" }, - { url = "https://files.pythonhosted.org/packages/6e/be/524e377567defac0e21a46e2a529652d165fed130a0d8a863219303cee18/contourpy-1.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3634b5385c6716c258d0419c46d05c8aa7dc8cb70326c9a4fb66b69ad2b52e09", size = 320549, upload-time = "2024-08-27T20:53:06.593Z" }, - { url = "https://files.pythonhosted.org/packages/0f/96/fdb2552a172942d888915f3a6663812e9bc3d359d53dafd4289a0fb462f0/contourpy-1.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0dce35502151b6bd35027ac39ba6e5a44be13a68f55735c3612c568cac3805fd", size = 1263068, upload-time = "2024-08-27T20:53:23.442Z" }, - { url = "https://files.pythonhosted.org/packages/2a/25/632eab595e3140adfa92f1322bf8915f68c932bac468e89eae9974cf1c00/contourpy-1.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:aea348f053c645100612b333adc5983d87be69acdc6d77d3169c090d3b01dc35", size = 1322833, upload-time = "2024-08-27T20:53:39.243Z" }, - { url = "https://files.pythonhosted.org/packages/73/e3/69738782e315a1d26d29d71a550dbbe3eb6c653b028b150f70c1a5f4f229/contourpy-1.3.0-cp312-cp312-win32.whl", hash = "sha256:90f73a5116ad1ba7174341ef3ea5c3150ddf20b024b98fb0c3b29034752c8aeb", size = 172681, upload-time = "2024-08-27T20:53:43.05Z" }, - { url = "https://files.pythonhosted.org/packages/0c/89/9830ba00d88e43d15e53d64931e66b8792b46eb25e2050a88fec4a0df3d5/contourpy-1.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:b11b39aea6be6764f84360fce6c82211a9db32a7c7de8fa6dd5397cf1d079c3b", size = 218283, upload-time = "2024-08-27T20:53:47.232Z" }, - { url = "https://files.pythonhosted.org/packages/b3/e3/b9f72758adb6ef7397327ceb8b9c39c75711affb220e4f53c745ea1d5a9a/contourpy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a11077e395f67ffc2c44ec2418cfebed032cd6da3022a94fc227b6faf8e2acb8", size = 265518, upload-time = "2024-08-27T20:56:01.333Z" }, - { url = "https://files.pythonhosted.org/packages/ec/22/19f5b948367ab5260fb41d842c7a78dae645603881ea6bc39738bcfcabf6/contourpy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e8134301d7e204c88ed7ab50028ba06c683000040ede1d617298611f9dc6240c", size = 249350, upload-time = "2024-08-27T20:56:05.432Z" }, - { url = "https://files.pythonhosted.org/packages/26/76/0c7d43263dd00ae21a91a24381b7e813d286a3294d95d179ef3a7b9fb1d7/contourpy-1.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:e12968fdfd5bb45ffdf6192a590bd8ddd3ba9e58360b29683c6bb71a7b41edca", size = 309167, upload-time = "2024-08-27T20:56:10.034Z" }, - { url = "https://files.pythonhosted.org/packages/96/3b/cadff6773e89f2a5a492c1a8068e21d3fccaf1a1c1df7d65e7c8e3ef60ba/contourpy-1.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fd2a0fc506eccaaa7595b7e1418951f213cf8255be2600f1ea1b61e46a60c55f", size = 348279, upload-time = "2024-08-27T20:56:15.41Z" }, - { url = "https://files.pythonhosted.org/packages/e1/86/158cc43aa549d2081a955ab11c6bdccc7a22caacc2af93186d26f5f48746/contourpy-1.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4cfb5c62ce023dfc410d6059c936dcf96442ba40814aefbfa575425a3a7f19dc", size = 318519, upload-time = "2024-08-27T20:56:21.813Z" }, - { url = "https://files.pythonhosted.org/packages/05/11/57335544a3027e9b96a05948c32e566328e3a2f84b7b99a325b7a06d2b06/contourpy-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68a32389b06b82c2fdd68276148d7b9275b5f5cf13e5417e4252f6d1a34f72a2", size = 321922, upload-time = "2024-08-27T20:56:26.983Z" }, - { url = "https://files.pythonhosted.org/packages/0b/e3/02114f96543f4a1b694333b92a6dcd4f8eebbefcc3a5f3bbb1316634178f/contourpy-1.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:94e848a6b83da10898cbf1311a815f770acc9b6a3f2d646f330d57eb4e87592e", size = 1258017, upload-time = "2024-08-27T20:56:42.246Z" }, - { url = "https://files.pythonhosted.org/packages/f3/3b/bfe4c81c6d5881c1c643dde6620be0b42bf8aab155976dd644595cfab95c/contourpy-1.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d78ab28a03c854a873787a0a42254a0ccb3cb133c672f645c9f9c8f3ae9d0800", size = 1316773, upload-time = "2024-08-27T20:56:58.58Z" }, - { url = "https://files.pythonhosted.org/packages/f1/17/c52d2970784383cafb0bd918b6fb036d98d96bbf0bc1befb5d1e31a07a70/contourpy-1.3.0-cp39-cp39-win32.whl", hash = "sha256:81cb5ed4952aae6014bc9d0421dec7c5835c9c8c31cdf51910b708f548cf58e5", size = 171353, upload-time = "2024-08-27T20:57:02.718Z" }, - { url = "https://files.pythonhosted.org/packages/53/23/db9f69676308e094d3c45f20cc52e12d10d64f027541c995d89c11ad5c75/contourpy-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:14e262f67bd7e6eb6880bc564dcda30b15e351a594657e55b7eec94b6ef72843", size = 211817, upload-time = "2024-08-27T20:57:06.328Z" }, - { url = "https://files.pythonhosted.org/packages/d1/09/60e486dc2b64c94ed33e58dcfb6f808192c03dfc5574c016218b9b7680dc/contourpy-1.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fe41b41505a5a33aeaed2a613dccaeaa74e0e3ead6dd6fd3a118fb471644fd6c", size = 261886, upload-time = "2024-08-27T20:57:10.863Z" }, - { url = "https://files.pythonhosted.org/packages/19/20/b57f9f7174fcd439a7789fb47d764974ab646fa34d1790551de386457a8e/contourpy-1.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eca7e17a65f72a5133bdbec9ecf22401c62bcf4821361ef7811faee695799779", size = 311008, upload-time = "2024-08-27T20:57:15.588Z" }, - { url = "https://files.pythonhosted.org/packages/74/fc/5040d42623a1845d4f17a418e590fd7a79ae8cb2bad2b2f83de63c3bdca4/contourpy-1.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1ec4dc6bf570f5b22ed0d7efba0dfa9c5b9e0431aeea7581aa217542d9e809a4", size = 215690, upload-time = "2024-08-27T20:57:19.321Z" }, - { url = "https://files.pythonhosted.org/packages/2b/24/dc3dcd77ac7460ab7e9d2b01a618cb31406902e50e605a8d6091f0a8f7cc/contourpy-1.3.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = 
"sha256:00ccd0dbaad6d804ab259820fa7cb0b8036bda0686ef844d24125d8287178ce0", size = 261894, upload-time = "2024-08-27T20:57:23.873Z" }, - { url = "https://files.pythonhosted.org/packages/b1/db/531642a01cfec39d1682e46b5457b07cf805e3c3c584ec27e2a6223f8f6c/contourpy-1.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ca947601224119117f7c19c9cdf6b3ab54c5726ef1d906aa4a69dfb6dd58102", size = 311099, upload-time = "2024-08-27T20:57:28.58Z" }, - { url = "https://files.pythonhosted.org/packages/38/1e/94bda024d629f254143a134eead69e21c836429a2a6ce82209a00ddcb79a/contourpy-1.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6ec93afeb848a0845a18989da3beca3eec2c0f852322efe21af1931147d12cb", size = 215838, upload-time = "2024-08-27T20:57:32.913Z" }, -] - [[package]] name = "contourpy" version = "1.3.2" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12' and sys_platform != 'darwin'", - "python_full_version >= '3.12' and sys_platform == 'darwin'", - "python_full_version == '3.11.*' and sys_platform != 'darwin'", - "python_full_version == '3.11.*' and sys_platform == 'darwin'", - "python_full_version == '3.10.*' and sys_platform != 'darwin'", - "python_full_version == '3.10.*' and sys_platform == 'darwin'", -] dependencies = [ - { name = "numpy", marker = "python_full_version >= '3.10'" }, + { name = "numpy" }, ] sdist = { url = "https://files.pythonhosted.org/packages/66/54/eb9bfc647b19f2009dd5c7f5ec51c4e6ca831725f1aea7a993034f483147/contourpy-1.3.2.tar.gz", hash = "sha256:b6945942715a034c671b7fc54f9588126b0b8bf23db2696e3ca8328f3ff0ab54", size = 13466130, upload-time = "2025-04-15T17:47:53.79Z" } wheels = [ @@ -437,16 +324,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8a/65/27e0a1fa5e2e5079bdca4521be2f5dabf516f94e29a0defed35ac2382eb2/coverage-7.9.1-cp312-cp312-win32.whl", hash = "sha256:5f646a99a8c2b3ff4c6a6e081f78fad0dde275cd59f8f49dc4eab2e394332e74", size = 214724, upload-time = "2025-06-13T13:01:25.435Z" }, { url = "https://files.pythonhosted.org/packages/9b/a8/d5b128633fd1a5e0401a4160d02fa15986209a9e47717174f99dc2f7166d/coverage-7.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:30f445f85c353090b83e552dcbbdad3ec84c7967e108c3ae54556ca69955563e", size = 215535, upload-time = "2025-06-13T13:01:27.861Z" }, { url = "https://files.pythonhosted.org/packages/a3/37/84bba9d2afabc3611f3e4325ee2c6a47cd449b580d4a606b240ce5a6f9bf/coverage-7.9.1-cp312-cp312-win_arm64.whl", hash = "sha256:af41da5dca398d3474129c58cb2b106a5d93bbb196be0d307ac82311ca234342", size = 213904, upload-time = "2025-06-13T13:01:29.202Z" }, - { url = "https://files.pythonhosted.org/packages/a5/d6/c41dd9b02bf16ec001aaf1cbef665537606899a3db1094e78f5ae17540ca/coverage-7.9.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6f424507f57878e424d9a95dc4ead3fbdd72fd201e404e861e465f28ea469951", size = 212029, upload-time = "2025-06-13T13:02:09.058Z" }, - { url = "https://files.pythonhosted.org/packages/f8/c0/40420d81d731f84c3916dcdf0506b3e6c6570817bff2576b83f780914ae6/coverage-7.9.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:535fde4001b2783ac80865d90e7cc7798b6b126f4cd8a8c54acfe76804e54e58", size = 212407, upload-time = "2025-06-13T13:02:11.151Z" }, - { url = "https://files.pythonhosted.org/packages/9b/87/f0db7d62d0e09f14d6d2f6ae8c7274a2f09edf74895a34b412a0601e375a/coverage-7.9.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02532fd3290bb8fa6bec876520842428e2a6ed6c27014eca81b031c2d30e3f71", size = 241160, 
upload-time = "2025-06-13T13:02:12.864Z" }, - { url = "https://files.pythonhosted.org/packages/a9/b7/3337c064f058a5d7696c4867159651a5b5fb01a5202bcf37362f0c51400e/coverage-7.9.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:56f5eb308b17bca3bbff810f55ee26d51926d9f89ba92707ee41d3c061257e55", size = 239027, upload-time = "2025-06-13T13:02:14.294Z" }, - { url = "https://files.pythonhosted.org/packages/7e/a9/5898a283f66d1bd413c32c2e0e05408196fd4f37e206e2b06c6e0c626e0e/coverage-7.9.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfa447506c1a52271f1b0de3f42ea0fa14676052549095e378d5bff1c505ff7b", size = 240145, upload-time = "2025-06-13T13:02:15.745Z" }, - { url = "https://files.pythonhosted.org/packages/e0/33/d96e3350078a3c423c549cb5b2ba970de24c5257954d3e4066e2b2152d30/coverage-7.9.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9ca8e220006966b4a7b68e8984a6aee645a0384b0769e829ba60281fe61ec4f7", size = 239871, upload-time = "2025-06-13T13:02:17.344Z" }, - { url = "https://files.pythonhosted.org/packages/1d/6e/6fb946072455f71a820cac144d49d11747a0f1a21038060a68d2d0200499/coverage-7.9.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:49f1d0788ba5b7ba65933f3a18864117c6506619f5ca80326b478f72acf3f385", size = 238122, upload-time = "2025-06-13T13:02:18.849Z" }, - { url = "https://files.pythonhosted.org/packages/e4/5c/bc43f25c8586840ce25a796a8111acf6a2b5f0909ba89a10d41ccff3920d/coverage-7.9.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:68cd53aec6f45b8e4724c0950ce86eacb775c6be01ce6e3669fe4f3a21e768ed", size = 239058, upload-time = "2025-06-13T13:02:21.423Z" }, - { url = "https://files.pythonhosted.org/packages/11/d8/ce2007418dd7fd00ff8c8b898bb150bb4bac2d6a86df05d7b88a07ff595f/coverage-7.9.1-cp39-cp39-win32.whl", hash = "sha256:95335095b6c7b1cc14c3f3f17d5452ce677e8490d101698562b2ffcacc304c8d", size = 214532, upload-time = "2025-06-13T13:02:22.857Z" }, - { url = "https://files.pythonhosted.org/packages/20/21/334e76fa246e92e6d69cab217f7c8a70ae0cc8f01438bd0544103f29528e/coverage-7.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:e1b5191d1648acc439b24721caab2fd0c86679d8549ed2c84d5a7ec1bedcc244", size = 215439, upload-time = "2025-06-13T13:02:24.268Z" }, { url = "https://files.pythonhosted.org/packages/3e/e5/c723545c3fd3204ebde3b4cc4b927dce709d3b6dc577754bb57f63ca4a4a/coverage-7.9.1-pp39.pp310.pp311-none-any.whl", hash = "sha256:db0f04118d1db74db6c9e1cb1898532c7dcc220f1d2718f058601f7c3f499514", size = 204009, upload-time = "2025-06-13T13:02:25.787Z" }, { url = "https://files.pythonhosted.org/packages/08/b8/7ddd1e8ba9701dea08ce22029917140e6f66a859427406579fd8d0ca7274/coverage-7.9.1-py3-none-any.whl", hash = "sha256:66b974b145aa189516b6bf2d8423e888b742517d37872f6ee4c5be0073bd9a3c", size = 204000, upload-time = "2025-06-13T13:02:27.173Z" }, ] @@ -541,15 +418,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1d/d6/6ed8b439906ca2e88d65bddf002e21239678aca6001d8fb82e8e2b196245/dunamai-1.24.1-py3-none-any.whl", hash = "sha256:4370e406d8ce195fc4b066b5c326bfa9adb269c4b8719b4e4fd90b63a2144bf7", size = 26654, upload-time = "2025-05-09T13:48:45.442Z" }, ] -[[package]] -name = "eval-type-backport" -version = "0.2.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/30/ea/8b0ac4469d4c347c6a385ff09dc3c048c2d021696664e26c7ee6791631b5/eval_type_backport-0.2.2.tar.gz", hash = 
"sha256:f0576b4cf01ebb5bd358d02314d31846af5e07678387486e2c798af0e7d849c1", size = 9079, upload-time = "2024-12-21T20:09:46.005Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ce/31/55cd413eaccd39125368be33c46de24a1f639f2e12349b0361b4678f3915/eval_type_backport-0.2.2-py3-none-any.whl", hash = "sha256:cb6ad7c393517f476f96d456d0412ea80f0a8cf96f6892834cd9340149111b0a", size = 5830, upload-time = "2024-12-21T20:09:44.175Z" }, -] - [[package]] name = "exceptiongroup" version = "1.3.0" @@ -610,14 +478,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ae/44/a3a3b70d5709405f7525bb7cb497b4e46151e0c02e3c8a0e40e5e9fe030b/fonttools-4.58.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e38f687d5de97c7fb7da3e58169fb5ba349e464e141f83c3c2e2beb91d317816", size = 5037851, upload-time = "2025-06-13T17:24:35.034Z" }, { url = "https://files.pythonhosted.org/packages/21/cb/e8923d197c78969454eb876a4a55a07b59c9c4c46598f02b02411dc3b45c/fonttools-4.58.4-cp312-cp312-win32.whl", hash = "sha256:636c073b4da9db053aa683db99580cac0f7c213a953b678f69acbca3443c12cc", size = 2187428, upload-time = "2025-06-13T17:24:36.996Z" }, { url = "https://files.pythonhosted.org/packages/46/e6/fe50183b1a0e1018e7487ee740fa8bb127b9f5075a41e20d017201e8ab14/fonttools-4.58.4-cp312-cp312-win_amd64.whl", hash = "sha256:82e8470535743409b30913ba2822e20077acf9ea70acec40b10fcf5671dceb58", size = 2236649, upload-time = "2025-06-13T17:24:38.985Z" }, - { url = "https://files.pythonhosted.org/packages/45/20/787d70ba4cb831706fa587c56ee472a88ebc28752be660f4b58e598af6fc/fonttools-4.58.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ca773fe7812e4e1197ee4e63b9691e89650ab55f679e12ac86052d2fe0d152cd", size = 2754537, upload-time = "2025-06-13T17:24:57.851Z" }, - { url = "https://files.pythonhosted.org/packages/4d/a5/ccb7ef1b8ab4bbf48f7753b6df512b61e73af82cd27aa486a03d6afb8635/fonttools-4.58.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e31289101221910f44245472e02b1a2f7d671c6d06a45c07b354ecb25829ad92", size = 2321715, upload-time = "2025-06-13T17:24:59.863Z" }, - { url = "https://files.pythonhosted.org/packages/20/5c/b361a7eae95950afaadb7049f55b214b619cb5368086cb3253726fe0c478/fonttools-4.58.4-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:90c9e3c01475bb9602cb617f69f02c4ba7ab7784d93f0b0d685e84286f4c1a10", size = 4819004, upload-time = "2025-06-13T17:25:01.591Z" }, - { url = "https://files.pythonhosted.org/packages/d5/2f/3006fbb1f57704cd60af82fb8127788cfb102f12d39c39fb5996af595cf3/fonttools-4.58.4-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e00a826f2bc745a010341ac102082fe5e3fb9f0861b90ed9ff32277598813711", size = 4749072, upload-time = "2025-06-13T17:25:03.334Z" }, - { url = "https://files.pythonhosted.org/packages/c2/42/ea79e2c3d5e4441e4508d6456b268a7de275452f3dba3a13fc9d73f3e03d/fonttools-4.58.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:bc75e72e9d2a4ad0935c59713bd38679d51c6fefab1eadde80e3ed4c2a11ea84", size = 4802023, upload-time = "2025-06-13T17:25:05.486Z" }, - { url = "https://files.pythonhosted.org/packages/d4/70/90a196f57faa2bcd1485710c6d08eedceca500cdf2166640b3478e72072c/fonttools-4.58.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:f57a795e540059ce3de68508acfaaf177899b39c36ef0a2833b2308db98c71f1", size = 4911103, upload-time = "2025-06-13T17:25:07.505Z" }, - { url = 
"https://files.pythonhosted.org/packages/cb/3f/a7d38e606e98701dbcb6198406c8b554a77ed06c5b21e425251813fd3775/fonttools-4.58.4-cp39-cp39-win32.whl", hash = "sha256:a7d04f64c88b48ede655abcf76f2b2952f04933567884d99be7c89e0a4495131", size = 1471393, upload-time = "2025-06-13T17:25:09.587Z" }, - { url = "https://files.pythonhosted.org/packages/37/6e/08158deaebeb5b0c7a0fb251ca6827defb5f5159958a23ba427e0b677e95/fonttools-4.58.4-cp39-cp39-win_amd64.whl", hash = "sha256:5a8bc5dfd425c89b1c38380bc138787b0a830f761b82b37139aa080915503b69", size = 1515901, upload-time = "2025-06-13T17:25:11.336Z" }, { url = "https://files.pythonhosted.org/packages/0b/2f/c536b5b9bb3c071e91d536a4d11f969e911dbb6b227939f4c5b0bca090df/fonttools-4.58.4-py3-none-any.whl", hash = "sha256:a10ce13a13f26cbb9f37512a4346bb437ad7e002ff6fa966a7ce7ff5ac3528bd", size = 1114660, upload-time = "2025-06-13T17:25:13.321Z" }, ] @@ -679,30 +539,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", size = 8769, upload-time = "2022-07-01T12:21:02.467Z" }, ] -[[package]] -name = "importlib-metadata" -version = "8.7.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "zipp", marker = "python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/76/66/650a33bd90f786193e4de4b3ad86ea60b53c89b669a5c7be931fac31cdb0/importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000", size = 56641, upload-time = "2025-04-27T15:29:01.736Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd", size = 27656, upload-time = "2025-04-27T15:29:00.214Z" }, -] - -[[package]] -name = "importlib-resources" -version = "6.5.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "zipp", marker = "python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/cf/8c/f834fbf984f691b4f7ff60f50b514cc3de5cc08abfc3295564dd89c5e2e7/importlib_resources-6.5.2.tar.gz", hash = "sha256:185f87adef5bcc288449d98fb4fba07cea78bc036455dd44c5fc4a2fe78fed2c", size = 44693, upload-time = "2025-01-03T18:51:56.698Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a4/ed/1f1afb2e9e7f38a545d628f864d562a5ae64fe6f7a10e28ffb9b185b4e89/importlib_resources-6.5.2-py3-none-any.whl", hash = "sha256:789cfdc3ed28c78b67a06acb8126751ced69a3d5f79c095a98298cd8a760ccec", size = 37461, upload-time = "2025-01-03T18:51:54.306Z" }, -] - [[package]] name = "iniconfig" version = "2.1.0" @@ -733,106 +569,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, ] -[[package]] -name = "kiwisolver" -version = "1.4.7" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10' and sys_platform != 'darwin'", - "python_full_version < '3.10' and sys_platform == 'darwin'", -] -sdist = { url = 
"https://files.pythonhosted.org/packages/85/4d/2255e1c76304cbd60b48cee302b66d1dde4468dc5b1160e4b7cb43778f2a/kiwisolver-1.4.7.tar.gz", hash = "sha256:9893ff81bd7107f7b685d3017cc6583daadb4fc26e4a888350df530e41980a60", size = 97286, upload-time = "2024-09-04T09:39:44.302Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/97/14/fc943dd65268a96347472b4fbe5dcc2f6f55034516f80576cd0dd3a8930f/kiwisolver-1.4.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8a9c83f75223d5e48b0bc9cb1bf2776cf01563e00ade8775ffe13b0b6e1af3a6", size = 122440, upload-time = "2024-09-04T09:03:44.9Z" }, - { url = "https://files.pythonhosted.org/packages/1e/46/e68fed66236b69dd02fcdb506218c05ac0e39745d696d22709498896875d/kiwisolver-1.4.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:58370b1ffbd35407444d57057b57da5d6549d2d854fa30249771775c63b5fe17", size = 65758, upload-time = "2024-09-04T09:03:46.582Z" }, - { url = "https://files.pythonhosted.org/packages/ef/fa/65de49c85838681fc9cb05de2a68067a683717321e01ddafb5b8024286f0/kiwisolver-1.4.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aa0abdf853e09aff551db11fce173e2177d00786c688203f52c87ad7fcd91ef9", size = 64311, upload-time = "2024-09-04T09:03:47.973Z" }, - { url = "https://files.pythonhosted.org/packages/42/9c/cc8d90f6ef550f65443bad5872ffa68f3dee36de4974768628bea7c14979/kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8d53103597a252fb3ab8b5845af04c7a26d5e7ea8122303dd7a021176a87e8b9", size = 1637109, upload-time = "2024-09-04T09:03:49.281Z" }, - { url = "https://files.pythonhosted.org/packages/55/91/0a57ce324caf2ff5403edab71c508dd8f648094b18cfbb4c8cc0fde4a6ac/kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:88f17c5ffa8e9462fb79f62746428dd57b46eb931698e42e990ad63103f35e6c", size = 1617814, upload-time = "2024-09-04T09:03:51.444Z" }, - { url = "https://files.pythonhosted.org/packages/12/5d/c36140313f2510e20207708adf36ae4919416d697ee0236b0ddfb6fd1050/kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88a9ca9c710d598fd75ee5de59d5bda2684d9db36a9f50b6125eaea3969c2599", size = 1400881, upload-time = "2024-09-04T09:03:53.357Z" }, - { url = "https://files.pythonhosted.org/packages/56/d0/786e524f9ed648324a466ca8df86298780ef2b29c25313d9a4f16992d3cf/kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f4d742cb7af1c28303a51b7a27aaee540e71bb8e24f68c736f6f2ffc82f2bf05", size = 1512972, upload-time = "2024-09-04T09:03:55.082Z" }, - { url = "https://files.pythonhosted.org/packages/67/5a/77851f2f201e6141d63c10a0708e996a1363efaf9e1609ad0441b343763b/kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e28c7fea2196bf4c2f8d46a0415c77a1c480cc0724722f23d7410ffe9842c407", size = 1444787, upload-time = "2024-09-04T09:03:56.588Z" }, - { url = "https://files.pythonhosted.org/packages/06/5f/1f5eaab84355885e224a6fc8d73089e8713dc7e91c121f00b9a1c58a2195/kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e968b84db54f9d42046cf154e02911e39c0435c9801681e3fc9ce8a3c4130278", size = 2199212, upload-time = "2024-09-04T09:03:58.557Z" }, - { url = "https://files.pythonhosted.org/packages/b5/28/9152a3bfe976a0ae21d445415defc9d1cd8614b2910b7614b30b27a47270/kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0c18ec74c0472de033e1bebb2911c3c310eef5649133dd0bedf2a169a1b269e5", size = 2346399, upload-time = "2024-09-04T09:04:00.178Z" }, - { url = 
"https://files.pythonhosted.org/packages/26/f6/453d1904c52ac3b400f4d5e240ac5fec25263716723e44be65f4d7149d13/kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8f0ea6da6d393d8b2e187e6a5e3fb81f5862010a40c3945e2c6d12ae45cfb2ad", size = 2308688, upload-time = "2024-09-04T09:04:02.216Z" }, - { url = "https://files.pythonhosted.org/packages/5a/9a/d4968499441b9ae187e81745e3277a8b4d7c60840a52dc9d535a7909fac3/kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:f106407dda69ae456dd1227966bf445b157ccc80ba0dff3802bb63f30b74e895", size = 2445493, upload-time = "2024-09-04T09:04:04.571Z" }, - { url = "https://files.pythonhosted.org/packages/07/c9/032267192e7828520dacb64dfdb1d74f292765f179e467c1cba97687f17d/kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:84ec80df401cfee1457063732d90022f93951944b5b58975d34ab56bb150dfb3", size = 2262191, upload-time = "2024-09-04T09:04:05.969Z" }, - { url = "https://files.pythonhosted.org/packages/6c/ad/db0aedb638a58b2951da46ddaeecf204be8b4f5454df020d850c7fa8dca8/kiwisolver-1.4.7-cp310-cp310-win32.whl", hash = "sha256:71bb308552200fb2c195e35ef05de12f0c878c07fc91c270eb3d6e41698c3bcc", size = 46644, upload-time = "2024-09-04T09:04:07.408Z" }, - { url = "https://files.pythonhosted.org/packages/12/ca/d0f7b7ffbb0be1e7c2258b53554efec1fd652921f10d7d85045aff93ab61/kiwisolver-1.4.7-cp310-cp310-win_amd64.whl", hash = "sha256:44756f9fd339de0fb6ee4f8c1696cfd19b2422e0d70b4cefc1cc7f1f64045a8c", size = 55877, upload-time = "2024-09-04T09:04:08.869Z" }, - { url = "https://files.pythonhosted.org/packages/97/6c/cfcc128672f47a3e3c0d918ecb67830600078b025bfc32d858f2e2d5c6a4/kiwisolver-1.4.7-cp310-cp310-win_arm64.whl", hash = "sha256:78a42513018c41c2ffd262eb676442315cbfe3c44eed82385c2ed043bc63210a", size = 48347, upload-time = "2024-09-04T09:04:10.106Z" }, - { url = "https://files.pythonhosted.org/packages/e9/44/77429fa0a58f941d6e1c58da9efe08597d2e86bf2b2cce6626834f49d07b/kiwisolver-1.4.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d2b0e12a42fb4e72d509fc994713d099cbb15ebf1103545e8a45f14da2dfca54", size = 122442, upload-time = "2024-09-04T09:04:11.432Z" }, - { url = "https://files.pythonhosted.org/packages/e5/20/8c75caed8f2462d63c7fd65e16c832b8f76cda331ac9e615e914ee80bac9/kiwisolver-1.4.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2a8781ac3edc42ea4b90bc23e7d37b665d89423818e26eb6df90698aa2287c95", size = 65762, upload-time = "2024-09-04T09:04:12.468Z" }, - { url = "https://files.pythonhosted.org/packages/f4/98/fe010f15dc7230f45bc4cf367b012d651367fd203caaa992fd1f5963560e/kiwisolver-1.4.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:46707a10836894b559e04b0fd143e343945c97fd170d69a2d26d640b4e297935", size = 64319, upload-time = "2024-09-04T09:04:13.635Z" }, - { url = "https://files.pythonhosted.org/packages/8b/1b/b5d618f4e58c0675654c1e5051bcf42c776703edb21c02b8c74135541f60/kiwisolver-1.4.7-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef97b8df011141c9b0f6caf23b29379f87dd13183c978a30a3c546d2c47314cb", size = 1334260, upload-time = "2024-09-04T09:04:14.878Z" }, - { url = "https://files.pythonhosted.org/packages/b8/01/946852b13057a162a8c32c4c8d2e9ed79f0bb5d86569a40c0b5fb103e373/kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ab58c12a2cd0fc769089e6d38466c46d7f76aced0a1f54c77652446733d2d02", size = 1426589, upload-time = "2024-09-04T09:04:16.514Z" }, - { url = 
"https://files.pythonhosted.org/packages/70/d1/c9f96df26b459e15cf8a965304e6e6f4eb291e0f7a9460b4ad97b047561e/kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:803b8e1459341c1bb56d1c5c010406d5edec8a0713a0945851290a7930679b51", size = 1541080, upload-time = "2024-09-04T09:04:18.322Z" }, - { url = "https://files.pythonhosted.org/packages/d3/73/2686990eb8b02d05f3de759d6a23a4ee7d491e659007dd4c075fede4b5d0/kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9a9e8a507420fe35992ee9ecb302dab68550dedc0da9e2880dd88071c5fb052", size = 1470049, upload-time = "2024-09-04T09:04:20.266Z" }, - { url = "https://files.pythonhosted.org/packages/a7/4b/2db7af3ed3af7c35f388d5f53c28e155cd402a55432d800c543dc6deb731/kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18077b53dc3bb490e330669a99920c5e6a496889ae8c63b58fbc57c3d7f33a18", size = 1426376, upload-time = "2024-09-04T09:04:22.419Z" }, - { url = "https://files.pythonhosted.org/packages/05/83/2857317d04ea46dc5d115f0df7e676997bbd968ced8e2bd6f7f19cfc8d7f/kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6af936f79086a89b3680a280c47ea90b4df7047b5bdf3aa5c524bbedddb9e545", size = 2222231, upload-time = "2024-09-04T09:04:24.526Z" }, - { url = "https://files.pythonhosted.org/packages/0d/b5/866f86f5897cd4ab6d25d22e403404766a123f138bd6a02ecb2cdde52c18/kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:3abc5b19d24af4b77d1598a585b8a719beb8569a71568b66f4ebe1fb0449460b", size = 2368634, upload-time = "2024-09-04T09:04:25.899Z" }, - { url = "https://files.pythonhosted.org/packages/c1/ee/73de8385403faba55f782a41260210528fe3273d0cddcf6d51648202d6d0/kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:933d4de052939d90afbe6e9d5273ae05fb836cc86c15b686edd4b3560cc0ee36", size = 2329024, upload-time = "2024-09-04T09:04:28.523Z" }, - { url = "https://files.pythonhosted.org/packages/a1/e7/cd101d8cd2cdfaa42dc06c433df17c8303d31129c9fdd16c0ea37672af91/kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:65e720d2ab2b53f1f72fb5da5fb477455905ce2c88aaa671ff0a447c2c80e8e3", size = 2468484, upload-time = "2024-09-04T09:04:30.547Z" }, - { url = "https://files.pythonhosted.org/packages/e1/72/84f09d45a10bc57a40bb58b81b99d8f22b58b2040c912b7eb97ebf625bf2/kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3bf1ed55088f214ba6427484c59553123fdd9b218a42bbc8c6496d6754b1e523", size = 2284078, upload-time = "2024-09-04T09:04:33.218Z" }, - { url = "https://files.pythonhosted.org/packages/d2/d4/71828f32b956612dc36efd7be1788980cb1e66bfb3706e6dec9acad9b4f9/kiwisolver-1.4.7-cp311-cp311-win32.whl", hash = "sha256:4c00336b9dd5ad96d0a558fd18a8b6f711b7449acce4c157e7343ba92dd0cf3d", size = 46645, upload-time = "2024-09-04T09:04:34.371Z" }, - { url = "https://files.pythonhosted.org/packages/a1/65/d43e9a20aabcf2e798ad1aff6c143ae3a42cf506754bcb6a7ed8259c8425/kiwisolver-1.4.7-cp311-cp311-win_amd64.whl", hash = "sha256:929e294c1ac1e9f615c62a4e4313ca1823ba37326c164ec720a803287c4c499b", size = 56022, upload-time = "2024-09-04T09:04:35.786Z" }, - { url = "https://files.pythonhosted.org/packages/35/b3/9f75a2e06f1b4ca00b2b192bc2b739334127d27f1d0625627ff8479302ba/kiwisolver-1.4.7-cp311-cp311-win_arm64.whl", hash = "sha256:e33e8fbd440c917106b237ef1a2f1449dfbb9b6f6e1ce17c94cd6a1e0d438376", size = 48536, upload-time = "2024-09-04T09:04:37.525Z" }, - { url = 
"https://files.pythonhosted.org/packages/97/9c/0a11c714cf8b6ef91001c8212c4ef207f772dd84540104952c45c1f0a249/kiwisolver-1.4.7-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:5360cc32706dab3931f738d3079652d20982511f7c0ac5711483e6eab08efff2", size = 121808, upload-time = "2024-09-04T09:04:38.637Z" }, - { url = "https://files.pythonhosted.org/packages/f2/d8/0fe8c5f5d35878ddd135f44f2af0e4e1d379e1c7b0716f97cdcb88d4fd27/kiwisolver-1.4.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:942216596dc64ddb25adb215c3c783215b23626f8d84e8eff8d6d45c3f29f75a", size = 65531, upload-time = "2024-09-04T09:04:39.694Z" }, - { url = "https://files.pythonhosted.org/packages/80/c5/57fa58276dfdfa612241d640a64ca2f76adc6ffcebdbd135b4ef60095098/kiwisolver-1.4.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:48b571ecd8bae15702e4f22d3ff6a0f13e54d3d00cd25216d5e7f658242065ee", size = 63894, upload-time = "2024-09-04T09:04:41.6Z" }, - { url = "https://files.pythonhosted.org/packages/8b/e9/26d3edd4c4ad1c5b891d8747a4f81b1b0aba9fb9721de6600a4adc09773b/kiwisolver-1.4.7-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ad42ba922c67c5f219097b28fae965e10045ddf145d2928bfac2eb2e17673640", size = 1369296, upload-time = "2024-09-04T09:04:42.886Z" }, - { url = "https://files.pythonhosted.org/packages/b6/67/3f4850b5e6cffb75ec40577ddf54f7b82b15269cc5097ff2e968ee32ea7d/kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:612a10bdae23404a72941a0fc8fa2660c6ea1217c4ce0dbcab8a8f6543ea9e7f", size = 1461450, upload-time = "2024-09-04T09:04:46.284Z" }, - { url = "https://files.pythonhosted.org/packages/52/be/86cbb9c9a315e98a8dc6b1d23c43cffd91d97d49318854f9c37b0e41cd68/kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e838bba3a3bac0fe06d849d29772eb1afb9745a59710762e4ba3f4cb8424483", size = 1579168, upload-time = "2024-09-04T09:04:47.91Z" }, - { url = "https://files.pythonhosted.org/packages/0f/00/65061acf64bd5fd34c1f4ae53f20b43b0a017a541f242a60b135b9d1e301/kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:22f499f6157236c19f4bbbd472fa55b063db77a16cd74d49afe28992dff8c258", size = 1507308, upload-time = "2024-09-04T09:04:49.465Z" }, - { url = "https://files.pythonhosted.org/packages/21/e4/c0b6746fd2eb62fe702118b3ca0cb384ce95e1261cfada58ff693aeec08a/kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693902d433cf585133699972b6d7c42a8b9f8f826ebcaf0132ff55200afc599e", size = 1464186, upload-time = "2024-09-04T09:04:50.949Z" }, - { url = "https://files.pythonhosted.org/packages/0a/0f/529d0a9fffb4d514f2782c829b0b4b371f7f441d61aa55f1de1c614c4ef3/kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4e77f2126c3e0b0d055f44513ed349038ac180371ed9b52fe96a32aa071a5107", size = 2247877, upload-time = "2024-09-04T09:04:52.388Z" }, - { url = "https://files.pythonhosted.org/packages/d1/e1/66603ad779258843036d45adcbe1af0d1a889a07af4635f8b4ec7dccda35/kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:657a05857bda581c3656bfc3b20e353c232e9193eb167766ad2dc58b56504948", size = 2404204, upload-time = "2024-09-04T09:04:54.385Z" }, - { url = "https://files.pythonhosted.org/packages/8d/61/de5fb1ca7ad1f9ab7970e340a5b833d735df24689047de6ae71ab9d8d0e7/kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4bfa75a048c056a411f9705856abfc872558e33c055d80af6a380e3658766038", size = 
2352461, upload-time = "2024-09-04T09:04:56.307Z" }, - { url = "https://files.pythonhosted.org/packages/ba/d2/0edc00a852e369827f7e05fd008275f550353f1f9bcd55db9363d779fc63/kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:34ea1de54beef1c104422d210c47c7d2a4999bdecf42c7b5718fbe59a4cac383", size = 2501358, upload-time = "2024-09-04T09:04:57.922Z" }, - { url = "https://files.pythonhosted.org/packages/84/15/adc15a483506aec6986c01fb7f237c3aec4d9ed4ac10b756e98a76835933/kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:90da3b5f694b85231cf93586dad5e90e2d71b9428f9aad96952c99055582f520", size = 2314119, upload-time = "2024-09-04T09:04:59.332Z" }, - { url = "https://files.pythonhosted.org/packages/36/08/3a5bb2c53c89660863a5aa1ee236912269f2af8762af04a2e11df851d7b2/kiwisolver-1.4.7-cp312-cp312-win32.whl", hash = "sha256:18e0cca3e008e17fe9b164b55735a325140a5a35faad8de92dd80265cd5eb80b", size = 46367, upload-time = "2024-09-04T09:05:00.804Z" }, - { url = "https://files.pythonhosted.org/packages/19/93/c05f0a6d825c643779fc3c70876bff1ac221f0e31e6f701f0e9578690d70/kiwisolver-1.4.7-cp312-cp312-win_amd64.whl", hash = "sha256:58cb20602b18f86f83a5c87d3ee1c766a79c0d452f8def86d925e6c60fbf7bfb", size = 55884, upload-time = "2024-09-04T09:05:01.924Z" }, - { url = "https://files.pythonhosted.org/packages/d2/f9/3828d8f21b6de4279f0667fb50a9f5215e6fe57d5ec0d61905914f5b6099/kiwisolver-1.4.7-cp312-cp312-win_arm64.whl", hash = "sha256:f5a8b53bdc0b3961f8b6125e198617c40aeed638b387913bf1ce78afb1b0be2a", size = 48528, upload-time = "2024-09-04T09:05:02.983Z" }, - { url = "https://files.pythonhosted.org/packages/11/88/37ea0ea64512997b13d69772db8dcdc3bfca5442cda3a5e4bb943652ee3e/kiwisolver-1.4.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3f9362ecfca44c863569d3d3c033dbe8ba452ff8eed6f6b5806382741a1334bd", size = 122449, upload-time = "2024-09-04T09:05:55.311Z" }, - { url = "https://files.pythonhosted.org/packages/4e/45/5a5c46078362cb3882dcacad687c503089263c017ca1241e0483857791eb/kiwisolver-1.4.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8df2eb9b2bac43ef8b082e06f750350fbbaf2887534a5be97f6cf07b19d9583", size = 65757, upload-time = "2024-09-04T09:05:56.906Z" }, - { url = "https://files.pythonhosted.org/packages/8a/be/a6ae58978772f685d48dd2e84460937761c53c4bbd84e42b0336473d9775/kiwisolver-1.4.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f32d6edbc638cde7652bd690c3e728b25332acbadd7cad670cc4a02558d9c417", size = 64312, upload-time = "2024-09-04T09:05:58.384Z" }, - { url = "https://files.pythonhosted.org/packages/f4/04/18ef6f452d311e1e1eb180c9bf5589187fa1f042db877e6fe443ef10099c/kiwisolver-1.4.7-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:e2e6c39bd7b9372b0be21456caab138e8e69cc0fc1190a9dfa92bd45a1e6e904", size = 1626966, upload-time = "2024-09-04T09:05:59.855Z" }, - { url = "https://files.pythonhosted.org/packages/21/b1/40655f6c3fa11ce740e8a964fa8e4c0479c87d6a7944b95af799c7a55dfe/kiwisolver-1.4.7-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:dda56c24d869b1193fcc763f1284b9126550eaf84b88bbc7256e15028f19188a", size = 1607044, upload-time = "2024-09-04T09:06:02.16Z" }, - { url = "https://files.pythonhosted.org/packages/fd/93/af67dbcfb9b3323bbd2c2db1385a7139d8f77630e4a37bb945b57188eb2d/kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79849239c39b5e1fd906556c474d9b0439ea6792b637511f3fe3a41158d89ca8", size = 1391879, upload-time = "2024-09-04T09:06:03.908Z" }, - { url = 
"https://files.pythonhosted.org/packages/40/6f/d60770ef98e77b365d96061d090c0cd9e23418121c55fff188fa4bdf0b54/kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5e3bc157fed2a4c02ec468de4ecd12a6e22818d4f09cde2c31ee3226ffbefab2", size = 1504751, upload-time = "2024-09-04T09:06:05.58Z" }, - { url = "https://files.pythonhosted.org/packages/fa/3a/5f38667d313e983c432f3fcd86932177519ed8790c724e07d77d1de0188a/kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3da53da805b71e41053dc670f9a820d1157aae77b6b944e08024d17bcd51ef88", size = 1436990, upload-time = "2024-09-04T09:06:08.126Z" }, - { url = "https://files.pythonhosted.org/packages/cb/3b/1520301a47326e6a6043b502647e42892be33b3f051e9791cc8bb43f1a32/kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8705f17dfeb43139a692298cb6637ee2e59c0194538153e83e9ee0c75c2eddde", size = 2191122, upload-time = "2024-09-04T09:06:10.345Z" }, - { url = "https://files.pythonhosted.org/packages/cf/c4/eb52da300c166239a2233f1f9c4a1b767dfab98fae27681bfb7ea4873cb6/kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:82a5c2f4b87c26bb1a0ef3d16b5c4753434633b83d365cc0ddf2770c93829e3c", size = 2338126, upload-time = "2024-09-04T09:06:12.321Z" }, - { url = "https://files.pythonhosted.org/packages/1a/cb/42b92fd5eadd708dd9107c089e817945500685f3437ce1fd387efebc6d6e/kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce8be0466f4c0d585cdb6c1e2ed07232221df101a4c6f28821d2aa754ca2d9e2", size = 2298313, upload-time = "2024-09-04T09:06:14.562Z" }, - { url = "https://files.pythonhosted.org/packages/4f/eb/be25aa791fe5fc75a8b1e0c965e00f942496bc04635c9aae8035f6b76dcd/kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:409afdfe1e2e90e6ee7fc896f3df9a7fec8e793e58bfa0d052c8a82f99c37abb", size = 2437784, upload-time = "2024-09-04T09:06:16.767Z" }, - { url = "https://files.pythonhosted.org/packages/c5/22/30a66be7f3368d76ff95689e1c2e28d382383952964ab15330a15d8bfd03/kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5b9c3f4ee0b9a439d2415012bd1b1cc2df59e4d6a9939f4d669241d30b414327", size = 2253988, upload-time = "2024-09-04T09:06:18.705Z" }, - { url = "https://files.pythonhosted.org/packages/35/d3/5f2ecb94b5211c8a04f218a76133cc8d6d153b0f9cd0b45fad79907f0689/kiwisolver-1.4.7-cp39-cp39-win32.whl", hash = "sha256:a79ae34384df2b615eefca647a2873842ac3b596418032bef9a7283675962644", size = 46980, upload-time = "2024-09-04T09:06:20.106Z" }, - { url = "https://files.pythonhosted.org/packages/ef/17/cd10d020578764ea91740204edc6b3236ed8106228a46f568d716b11feb2/kiwisolver-1.4.7-cp39-cp39-win_amd64.whl", hash = "sha256:cf0438b42121a66a3a667de17e779330fc0f20b0d97d59d2f2121e182b0505e4", size = 55847, upload-time = "2024-09-04T09:06:21.407Z" }, - { url = "https://files.pythonhosted.org/packages/91/84/32232502020bd78d1d12be7afde15811c64a95ed1f606c10456db4e4c3ac/kiwisolver-1.4.7-cp39-cp39-win_arm64.whl", hash = "sha256:764202cc7e70f767dab49e8df52c7455e8de0df5d858fa801a11aa0d882ccf3f", size = 48494, upload-time = "2024-09-04T09:06:22.648Z" }, - { url = "https://files.pythonhosted.org/packages/ac/59/741b79775d67ab67ced9bb38552da688c0305c16e7ee24bba7a2be253fb7/kiwisolver-1.4.7-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:94252291e3fe68001b1dd747b4c0b3be12582839b95ad4d1b641924d68fd4643", size = 59491, upload-time = "2024-09-04T09:06:24.188Z" }, - { url = 
"https://files.pythonhosted.org/packages/58/cc/fb239294c29a5656e99e3527f7369b174dd9cc7c3ef2dea7cb3c54a8737b/kiwisolver-1.4.7-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5b7dfa3b546da08a9f622bb6becdb14b3e24aaa30adba66749d38f3cc7ea9706", size = 57648, upload-time = "2024-09-04T09:06:25.559Z" }, - { url = "https://files.pythonhosted.org/packages/3b/ef/2f009ac1f7aab9f81efb2d837301d255279d618d27b6015780115ac64bdd/kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bd3de6481f4ed8b734da5df134cd5a6a64fe32124fe83dde1e5b5f29fe30b1e6", size = 84257, upload-time = "2024-09-04T09:06:27.038Z" }, - { url = "https://files.pythonhosted.org/packages/81/e1/c64f50987f85b68b1c52b464bb5bf73e71570c0f7782d626d1eb283ad620/kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a91b5f9f1205845d488c928e8570dcb62b893372f63b8b6e98b863ebd2368ff2", size = 80906, upload-time = "2024-09-04T09:06:28.48Z" }, - { url = "https://files.pythonhosted.org/packages/fd/71/1687c5c0a0be2cee39a5c9c389e546f9c6e215e46b691d00d9f646892083/kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40fa14dbd66b8b8f470d5fc79c089a66185619d31645f9b0773b88b19f7223c4", size = 79951, upload-time = "2024-09-04T09:06:29.966Z" }, - { url = "https://files.pythonhosted.org/packages/ea/8b/d7497df4a1cae9367adf21665dd1f896c2a7aeb8769ad77b662c5e2bcce7/kiwisolver-1.4.7-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:eb542fe7933aa09d8d8f9d9097ef37532a7df6497819d16efe4359890a2f417a", size = 55715, upload-time = "2024-09-04T09:06:31.489Z" }, - { url = "https://files.pythonhosted.org/packages/d5/df/ce37d9b26f07ab90880923c94d12a6ff4d27447096b4c849bfc4339ccfdf/kiwisolver-1.4.7-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8b01aac285f91ca889c800042c35ad3b239e704b150cfd3382adfc9dcc780e39", size = 58666, upload-time = "2024-09-04T09:06:43.756Z" }, - { url = "https://files.pythonhosted.org/packages/b0/d3/e4b04f43bc629ac8e186b77b2b1a251cdfa5b7610fa189dc0db622672ce6/kiwisolver-1.4.7-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:48be928f59a1f5c8207154f935334d374e79f2b5d212826307d072595ad76a2e", size = 57088, upload-time = "2024-09-04T09:06:45.406Z" }, - { url = "https://files.pythonhosted.org/packages/30/1c/752df58e2d339e670a535514d2db4fe8c842ce459776b8080fbe08ebb98e/kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f37cfe618a117e50d8c240555331160d73d0411422b59b5ee217843d7b693608", size = 84321, upload-time = "2024-09-04T09:06:47.557Z" }, - { url = "https://files.pythonhosted.org/packages/f0/f8/fe6484e847bc6e238ec9f9828089fb2c0bb53f2f5f3a79351fde5b565e4f/kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:599b5c873c63a1f6ed7eead644a8a380cfbdf5db91dcb6f85707aaab213b1674", size = 80776, upload-time = "2024-09-04T09:06:49.235Z" }, - { url = "https://files.pythonhosted.org/packages/9b/57/d7163c0379f250ef763aba85330a19feefb5ce6cb541ade853aaba881524/kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:801fa7802e5cfabe3ab0c81a34c323a319b097dfb5004be950482d882f3d7225", size = 79984, upload-time = "2024-09-04T09:06:51.336Z" }, - { url = "https://files.pythonhosted.org/packages/8c/95/4a103776c265d13b3d2cd24fb0494d4e04ea435a8ef97e1b2c026d43250b/kiwisolver-1.4.7-pp39-pypy39_pp73-win_amd64.whl", hash = 
"sha256:0c6c43471bc764fad4bc99c5c2d6d16a676b1abf844ca7c8702bdae92df01ee0", size = 55811, upload-time = "2024-09-04T09:06:53.078Z" }, -] - [[package]] name = "kiwisolver" version = "1.4.8" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12' and sys_platform != 'darwin'", - "python_full_version >= '3.12' and sys_platform == 'darwin'", - "python_full_version == '3.11.*' and sys_platform != 'darwin'", - "python_full_version == '3.11.*' and sys_platform == 'darwin'", - "python_full_version == '3.10.*' and sys_platform != 'darwin'", - "python_full_version == '3.10.*' and sys_platform == 'darwin'", -] sdist = { url = "https://files.pythonhosted.org/packages/82/59/7c91426a8ac292e1cdd53a63b6d9439abd573c875c3f92c146767dd33faf/kiwisolver-1.4.8.tar.gz", hash = "sha256:23d5f023bdc8c7e54eb65f03ca5d5bb25b601eac4d7f1a042888a1f45237987e", size = 97538, upload-time = "2024-12-24T18:30:51.519Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/47/5f/4d8e9e852d98ecd26cdf8eaf7ed8bc33174033bba5e07001b289f07308fd/kiwisolver-1.4.8-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:88c6f252f6816a73b1f8c904f7bbe02fd67c09a69f7cb8a0eecdbf5ce78e63db", size = 124623, upload-time = "2024-12-24T18:28:17.687Z" }, @@ -957,92 +697,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/82/8be4c96ffee03c5b4a034e60a31294daf481e12c7c43ab8e34a1453ee48b/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48", size = 23352, upload-time = "2024-10-18T15:21:20.971Z" }, { url = "https://files.pythonhosted.org/packages/51/ae/97827349d3fcffee7e184bdf7f41cd6b88d9919c80f0263ba7acd1bbcb18/MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30", size = 15097, upload-time = "2024-10-18T15:21:22.646Z" }, { url = "https://files.pythonhosted.org/packages/c1/80/a61f99dc3a936413c3ee4e1eecac96c0da5ed07ad56fd975f1a9da5bc630/MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87", size = 15601, upload-time = "2024-10-18T15:21:23.499Z" }, - { url = "https://files.pythonhosted.org/packages/a7/ea/9b1530c3fdeeca613faeb0fb5cbcf2389d816072fab72a71b45749ef6062/MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a", size = 14344, upload-time = "2024-10-18T15:21:43.721Z" }, - { url = "https://files.pythonhosted.org/packages/4b/c2/fbdbfe48848e7112ab05e627e718e854d20192b674952d9042ebd8c9e5de/MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff", size = 12389, upload-time = "2024-10-18T15:21:44.666Z" }, - { url = "https://files.pythonhosted.org/packages/f0/25/7a7c6e4dbd4f867d95d94ca15449e91e52856f6ed1905d58ef1de5e211d0/MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13", size = 21607, upload-time = "2024-10-18T15:21:45.452Z" }, - { url = "https://files.pythonhosted.org/packages/53/8f/f339c98a178f3c1e545622206b40986a4c3307fe39f70ccd3d9df9a9e425/MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144", size = 20728, upload-time = "2024-10-18T15:21:46.295Z" }, - { url = 
"https://files.pythonhosted.org/packages/1a/03/8496a1a78308456dbd50b23a385c69b41f2e9661c67ea1329849a598a8f9/MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29", size = 20826, upload-time = "2024-10-18T15:21:47.134Z" }, - { url = "https://files.pythonhosted.org/packages/e6/cf/0a490a4bd363048c3022f2f475c8c05582179bb179defcee4766fb3dcc18/MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0", size = 21843, upload-time = "2024-10-18T15:21:48.334Z" }, - { url = "https://files.pythonhosted.org/packages/19/a3/34187a78613920dfd3cdf68ef6ce5e99c4f3417f035694074beb8848cd77/MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0", size = 21219, upload-time = "2024-10-18T15:21:49.587Z" }, - { url = "https://files.pythonhosted.org/packages/17/d8/5811082f85bb88410ad7e452263af048d685669bbbfb7b595e8689152498/MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178", size = 20946, upload-time = "2024-10-18T15:21:50.441Z" }, - { url = "https://files.pythonhosted.org/packages/7c/31/bd635fb5989440d9365c5e3c47556cfea121c7803f5034ac843e8f37c2f2/MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f", size = 15063, upload-time = "2024-10-18T15:21:51.385Z" }, - { url = "https://files.pythonhosted.org/packages/b3/73/085399401383ce949f727afec55ec3abd76648d04b9f22e1c0e99cb4bec3/MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a", size = 15506, upload-time = "2024-10-18T15:21:52.974Z" }, -] - -[[package]] -name = "matplotlib" -version = "3.9.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10' and sys_platform != 'darwin'", - "python_full_version < '3.10' and sys_platform == 'darwin'", -] -dependencies = [ - { name = "contourpy", version = "1.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "cycler", marker = "python_full_version < '3.10'" }, - { name = "fonttools", marker = "python_full_version < '3.10'" }, - { name = "importlib-resources", marker = "python_full_version < '3.10'" }, - { name = "kiwisolver", version = "1.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", marker = "python_full_version < '3.10'" }, - { name = "packaging", marker = "python_full_version < '3.10'" }, - { name = "pillow", marker = "python_full_version < '3.10'" }, - { name = "pyparsing", marker = "python_full_version < '3.10'" }, - { name = "python-dateutil", marker = "python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/df/17/1747b4154034befd0ed33b52538f5eb7752d05bb51c5e2a31470c3bc7d52/matplotlib-3.9.4.tar.gz", hash = "sha256:1e00e8be7393cbdc6fedfa8a6fba02cf3e83814b285db1c60b906a023ba41bc3", size = 36106529, upload-time = "2024-12-13T05:56:34.184Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/94/27d2e2c30d54b56c7b764acc1874a909e34d1965a427fc7092bb6a588b63/matplotlib-3.9.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:c5fdd7abfb706dfa8d307af64a87f1a862879ec3cd8d0ec8637458f0885b9c50", size = 
7885089, upload-time = "2024-12-13T05:54:24.224Z" }, - { url = "https://files.pythonhosted.org/packages/c6/25/828273307e40a68eb8e9df832b6b2aaad075864fdc1de4b1b81e40b09e48/matplotlib-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d89bc4e85e40a71d1477780366c27fb7c6494d293e1617788986f74e2a03d7ff", size = 7770600, upload-time = "2024-12-13T05:54:27.214Z" }, - { url = "https://files.pythonhosted.org/packages/f2/65/f841a422ec994da5123368d76b126acf4fc02ea7459b6e37c4891b555b83/matplotlib-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ddf9f3c26aae695c5daafbf6b94e4c1a30d6cd617ba594bbbded3b33a1fcfa26", size = 8200138, upload-time = "2024-12-13T05:54:29.497Z" }, - { url = "https://files.pythonhosted.org/packages/07/06/272aca07a38804d93b6050813de41ca7ab0e29ba7a9dd098e12037c919a9/matplotlib-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18ebcf248030173b59a868fda1fe42397253f6698995b55e81e1f57431d85e50", size = 8312711, upload-time = "2024-12-13T05:54:34.396Z" }, - { url = "https://files.pythonhosted.org/packages/98/37/f13e23b233c526b7e27ad61be0a771894a079e0f7494a10d8d81557e0e9a/matplotlib-3.9.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:974896ec43c672ec23f3f8c648981e8bc880ee163146e0312a9b8def2fac66f5", size = 9090622, upload-time = "2024-12-13T05:54:36.808Z" }, - { url = "https://files.pythonhosted.org/packages/4f/8c/b1f5bd2bd70e60f93b1b54c4d5ba7a992312021d0ddddf572f9a1a6d9348/matplotlib-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:4598c394ae9711cec135639374e70871fa36b56afae17bdf032a345be552a88d", size = 7828211, upload-time = "2024-12-13T05:54:40.596Z" }, - { url = "https://files.pythonhosted.org/packages/74/4b/65be7959a8fa118a3929b49a842de5b78bb55475236fcf64f3e308ff74a0/matplotlib-3.9.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d4dd29641d9fb8bc4492420c5480398dd40a09afd73aebe4eb9d0071a05fbe0c", size = 7894430, upload-time = "2024-12-13T05:54:44.049Z" }, - { url = "https://files.pythonhosted.org/packages/e9/18/80f70d91896e0a517b4a051c3fd540daa131630fd75e02e250365353b253/matplotlib-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30e5b22e8bcfb95442bf7d48b0d7f3bdf4a450cbf68986ea45fca3d11ae9d099", size = 7780045, upload-time = "2024-12-13T05:54:46.414Z" }, - { url = "https://files.pythonhosted.org/packages/a2/73/ccb381026e3238c5c25c3609ba4157b2d1a617ec98d65a8b4ee4e1e74d02/matplotlib-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bb0030d1d447fd56dcc23b4c64a26e44e898f0416276cac1ebc25522e0ac249", size = 8209906, upload-time = "2024-12-13T05:54:49.459Z" }, - { url = "https://files.pythonhosted.org/packages/ab/33/1648da77b74741c89f5ea95cbf42a291b4b364f2660b316318811404ed97/matplotlib-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aca90ed222ac3565d2752b83dbb27627480d27662671e4d39da72e97f657a423", size = 8322873, upload-time = "2024-12-13T05:54:53.066Z" }, - { url = "https://files.pythonhosted.org/packages/57/d3/8447ba78bc6593c9044c372d1609f8ea10fb1e071e7a9e0747bea74fc16c/matplotlib-3.9.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a181b2aa2906c608fcae72f977a4a2d76e385578939891b91c2550c39ecf361e", size = 9099566, upload-time = "2024-12-13T05:54:55.522Z" }, - { url = "https://files.pythonhosted.org/packages/23/e1/4f0e237bf349c02ff9d1b6e7109f1a17f745263809b9714a8576dc17752b/matplotlib-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:1f6882828231eca17f501c4dcd98a05abb3f03d157fbc0769c6911fe08b6cfd3", size = 7838065, 
upload-time = "2024-12-13T05:54:58.337Z" }, - { url = "https://files.pythonhosted.org/packages/1a/2b/c918bf6c19d6445d1cefe3d2e42cb740fb997e14ab19d4daeb6a7ab8a157/matplotlib-3.9.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:dfc48d67e6661378a21c2983200a654b72b5c5cdbd5d2cf6e5e1ece860f0cc70", size = 7891131, upload-time = "2024-12-13T05:55:02.837Z" }, - { url = "https://files.pythonhosted.org/packages/c1/e5/b4e8fc601ca302afeeabf45f30e706a445c7979a180e3a978b78b2b681a4/matplotlib-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:47aef0fab8332d02d68e786eba8113ffd6f862182ea2999379dec9e237b7e483", size = 7776365, upload-time = "2024-12-13T05:55:05.158Z" }, - { url = "https://files.pythonhosted.org/packages/99/06/b991886c506506476e5d83625c5970c656a491b9f80161458fed94597808/matplotlib-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fba1f52c6b7dc764097f52fd9ab627b90db452c9feb653a59945de16752e965f", size = 8200707, upload-time = "2024-12-13T05:55:09.48Z" }, - { url = "https://files.pythonhosted.org/packages/c3/e2/556b627498cb27e61026f2d1ba86a78ad1b836fef0996bef5440e8bc9559/matplotlib-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:173ac3748acaac21afcc3fa1633924609ba1b87749006bc25051c52c422a5d00", size = 8313761, upload-time = "2024-12-13T05:55:12.95Z" }, - { url = "https://files.pythonhosted.org/packages/58/ff/165af33ec766ff818306ea88e91f9f60d2a6ed543be1eb122a98acbf3b0d/matplotlib-3.9.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:320edea0cadc07007765e33f878b13b3738ffa9745c5f707705692df70ffe0e0", size = 9095284, upload-time = "2024-12-13T05:55:16.199Z" }, - { url = "https://files.pythonhosted.org/packages/9f/8b/3d0c7a002db3b1ed702731c2a9a06d78d035f1f2fb0fb936a8e43cc1e9f4/matplotlib-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a4a4cfc82330b27042a7169533da7991e8789d180dd5b3daeaee57d75cd5a03b", size = 7841160, upload-time = "2024-12-13T05:55:19.991Z" }, - { url = "https://files.pythonhosted.org/packages/56/eb/501b465c9fef28f158e414ea3a417913dc2ac748564c7ed41535f23445b4/matplotlib-3.9.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:3c3724d89a387ddf78ff88d2a30ca78ac2b4c89cf37f2db4bd453c34799e933c", size = 7885919, upload-time = "2024-12-13T05:55:59.66Z" }, - { url = "https://files.pythonhosted.org/packages/da/36/236fbd868b6c91309a5206bd90c3f881f4f44b2d997cd1d6239ef652f878/matplotlib-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d5f0a8430ffe23d7e32cfd86445864ccad141797f7d25b7c41759a5b5d17cfd7", size = 7771486, upload-time = "2024-12-13T05:56:04.264Z" }, - { url = "https://files.pythonhosted.org/packages/e0/4b/105caf2d54d5ed11d9f4335398f5103001a03515f2126c936a752ccf1461/matplotlib-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6bb0141a21aef3b64b633dc4d16cbd5fc538b727e4958be82a0e1c92a234160e", size = 8201838, upload-time = "2024-12-13T05:56:06.792Z" }, - { url = "https://files.pythonhosted.org/packages/5d/a7/bb01188fb4013d34d274caf44a2f8091255b0497438e8b6c0a7c1710c692/matplotlib-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57aa235109e9eed52e2c2949db17da185383fa71083c00c6c143a60e07e0888c", size = 8314492, upload-time = "2024-12-13T05:56:09.964Z" }, - { url = "https://files.pythonhosted.org/packages/33/19/02e1a37f7141fc605b193e927d0a9cdf9dc124a20b9e68793f4ffea19695/matplotlib-3.9.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b18c600061477ccfdd1e6fd050c33d8be82431700f3452b297a56d9ed7037abb", size = 9092500, upload-time = 
"2024-12-13T05:56:13.55Z" }, - { url = "https://files.pythonhosted.org/packages/57/68/c2feb4667adbf882ffa4b3e0ac9967f848980d9f8b5bebd86644aa67ce6a/matplotlib-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:ef5f2d1b67d2d2145ff75e10f8c008bfbf71d45137c4b648c87193e7dd053eac", size = 7822962, upload-time = "2024-12-13T05:56:16.358Z" }, - { url = "https://files.pythonhosted.org/packages/0c/22/2ef6a364cd3f565442b0b055e0599744f1e4314ec7326cdaaa48a4d864d7/matplotlib-3.9.4-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:44e0ed786d769d85bc787b0606a53f2d8d2d1d3c8a2608237365e9121c1a338c", size = 7877995, upload-time = "2024-12-13T05:56:18.805Z" }, - { url = "https://files.pythonhosted.org/packages/87/b8/2737456e566e9f4d94ae76b8aa0d953d9acb847714f9a7ad80184474f5be/matplotlib-3.9.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:09debb9ce941eb23ecdbe7eab972b1c3e0276dcf01688073faff7b0f61d6c6ca", size = 7769300, upload-time = "2024-12-13T05:56:21.315Z" }, - { url = "https://files.pythonhosted.org/packages/b2/1f/e709c6ec7b5321e6568769baa288c7178e60a93a9da9e682b39450da0e29/matplotlib-3.9.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcc53cf157a657bfd03afab14774d54ba73aa84d42cfe2480c91bd94873952db", size = 8313423, upload-time = "2024-12-13T05:56:26.719Z" }, - { url = "https://files.pythonhosted.org/packages/5e/b6/5a1f868782cd13f053a679984e222007ecff654a9bfbac6b27a65f4eeb05/matplotlib-3.9.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ad45da51be7ad02387801fd154ef74d942f49fe3fcd26a64c94842ba7ec0d865", size = 7854624, upload-time = "2024-12-13T05:56:29.359Z" }, ] [[package]] name = "matplotlib" version = "3.10.3" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12' and sys_platform != 'darwin'", - "python_full_version >= '3.12' and sys_platform == 'darwin'", - "python_full_version == '3.11.*' and sys_platform != 'darwin'", - "python_full_version == '3.11.*' and sys_platform == 'darwin'", - "python_full_version == '3.10.*' and sys_platform != 'darwin'", - "python_full_version == '3.10.*' and sys_platform == 'darwin'", -] dependencies = [ - { name = "contourpy", version = "1.3.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "cycler", marker = "python_full_version >= '3.10'" }, - { name = "fonttools", marker = "python_full_version >= '3.10'" }, - { name = "kiwisolver", version = "1.4.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "numpy", marker = "python_full_version >= '3.10'" }, - { name = "packaging", marker = "python_full_version >= '3.10'" }, - { name = "pillow", marker = "python_full_version >= '3.10'" }, - { name = "pyparsing", marker = "python_full_version >= '3.10'" }, - { name = "python-dateutil", marker = "python_full_version >= '3.10'" }, + { name = "contourpy" }, + { name = "cycler" }, + { name = "fonttools" }, + { name = "kiwisolver" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pillow" }, + { name = "pyparsing" }, + { name = "python-dateutil" }, ] sdist = { url = "https://files.pythonhosted.org/packages/26/91/d49359a21893183ed2a5b6c76bec40e0b1dcbf8ca148f864d134897cfc75/matplotlib-3.10.3.tar.gz", hash = "sha256:2f82d2c5bb7ae93aaaa4cd42aca65d76ce6376f83304fa3a630b569aca274df0", size = 34799811, upload-time = "2025-05-08T19:10:54.39Z" } wheels = [ @@ -1132,16 +802,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b8/d0/0cf4a6ecb9bc960d624c93effaeaae75cbf00b3bc4a54f35c8507273cda1/msgpack-1.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb29aaa613c0a1c40d1af111abf025f1732cab333f96f285d6a93b934738a68a", size = 419883, upload-time = "2025-06-13T06:52:12.806Z" }, { url = "https://files.pythonhosted.org/packages/62/83/9697c211720fa71a2dfb632cad6196a8af3abea56eece220fde4674dc44b/msgpack-1.1.1-cp312-cp312-win32.whl", hash = "sha256:870b9a626280c86cff9c576ec0d9cbcc54a1e5ebda9cd26dab12baf41fee218c", size = 65406, upload-time = "2025-06-13T06:52:14.271Z" }, { url = "https://files.pythonhosted.org/packages/c0/23/0abb886e80eab08f5e8c485d6f13924028602829f63b8f5fa25a06636628/msgpack-1.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:5692095123007180dca3e788bb4c399cc26626da51629a31d40207cb262e67f4", size = 72558, upload-time = "2025-06-13T06:52:15.252Z" }, - { url = "https://files.pythonhosted.org/packages/1f/bd/0792be119d7fe7dc2148689ef65c90507d82d20a204aab3b98c74a1f8684/msgpack-1.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f5be6b6bc52fad84d010cb45433720327ce886009d862f46b26d4d154001994b", size = 81882, upload-time = "2025-06-13T06:52:39.316Z" }, - { url = "https://files.pythonhosted.org/packages/75/77/ce06c8e26a816ae8730a8e030d263c5289adcaff9f0476f9b270bdd7c5c2/msgpack-1.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3a89cd8c087ea67e64844287ea52888239cbd2940884eafd2dcd25754fb72232", size = 78414, upload-time = "2025-06-13T06:52:40.341Z" }, - { url = "https://files.pythonhosted.org/packages/73/27/190576c497677fb4a0d05d896b24aea6cdccd910f206aaa7b511901befed/msgpack-1.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d75f3807a9900a7d575d8d6674a3a47e9f227e8716256f35bc6f03fc597ffbf", size = 400927, upload-time = "2025-06-13T06:52:41.399Z" }, - { url = "https://files.pythonhosted.org/packages/ed/af/6a0aa5a06762e70726ec3c10fb966600d84a7220b52635cb0ab2dc64d32f/msgpack-1.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d182dac0221eb8faef2e6f44701812b467c02674a322c739355c39e94730cdbf", size = 405903, upload-time = "2025-06-13T06:52:42.699Z" }, - { url = "https://files.pythonhosted.org/packages/1e/80/3f3da358cecbbe8eb12360814bd1277d59d2608485934742a074d99894a9/msgpack-1.1.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b13fe0fb4aac1aa5320cd693b297fe6fdef0e7bea5518cbc2dd5299f873ae90", size = 393192, upload-time = "2025-06-13T06:52:43.986Z" }, - { url = "https://files.pythonhosted.org/packages/98/c6/3a0ec7fdebbb4f3f8f254696cd91d491c29c501dbebd86286c17e8f68cd7/msgpack-1.1.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:435807eeb1bc791ceb3247d13c79868deb22184e1fc4224808750f0d7d1affc1", size = 393851, upload-time = "2025-06-13T06:52:45.177Z" }, - { url = "https://files.pythonhosted.org/packages/39/37/df50d5f8e68514b60fbe70f6e8337ea2b32ae2be030871bcd9d1cf7d4b62/msgpack-1.1.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4835d17af722609a45e16037bb1d4d78b7bdf19d6c0128116d178956618c4e88", size = 400292, upload-time = "2025-06-13T06:52:46.381Z" }, - { url = "https://files.pythonhosted.org/packages/fc/ec/1e067292e02d2ceb4c8cb5ba222c4f7bb28730eef5676740609dc2627e0f/msgpack-1.1.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a8ef6e342c137888ebbfb233e02b8fbd689bb5b5fcc59b34711ac47ebd504478", size = 401873, upload-time = "2025-06-13T06:52:47.957Z" }, - { url = 
"https://files.pythonhosted.org/packages/d3/31/e8c9c6b5b58d64c9efa99c8d181fcc25f38ead357b0360379fbc8a4234ad/msgpack-1.1.1-cp39-cp39-win32.whl", hash = "sha256:61abccf9de335d9efd149e2fff97ed5974f2481b3353772e8e2dd3402ba2bd57", size = 65028, upload-time = "2025-06-13T06:52:49.166Z" }, - { url = "https://files.pythonhosted.org/packages/20/d6/cd62cded572e5e25892747a5d27850170bcd03c855e9c69c538e024de6f9/msgpack-1.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:40eae974c873b2992fd36424a5d9407f93e97656d999f43fca9d29f820899084", size = 71700, upload-time = "2025-06-13T06:52:50.244Z" }, ] [[package]] @@ -1174,12 +834,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/15/f8/491997a9b8a554204f834ed4816bda813aefda31cf873bb099deee3c9a99/mypy-1.16.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0ab5eca37b50188163fa7c1b73c685ac66c4e9bdee4a85c9adac0e91d8895e15", size = 12722980, upload-time = "2025-06-16T16:37:40.929Z" }, { url = "https://files.pythonhosted.org/packages/df/f0/2bd41e174b5fd93bc9de9a28e4fb673113633b8a7f3a607fa4a73595e468/mypy-1.16.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:dedb6229b2c9086247e21a83c309754b9058b438704ad2f6807f0d8227f6ebdd", size = 12903328, upload-time = "2025-06-16T16:34:35.099Z" }, { url = "https://files.pythonhosted.org/packages/61/81/5572108a7bec2c46b8aff7e9b524f371fe6ab5efb534d38d6b37b5490da8/mypy-1.16.1-cp312-cp312-win_amd64.whl", hash = "sha256:1f0435cf920e287ff68af3d10a118a73f212deb2ce087619eb4e648116d1fe9b", size = 9562321, upload-time = "2025-06-16T16:48:58.823Z" }, - { url = "https://files.pythonhosted.org/packages/49/5e/ed1e6a7344005df11dfd58b0fdd59ce939a0ba9f7ed37754bf20670b74db/mypy-1.16.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7fc688329af6a287567f45cc1cefb9db662defeb14625213a5b7da6e692e2069", size = 10959511, upload-time = "2025-06-16T16:47:21.945Z" }, - { url = "https://files.pythonhosted.org/packages/30/88/a7cbc2541e91fe04f43d9e4577264b260fecedb9bccb64ffb1a34b7e6c22/mypy-1.16.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5e198ab3f55924c03ead626ff424cad1732d0d391478dfbf7bb97b34602395da", size = 10075555, upload-time = "2025-06-16T16:50:14.084Z" }, - { url = "https://files.pythonhosted.org/packages/93/f7/c62b1e31a32fbd1546cca5e0a2e5f181be5761265ad1f2e94f2a306fa906/mypy-1.16.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09aa4f91ada245f0a45dbc47e548fd94e0dd5a8433e0114917dc3b526912a30c", size = 11874169, upload-time = "2025-06-16T16:49:42.276Z" }, - { url = "https://files.pythonhosted.org/packages/c8/15/db580a28034657fb6cb87af2f8996435a5b19d429ea4dcd6e1c73d418e60/mypy-1.16.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13c7cd5b1cb2909aa318a90fd1b7e31f17c50b242953e7dd58345b2a814f6383", size = 12610060, upload-time = "2025-06-16T16:34:15.215Z" }, - { url = "https://files.pythonhosted.org/packages/ec/78/c17f48f6843048fa92d1489d3095e99324f2a8c420f831a04ccc454e2e51/mypy-1.16.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:58e07fb958bc5d752a280da0e890c538f1515b79a65757bbdc54252ba82e0b40", size = 12875199, upload-time = "2025-06-16T16:35:14.448Z" }, - { url = "https://files.pythonhosted.org/packages/bc/d6/ed42167d0a42680381653fd251d877382351e1bd2c6dd8a818764be3beb1/mypy-1.16.1-cp39-cp39-win_amd64.whl", hash = "sha256:f895078594d918f93337a505f8add9bd654d1a24962b4c6ed9390e12531eb31b", size = 9487033, upload-time = "2025-06-16T16:49:57.907Z" }, { url = 
"https://files.pythonhosted.org/packages/cf/d3/53e684e78e07c1a2bf7105715e5edd09ce951fc3f47cf9ed095ec1b7a037/mypy-1.16.1-py3-none-any.whl", hash = "sha256:5fc2ac4027d0ef28d6ba69a0343737a23c4d1b83672bf38d1fe237bdc0643b37", size = 2265923, upload-time = "2025-06-16T16:48:02.366Z" }, ] @@ -1240,17 +894,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/8c/2ba3902e1a0fc1c74962ea9bb33a534bb05984ad7ff9515bf8d07527cadd/numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", size = 17786643, upload-time = "2024-02-05T23:57:56.585Z" }, { url = "https://files.pythonhosted.org/packages/28/4a/46d9e65106879492374999e76eb85f87b15328e06bd1550668f79f7b18c6/numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", size = 5677803, upload-time = "2024-02-05T23:58:08.963Z" }, { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754, upload-time = "2024-02-05T23:58:36.364Z" }, - { url = "https://files.pythonhosted.org/packages/7d/24/ce71dc08f06534269f66e73c04f5709ee024a1afe92a7b6e1d73f158e1f8/numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c", size = 20636301, upload-time = "2024-02-05T23:59:10.976Z" }, - { url = "https://files.pythonhosted.org/packages/ae/8c/ab03a7c25741f9ebc92684a20125fbc9fc1b8e1e700beb9197d750fdff88/numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be", size = 13971216, upload-time = "2024-02-05T23:59:35.472Z" }, - { url = "https://files.pythonhosted.org/packages/6d/64/c3bcdf822269421d85fe0d64ba972003f9bb4aa9a419da64b86856c9961f/numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764", size = 14226281, upload-time = "2024-02-05T23:59:59.372Z" }, - { url = "https://files.pythonhosted.org/packages/54/30/c2a907b9443cf42b90c17ad10c1e8fa801975f01cb9764f3f8eb8aea638b/numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3", size = 18249516, upload-time = "2024-02-06T00:00:32.79Z" }, - { url = "https://files.pythonhosted.org/packages/43/12/01a563fc44c07095996d0129b8899daf89e4742146f7044cdbdb3101c57f/numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd", size = 13882132, upload-time = "2024-02-06T00:00:58.197Z" }, - { url = "https://files.pythonhosted.org/packages/16/ee/9df80b06680aaa23fc6c31211387e0db349e0e36d6a63ba3bd78c5acdf11/numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c", size = 18084181, upload-time = "2024-02-06T00:01:31.21Z" }, - { url = "https://files.pythonhosted.org/packages/28/7d/4b92e2fe20b214ffca36107f1a3e75ef4c488430e64de2d9af5db3a4637d/numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6", size = 5976360, upload-time = "2024-02-06T00:01:43.013Z" }, - { url = 
"https://files.pythonhosted.org/packages/b5/42/054082bd8220bbf6f297f982f0a8f5479fcbc55c8b511d928df07b965869/numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea", size = 15814633, upload-time = "2024-02-06T00:02:16.694Z" }, - { url = "https://files.pythonhosted.org/packages/3f/72/3df6c1c06fc83d9cfe381cccb4be2532bbd38bf93fbc9fad087b6687f1c0/numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30", size = 20455961, upload-time = "2024-02-06T00:03:05.993Z" }, - { url = "https://files.pythonhosted.org/packages/8e/02/570545bac308b58ffb21adda0f4e220ba716fb658a63c151daecc3293350/numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c", size = 18061071, upload-time = "2024-02-06T00:03:41.5Z" }, - { url = "https://files.pythonhosted.org/packages/f4/5f/fafd8c51235f60d49f7a88e2275e13971e90555b67da52dd6416caec32fe/numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0", size = 15709730, upload-time = "2024-02-06T00:04:11.719Z" }, ] [[package]] @@ -1278,10 +921,8 @@ source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version == '3.11.*' and sys_platform != 'darwin'", "python_full_version == '3.11.*' and sys_platform == 'darwin'", - "python_full_version == '3.10.*' and sys_platform != 'darwin'", - "python_full_version == '3.10.*' and sys_platform == 'darwin'", - "python_full_version < '3.10' and sys_platform != 'darwin'", - "python_full_version < '3.10' and sys_platform == 'darwin'", + "python_full_version < '3.11' and sys_platform != 'darwin'", + "python_full_version < '3.11' and sys_platform == 'darwin'", ] dependencies = [ { name = "numpy", marker = "python_full_version < '3.12'" }, @@ -1302,13 +943,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/63/8d/c2bd356b9d4baf1c5cf8d7e251fb4540e87083072c905430da48c2bb31eb/pandas-1.5.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae", size = 11374218, upload-time = "2023-01-19T08:30:00.5Z" }, { url = "https://files.pythonhosted.org/packages/56/73/3351beeb807dca69fcc3c4966bcccc51552bd01549a9b13c04ab00a43f21/pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6", size = 12017319, upload-time = "2023-01-19T08:30:06.097Z" }, { url = "https://files.pythonhosted.org/packages/da/6d/1235da14daddaa6e47f74ba0c255358f0ce7a6ee05da8bf8eb49161aa6b5/pandas-1.5.3-cp311-cp311-win_amd64.whl", hash = "sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003", size = 10303385, upload-time = "2023-01-19T08:30:11.148Z" }, - { url = "https://files.pythonhosted.org/packages/90/19/1a92d73cda1233326e787a4c14362a1fcce4c7d9f28316fd769308aefb99/pandas-1.5.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa", size = 18722090, upload-time = "2023-01-19T08:31:03.457Z" }, - { url = "https://files.pythonhosted.org/packages/02/4a/8e2513db9d15929b833147f975d8424dc6a3e18100ead10aab78756a1aad/pandas-1.5.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee", size = 12049642, upload-time = 
"2023-01-19T08:31:09.324Z" }, - { url = "https://files.pythonhosted.org/packages/a7/2b/c71df8794e8e75ba1ec9da1c1a2efc946590aa79a05148a4138405ef5f72/pandas-1.5.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a", size = 10962439, upload-time = "2023-01-19T08:31:14.872Z" }, - { url = "https://files.pythonhosted.org/packages/7d/d6/92be61dca3880c7cec99a9b4acf6260b3dc00519673fdb3e6666ac6096ce/pandas-1.5.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0", size = 11471277, upload-time = "2023-01-19T08:31:19.706Z" }, - { url = "https://files.pythonhosted.org/packages/e1/4d/3eb96e53a9208350ee21615f850c4be9a246d32bf1d34cd36682cb58c3b7/pandas-1.5.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5", size = 12169732, upload-time = "2023-01-19T08:31:24.806Z" }, - { url = "https://files.pythonhosted.org/packages/94/85/89f6547642b28fbd874504a6f548d6be4d88981837a23ab18d76cb773bea/pandas-1.5.3-cp39-cp39-win32.whl", hash = "sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a", size = 9730624, upload-time = "2023-01-19T08:31:30.409Z" }, - { url = "https://files.pythonhosted.org/packages/c2/45/801ecd8434eef0b39cc02795ffae273fe3df3cfcb3f6fff215efbe92d93c/pandas-1.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9", size = 10932203, upload-time = "2023-01-19T08:31:35.717Z" }, ] [[package]] @@ -1348,13 +982,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d7/bf/0213986830a92d44d55153c1d69b509431a972eb73f204242988c4e66e86/pandas-2.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:404d681c698e3c8a40a61d0cd9412cc7364ab9a9cc6e144ae2992e11a2e77a20", size = 12470733, upload-time = "2025-06-06T00:00:18.651Z" }, { url = "https://files.pythonhosted.org/packages/a4/0e/21eb48a3a34a7d4bac982afc2c4eb5ab09f2d988bdf29d92ba9ae8e90a79/pandas-2.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6021910b086b3ca756755e86ddc64e0ddafd5e58e076c72cb1585162e5ad259b", size = 13212406, upload-time = "2025-06-05T03:26:55.992Z" }, { url = "https://files.pythonhosted.org/packages/1f/d9/74017c4eec7a28892d8d6e31ae9de3baef71f5a5286e74e6b7aad7f8c837/pandas-2.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:094e271a15b579650ebf4c5155c05dcd2a14fd4fdd72cf4854b2f7ad31ea30be", size = 10976199, upload-time = "2025-06-05T03:26:59.594Z" }, - { url = "https://files.pythonhosted.org/packages/38/86/d786690bd1d666d3369355a173b32a4ab7a83053cbb2d6a24ceeedb31262/pandas-2.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9efc0acbbffb5236fbdf0409c04edce96bec4bdaa649d49985427bd1ec73e085", size = 11552206, upload-time = "2025-06-06T00:00:29.501Z" }, - { url = "https://files.pythonhosted.org/packages/9c/2f/99f581c1c5b013fcfcbf00a48f5464fb0105da99ea5839af955e045ae3ab/pandas-2.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:75651c14fde635e680496148a8526b328e09fe0572d9ae9b638648c46a544ba3", size = 10796831, upload-time = "2025-06-06T00:00:49.502Z" }, - { url = "https://files.pythonhosted.org/packages/5c/be/3ee7f424367e0f9e2daee93a3145a18b703fbf733ba56e1cf914af4b40d1/pandas-2.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf5be867a0541a9fb47a4be0c5790a4bccd5b77b92f0a59eeec9375fafc2aa14", size = 11736943, upload-time = "2025-06-06T00:01:15.992Z" }, - { url = 
"https://files.pythonhosted.org/packages/83/95/81c7bb8f1aefecd948f80464177a7d9a1c5e205c5a1e279984fdacbac9de/pandas-2.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84141f722d45d0c2a89544dd29d35b3abfc13d2250ed7e68394eda7564bd6324", size = 12366679, upload-time = "2025-06-06T00:01:36.162Z" }, - { url = "https://files.pythonhosted.org/packages/d5/7a/54cf52fb454408317136d683a736bb597864db74977efee05e63af0a7d38/pandas-2.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:f95a2aef32614ed86216d3c450ab12a4e82084e8102e355707a1d96e33d51c34", size = 12924072, upload-time = "2025-06-06T00:01:44.243Z" }, - { url = "https://files.pythonhosted.org/packages/0a/bf/25018e431257f8a42c173080f9da7c592508269def54af4a76ccd1c14420/pandas-2.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e0f51973ba93a9f97185049326d75b942b9aeb472bec616a129806facb129ebb", size = 13696374, upload-time = "2025-06-06T00:02:14.346Z" }, - { url = "https://files.pythonhosted.org/packages/db/84/5ffd2c447c02db56326f5c19a235a747fae727e4842cc20e1ddd28f990f6/pandas-2.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:b198687ca9c8529662213538a9bb1e60fa0bf0f6af89292eb68fea28743fcd5a", size = 11104735, upload-time = "2025-06-06T00:02:21.088Z" }, ] [[package]] @@ -1414,17 +1041,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/26/7d/73699ad77895f69edff76b0f332acc3d497f22f5d75e5360f78cbcaff248/pillow-11.3.0-cp312-cp312-win32.whl", hash = "sha256:7b161756381f0918e05e7cb8a371fff367e807770f8fe92ecb20d905d0e1c149", size = 6275079, upload-time = "2025-07-01T09:14:30.104Z" }, { url = "https://files.pythonhosted.org/packages/8c/ce/e7dfc873bdd9828f3b6e5c2bbb74e47a98ec23cc5c74fc4e54462f0d9204/pillow-11.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a6444696fce635783440b7f7a9fc24b3ad10a9ea3f0ab66c5905be1c19ccf17d", size = 6986324, upload-time = "2025-07-01T09:14:31.899Z" }, { url = "https://files.pythonhosted.org/packages/16/8f/b13447d1bf0b1f7467ce7d86f6e6edf66c0ad7cf44cf5c87a37f9bed9936/pillow-11.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:2aceea54f957dd4448264f9bf40875da0415c83eb85f55069d89c0ed436e3542", size = 2423067, upload-time = "2025-07-01T09:14:33.709Z" }, - { url = "https://files.pythonhosted.org/packages/9e/8e/9c089f01677d1264ab8648352dcb7773f37da6ad002542760c80107da816/pillow-11.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:48d254f8a4c776de343051023eb61ffe818299eeac478da55227d96e241de53f", size = 5316478, upload-time = "2025-07-01T09:15:52.209Z" }, - { url = "https://files.pythonhosted.org/packages/b5/a9/5749930caf674695867eb56a581e78eb5f524b7583ff10b01b6e5048acb3/pillow-11.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7aee118e30a4cf54fdd873bd3a29de51e29105ab11f9aad8c32123f58c8f8081", size = 4686522, upload-time = "2025-07-01T09:15:54.162Z" }, - { url = "https://files.pythonhosted.org/packages/43/46/0b85b763eb292b691030795f9f6bb6fcaf8948c39413c81696a01c3577f7/pillow-11.3.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:23cff760a9049c502721bdb743a7cb3e03365fafcdfc2ef9784610714166e5a4", size = 5853376, upload-time = "2025-07-03T13:11:01.066Z" }, - { url = "https://files.pythonhosted.org/packages/5e/c6/1a230ec0067243cbd60bc2dad5dc3ab46a8a41e21c15f5c9b52b26873069/pillow-11.3.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6359a3bc43f57d5b375d1ad54a0074318a0844d11b76abccf478c37c986d3cfc", size = 7626020, upload-time = "2025-07-03T13:11:06.479Z" }, - { url = 
"https://files.pythonhosted.org/packages/63/dd/f296c27ffba447bfad76c6a0c44c1ea97a90cb9472b9304c94a732e8dbfb/pillow-11.3.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:092c80c76635f5ecb10f3f83d76716165c96f5229addbd1ec2bdbbda7d496e06", size = 5956732, upload-time = "2025-07-01T09:15:56.111Z" }, - { url = "https://files.pythonhosted.org/packages/a5/a0/98a3630f0b57f77bae67716562513d3032ae70414fcaf02750279c389a9e/pillow-11.3.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cadc9e0ea0a2431124cde7e1697106471fc4c1da01530e679b2391c37d3fbb3a", size = 6624404, upload-time = "2025-07-01T09:15:58.245Z" }, - { url = "https://files.pythonhosted.org/packages/de/e6/83dfba5646a290edd9a21964da07674409e410579c341fc5b8f7abd81620/pillow-11.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6a418691000f2a418c9135a7cf0d797c1bb7d9a485e61fe8e7722845b95ef978", size = 6067760, upload-time = "2025-07-01T09:16:00.003Z" }, - { url = "https://files.pythonhosted.org/packages/bc/41/15ab268fe6ee9a2bc7391e2bbb20a98d3974304ab1a406a992dcb297a370/pillow-11.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:97afb3a00b65cc0804d1c7abddbf090a81eaac02768af58cbdcaaa0a931e0b6d", size = 6700534, upload-time = "2025-07-01T09:16:02.29Z" }, - { url = "https://files.pythonhosted.org/packages/64/79/6d4f638b288300bed727ff29f2a3cb63db054b33518a95f27724915e3fbc/pillow-11.3.0-cp39-cp39-win32.whl", hash = "sha256:ea944117a7974ae78059fcc1800e5d3295172bb97035c0c1d9345fca1419da71", size = 6277091, upload-time = "2025-07-01T09:16:04.4Z" }, - { url = "https://files.pythonhosted.org/packages/46/05/4106422f45a05716fd34ed21763f8ec182e8ea00af6e9cb05b93a247361a/pillow-11.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:e5c5858ad8ec655450a7c7df532e9842cf8df7cc349df7225c60d5d348c8aada", size = 6986091, upload-time = "2025-07-01T09:16:06.342Z" }, - { url = "https://files.pythonhosted.org/packages/63/c6/287fd55c2c12761d0591549d48885187579b7c257bef0c6660755b0b59ae/pillow-11.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:6abdbfd3aea42be05702a8dd98832329c167ee84400a1d1f61ab11437f1717eb", size = 2422632, upload-time = "2025-07-01T09:16:08.142Z" }, { url = "https://files.pythonhosted.org/packages/6f/8b/209bd6b62ce8367f47e68a218bffac88888fdf2c9fcf1ecadc6c3ec1ebc7/pillow-11.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3cee80663f29e3843b68199b9d6f4f54bd1d4a6b59bdd91bceefc51238bcb967", size = 5270556, upload-time = "2025-07-01T09:16:09.961Z" }, { url = "https://files.pythonhosted.org/packages/2e/e6/231a0b76070c2cfd9e260a7a5b504fb72da0a95279410fa7afd99d9751d6/pillow-11.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b5f56c3f344f2ccaf0dd875d3e180f631dc60a51b314295a3e681fe8cf851fbe", size = 4654625, upload-time = "2025-07-01T09:16:11.913Z" }, { url = "https://files.pythonhosted.org/packages/13/f4/10cf94fda33cb12765f2397fc285fa6d8eb9c29de7f3185165b702fc7386/pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e67d793d180c9df62f1f40aee3accca4829d3794c95098887edc18af4b8b780c", size = 4874207, upload-time = "2025-07-03T13:11:10.201Z" }, @@ -1561,13 +1177,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bb/e6/9b3afbbcf10cc724312e824af94a2e993d8ace22994d823f5c35324cebf5/pyarrow-18.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:d4b3d2a34780645bed6414e22dda55a92e0fcd1b8a637fba86800ad737057e33", size = 38618629, upload-time = "2024-11-26T01:59:59.966Z" }, { url = 
"https://files.pythonhosted.org/packages/3a/2e/3b99f8a3d9e0ccae0e961978a0d0089b25fb46ebbcfb5ebae3cca179a5b3/pyarrow-18.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c52f81aa6f6575058d8e2c782bf79d4f9fdc89887f16825ec3a66607a5dd8e30", size = 40078661, upload-time = "2024-11-26T02:00:04.55Z" }, { url = "https://files.pythonhosted.org/packages/76/52/f8da04195000099d394012b8d42c503d7041b79f778d854f410e5f05049a/pyarrow-18.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:0ad4892617e1a6c7a551cfc827e072a633eaff758fa09f21c4ee548c30bcaf99", size = 25092330, upload-time = "2024-11-26T02:00:09.576Z" }, - { url = "https://files.pythonhosted.org/packages/fd/9b/60516e3876ec6f25b0909afa70f90a15de83b48c7c0d8042fac4e64c4411/pyarrow-18.1.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:0b331e477e40f07238adc7ba7469c36b908f07c89b95dd4bd3a0ec84a3d1e21e", size = 29543752, upload-time = "2024-11-26T02:01:17.179Z" }, - { url = "https://files.pythonhosted.org/packages/14/a7/bd08b6f1a2bd2e71dc6bb0451fc1872607e44c83daf1ee63c82764a2d233/pyarrow-18.1.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:2c4dd0c9010a25ba03e198fe743b1cc03cd33c08190afff371749c52ccbbaf76", size = 30850753, upload-time = "2024-11-26T02:01:21.374Z" }, - { url = "https://files.pythonhosted.org/packages/84/c9/62ef9c6281c0e5b4ee1afa9d7bd556e72e06da6706b7906c32c15e69b3d6/pyarrow-18.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f97b31b4c4e21ff58c6f330235ff893cc81e23da081b1a4b1c982075e0ed4e9", size = 39226870, upload-time = "2024-11-26T02:01:25.533Z" }, - { url = "https://files.pythonhosted.org/packages/b2/99/a6e89e71655a38475e76b060777c8bf69c078b772bec3b7daf7361440f05/pyarrow-18.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a4813cb8ecf1809871fd2d64a8eff740a1bd3691bbe55f01a3cf6c5ec869754", size = 40139114, upload-time = "2024-11-26T02:01:31.317Z" }, - { url = "https://files.pythonhosted.org/packages/64/a9/06d79923890682e4fe7a16524abee307407008a413115354aaf3226b8410/pyarrow-18.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:05a5636ec3eb5cc2a36c6edb534a38ef57b2ab127292a716d00eabb887835f1e", size = 38639231, upload-time = "2024-11-26T02:01:36.734Z" }, - { url = "https://files.pythonhosted.org/packages/3b/8c/4c3ed19026a00740b81fe1c87f3ff235b2763a0a1ddf5711a9d026b775ce/pyarrow-18.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:73eeed32e724ea3568bb06161cad5fa7751e45bc2228e33dcb10c614044165c7", size = 40070949, upload-time = "2024-11-26T02:01:41.703Z" }, - { url = "https://files.pythonhosted.org/packages/87/d8/94161a7ca5c55199484e926165e9e33f318ea1d1b0d7cdbcbc3652b933ec/pyarrow-18.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:a1880dd6772b685e803011a6b43a230c23b566859a6e0c9a276c1e0faf4f4052", size = 25301373, upload-time = "2024-11-26T02:01:46.244Z" }, ] [[package]] @@ -1575,7 +1184,6 @@ name = "pybtex" version = "0.25.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, { name = "latexcodec" }, { name = "pyyaml" }, ] @@ -1662,19 +1270,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b0/6a/adf5734ffd52bf86d865093ad70b2ce543415e0e356f6cacabbc0d9ad910/pydantic_core-2.33.2-cp312-cp312-win32.whl", hash = "sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290", size = 1892628, upload-time = "2025-04-23T18:31:47.819Z" }, { url = 
"https://files.pythonhosted.org/packages/43/e4/5479fecb3606c1368d496a825d8411e126133c41224c1e7238be58b87d7e/pydantic_core-2.33.2-cp312-cp312-win_amd64.whl", hash = "sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2", size = 1955866, upload-time = "2025-04-23T18:31:49.635Z" }, { url = "https://files.pythonhosted.org/packages/0d/24/8b11e8b3e2be9dd82df4b11408a67c61bb4dc4f8e11b5b0fc888b38118b5/pydantic_core-2.33.2-cp312-cp312-win_arm64.whl", hash = "sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab", size = 1888894, upload-time = "2025-04-23T18:31:51.609Z" }, - { url = "https://files.pythonhosted.org/packages/53/ea/bbe9095cdd771987d13c82d104a9c8559ae9aec1e29f139e286fd2e9256e/pydantic_core-2.33.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a2b911a5b90e0374d03813674bf0a5fbbb7741570dcd4b4e85a2e48d17def29d", size = 2028677, upload-time = "2025-04-23T18:32:27.227Z" }, - { url = "https://files.pythonhosted.org/packages/49/1d/4ac5ed228078737d457a609013e8f7edc64adc37b91d619ea965758369e5/pydantic_core-2.33.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6fa6dfc3e4d1f734a34710f391ae822e0a8eb8559a85c6979e14e65ee6ba2954", size = 1864735, upload-time = "2025-04-23T18:32:29.019Z" }, - { url = "https://files.pythonhosted.org/packages/23/9a/2e70d6388d7cda488ae38f57bc2f7b03ee442fbcf0d75d848304ac7e405b/pydantic_core-2.33.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c54c939ee22dc8e2d545da79fc5381f1c020d6d3141d3bd747eab59164dc89fb", size = 1898467, upload-time = "2025-04-23T18:32:31.119Z" }, - { url = "https://files.pythonhosted.org/packages/ff/2e/1568934feb43370c1ffb78a77f0baaa5a8b6897513e7a91051af707ffdc4/pydantic_core-2.33.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:53a57d2ed685940a504248187d5685e49eb5eef0f696853647bf37c418c538f7", size = 1983041, upload-time = "2025-04-23T18:32:33.655Z" }, - { url = "https://files.pythonhosted.org/packages/01/1a/1a1118f38ab64eac2f6269eb8c120ab915be30e387bb561e3af904b12499/pydantic_core-2.33.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09fb9dd6571aacd023fe6aaca316bd01cf60ab27240d7eb39ebd66a3a15293b4", size = 2136503, upload-time = "2025-04-23T18:32:35.519Z" }, - { url = "https://files.pythonhosted.org/packages/5c/da/44754d1d7ae0f22d6d3ce6c6b1486fc07ac2c524ed8f6eca636e2e1ee49b/pydantic_core-2.33.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0e6116757f7959a712db11f3e9c0a99ade00a5bbedae83cb801985aa154f071b", size = 2736079, upload-time = "2025-04-23T18:32:37.659Z" }, - { url = "https://files.pythonhosted.org/packages/4d/98/f43cd89172220ec5aa86654967b22d862146bc4d736b1350b4c41e7c9c03/pydantic_core-2.33.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d55ab81c57b8ff8548c3e4947f119551253f4e3787a7bbc0b6b3ca47498a9d3", size = 2006508, upload-time = "2025-04-23T18:32:39.637Z" }, - { url = "https://files.pythonhosted.org/packages/2b/cc/f77e8e242171d2158309f830f7d5d07e0531b756106f36bc18712dc439df/pydantic_core-2.33.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c20c462aa4434b33a2661701b861604913f912254e441ab8d78d30485736115a", size = 2113693, upload-time = "2025-04-23T18:32:41.818Z" }, - { url = "https://files.pythonhosted.org/packages/54/7a/7be6a7bd43e0a47c147ba7fbf124fe8aaf1200bc587da925509641113b2d/pydantic_core-2.33.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:44857c3227d3fb5e753d5fe4a3420d6376fa594b07b621e220cd93703fe21782", size = 2074224, 
upload-time = "2025-04-23T18:32:44.033Z" }, - { url = "https://files.pythonhosted.org/packages/2a/07/31cf8fadffbb03be1cb520850e00a8490c0927ec456e8293cafda0726184/pydantic_core-2.33.2-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:eb9b459ca4df0e5c87deb59d37377461a538852765293f9e6ee834f0435a93b9", size = 2245403, upload-time = "2025-04-23T18:32:45.836Z" }, - { url = "https://files.pythonhosted.org/packages/b6/8d/bbaf4c6721b668d44f01861f297eb01c9b35f612f6b8e14173cb204e6240/pydantic_core-2.33.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9fcd347d2cc5c23b06de6d3b7b8275be558a0c90549495c699e379a80bf8379e", size = 2242331, upload-time = "2025-04-23T18:32:47.618Z" }, - { url = "https://files.pythonhosted.org/packages/bb/93/3cc157026bca8f5006250e74515119fcaa6d6858aceee8f67ab6dc548c16/pydantic_core-2.33.2-cp39-cp39-win32.whl", hash = "sha256:83aa99b1285bc8f038941ddf598501a86f1536789740991d7d8756e34f1e74d9", size = 1910571, upload-time = "2025-04-23T18:32:49.401Z" }, - { url = "https://files.pythonhosted.org/packages/5b/90/7edc3b2a0d9f0dda8806c04e511a67b0b7a41d2187e2003673a996fb4310/pydantic_core-2.33.2-cp39-cp39-win_amd64.whl", hash = "sha256:f481959862f57f29601ccced557cc2e817bce7533ab8e01a797a48b49c9692b3", size = 1956504, upload-time = "2025-04-23T18:32:51.287Z" }, { url = "https://files.pythonhosted.org/packages/30/68/373d55e58b7e83ce371691f6eaa7175e3a24b956c44628eb25d7da007917/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa", size = 2023982, upload-time = "2025-04-23T18:32:53.14Z" }, { url = "https://files.pythonhosted.org/packages/a4/16/145f54ac08c96a63d8ed6442f9dec17b2773d19920b627b18d4f10a061ea/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29", size = 1858412, upload-time = "2025-04-23T18:32:55.52Z" }, { url = "https://files.pythonhosted.org/packages/41/b1/c6dc6c3e2de4516c0bb2c46f6a373b91b5660312342a0cf5826e38ad82fa/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d", size = 1892749, upload-time = "2025-04-23T18:32:57.546Z" }, @@ -1693,15 +1288,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b8/e9/1f7efbe20d0b2b10f6718944b5d8ece9152390904f29a78e68d4e7961159/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf", size = 2239013, upload-time = "2025-04-23T18:33:26.621Z" }, { url = "https://files.pythonhosted.org/packages/3c/b2/5309c905a93811524a49b4e031e9851a6b00ff0fb668794472ea7746b448/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb", size = 2238715, upload-time = "2025-04-23T18:33:28.656Z" }, { url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" }, - { url = "https://files.pythonhosted.org/packages/08/98/dbf3fdfabaf81cda5622154fda78ea9965ac467e3239078e0dcd6df159e7/pydantic_core-2.33.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:87acbfcf8e90ca885206e98359d7dca4bcbb35abdc0ff66672a293e1d7a19101", size = 
2024034, upload-time = "2025-04-23T18:33:32.843Z" }, - { url = "https://files.pythonhosted.org/packages/8d/99/7810aa9256e7f2ccd492590f86b79d370df1e9292f1f80b000b6a75bd2fb/pydantic_core-2.33.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:7f92c15cd1e97d4b12acd1cc9004fa092578acfa57b67ad5e43a197175d01a64", size = 1858578, upload-time = "2025-04-23T18:33:34.912Z" }, - { url = "https://files.pythonhosted.org/packages/d8/60/bc06fa9027c7006cc6dd21e48dbf39076dc39d9abbaf718a1604973a9670/pydantic_core-2.33.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d3f26877a748dc4251cfcfda9dfb5f13fcb034f5308388066bcfe9031b63ae7d", size = 1892858, upload-time = "2025-04-23T18:33:36.933Z" }, - { url = "https://files.pythonhosted.org/packages/f2/40/9d03997d9518816c68b4dfccb88969756b9146031b61cd37f781c74c9b6a/pydantic_core-2.33.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dac89aea9af8cd672fa7b510e7b8c33b0bba9a43186680550ccf23020f32d535", size = 2068498, upload-time = "2025-04-23T18:33:38.997Z" }, - { url = "https://files.pythonhosted.org/packages/d8/62/d490198d05d2d86672dc269f52579cad7261ced64c2df213d5c16e0aecb1/pydantic_core-2.33.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:970919794d126ba8645f3837ab6046fb4e72bbc057b3709144066204c19a455d", size = 2108428, upload-time = "2025-04-23T18:33:41.18Z" }, - { url = "https://files.pythonhosted.org/packages/9a/ec/4cd215534fd10b8549015f12ea650a1a973da20ce46430b68fc3185573e8/pydantic_core-2.33.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:3eb3fe62804e8f859c49ed20a8451342de53ed764150cb14ca71357c765dc2a6", size = 2069854, upload-time = "2025-04-23T18:33:43.446Z" }, - { url = "https://files.pythonhosted.org/packages/1a/1a/abbd63d47e1d9b0d632fee6bb15785d0889c8a6e0a6c3b5a8e28ac1ec5d2/pydantic_core-2.33.2-pp39-pypy39_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:3abcd9392a36025e3bd55f9bd38d908bd17962cc49bc6da8e7e96285336e2bca", size = 2237859, upload-time = "2025-04-23T18:33:45.56Z" }, - { url = "https://files.pythonhosted.org/packages/80/1c/fa883643429908b1c90598fd2642af8839efd1d835b65af1f75fba4d94fe/pydantic_core-2.33.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:3a1c81334778f9e3af2f8aeb7a960736e5cab1dfebfb26aabca09afd2906c039", size = 2239059, upload-time = "2025-04-23T18:33:47.735Z" }, - { url = "https://files.pythonhosted.org/packages/d4/29/3cade8a924a61f60ccfa10842f75eb12787e1440e2b8660ceffeb26685e7/pydantic_core-2.33.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2807668ba86cb38c6817ad9bc66215ab8584d1d304030ce4f0887336f28a5e27", size = 2066661, upload-time = "2025-04-23T18:33:49.995Z" }, ] [[package]] @@ -1761,7 +1347,6 @@ dependencies = [ { name = "platformdirs" }, { name = "tomli", marker = "python_full_version < '3.11'" }, { name = "tomlkit" }, - { name = "typing-extensions", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/1c/e4/83e487d3ddd64ab27749b66137b26dc0c5b5c161be680e6beffdc99070b3/pylint-3.3.7.tar.gz", hash = "sha256:2b11de8bde49f9c5059452e0c310c079c746a0a8eeaa789e5aa966ecc23e4559", size = 1520709, upload-time = "2025-05-04T17:07:51.089Z" } wheels = [ @@ -1891,15 +1476,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c9/1f/4f998c900485e5c0ef43838363ba4a9723ac0ad73a9dc42068b12aaba4e4/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", size = 756611, upload-time = 
"2024-08-06T20:32:38.898Z" }, { url = "https://files.pythonhosted.org/packages/df/d1/f5a275fdb252768b7a11ec63585bc38d0e87c9e05668a139fea92b80634c/PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", size = 140591, upload-time = "2024-08-06T20:32:40.241Z" }, { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338, upload-time = "2024-08-06T20:32:41.93Z" }, - { url = "https://files.pythonhosted.org/packages/65/d8/b7a1db13636d7fb7d4ff431593c510c8b8fca920ade06ca8ef20015493c5/PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d", size = 184777, upload-time = "2024-08-06T20:33:25.896Z" }, - { url = "https://files.pythonhosted.org/packages/0a/02/6ec546cd45143fdf9840b2c6be8d875116a64076218b61d68e12548e5839/PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f", size = 172318, upload-time = "2024-08-06T20:33:27.212Z" }, - { url = "https://files.pythonhosted.org/packages/0e/9a/8cc68be846c972bda34f6c2a93abb644fb2476f4dcc924d52175786932c9/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290", size = 720891, upload-time = "2024-08-06T20:33:28.974Z" }, - { url = "https://files.pythonhosted.org/packages/e9/6c/6e1b7f40181bc4805e2e07f4abc10a88ce4648e7e95ff1abe4ae4014a9b2/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12", size = 722614, upload-time = "2024-08-06T20:33:34.157Z" }, - { url = "https://files.pythonhosted.org/packages/3d/32/e7bd8535d22ea2874cef6a81021ba019474ace0d13a4819c2a4bce79bd6a/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19", size = 737360, upload-time = "2024-08-06T20:33:35.84Z" }, - { url = "https://files.pythonhosted.org/packages/d7/12/7322c1e30b9be969670b672573d45479edef72c9a0deac3bb2868f5d7469/PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e", size = 699006, upload-time = "2024-08-06T20:33:37.501Z" }, - { url = "https://files.pythonhosted.org/packages/82/72/04fcad41ca56491995076630c3ec1e834be241664c0c09a64c9a2589b507/PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725", size = 723577, upload-time = "2024-08-06T20:33:39.389Z" }, - { url = "https://files.pythonhosted.org/packages/ed/5e/46168b1f2757f1fcd442bc3029cd8767d88a98c9c05770d8b420948743bb/PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631", size = 144593, upload-time = "2024-08-06T20:33:46.63Z" }, - { url = "https://files.pythonhosted.org/packages/19/87/5124b1c1f2412bb95c59ec481eaf936cd32f0fe2a7b16b97b81c4c017a6a/PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8", size = 162312, upload-time = "2024-08-06T20:33:49.073Z" }, ] [[package]] @@ -1907,10 +1483,8 @@ name = "randomgen" version = "1.26.0" source = { registry = 
"https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*' and sys_platform != 'darwin'", - "python_full_version == '3.10.*' and sys_platform == 'darwin'", - "python_full_version < '3.10' and sys_platform != 'darwin'", - "python_full_version < '3.10' and sys_platform == 'darwin'", + "python_full_version < '3.11' and sys_platform != 'darwin'", + "python_full_version < '3.11' and sys_platform == 'darwin'", ] dependencies = [ { name = "numpy", marker = "python_full_version < '3.11'" }, @@ -1935,11 +1509,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/53/c1/ce533d1ac0aa8087b960f1a875d55e7a5a89f3ab45b32982f0e2ec135ef2/randomgen-1.26.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d55f1f0788fc632ef7cbeb3eab32f8521b9583cb3db0a08c121b4a9f32f45e08", size = 3456869, upload-time = "2023-09-26T08:12:30.317Z" }, { url = "https://files.pythonhosted.org/packages/f6/2a/fdc884dfeb1d7edccb832fcb4a94a5e73dace2b9391b3f256f1ecd0bafa1/randomgen-1.26.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e857a7ddc4eaf5caad7b7aa31a8efb63b8b787de9b0b73ab9aeb2817558cf4aa", size = 3481319, upload-time = "2023-09-26T08:12:32.729Z" }, { url = "https://files.pythonhosted.org/packages/6c/1d/15a6f569c6746e7c5e18925684f4ebae35ee967cf82bb02cd3233e2645b1/randomgen-1.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:3323b8c6bf738200ddaccdcebef06afab33c0c4115fb2b4cb64d3d78b16df8b2", size = 6472018, upload-time = "2023-09-26T07:18:21.137Z" }, - { url = "https://files.pythonhosted.org/packages/61/68/9faedbdefe33f464b32caed2dcab317f6c3f95067328e820b82c3f8a225b/randomgen-1.26.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa222b214ecd377d4f672bfb328fdfcd858b25c71af3457e3c26da27857e237e", size = 3356223, upload-time = "2023-09-21T08:50:34.539Z" }, - { url = "https://files.pythonhosted.org/packages/eb/ab/5e034098a0cd5b467ca6aab8b6ce52b627ddeea9c713884765303dd8fc4f/randomgen-1.26.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0b49a240914722344494ad5436aa7c570b1e5a59e8db020c7258828a5a32979f", size = 3217322, upload-time = "2023-09-21T08:50:37.501Z" }, - { url = "https://files.pythonhosted.org/packages/60/b8/6882310c07ea69c19f88aa92cd341d1bea0543cbaf51321e9c06603dbad6/randomgen-1.26.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e7bbeef5b41a1770957f6889b32c8dfdd51acd3b987fb7fddd1cb68852304c9f", size = 3410970, upload-time = "2023-09-21T09:23:35.709Z" }, - { url = "https://files.pythonhosted.org/packages/fe/8c/4f397c90cfc1a560ccd9d90076c64d869bf07c228145464bceccec8d1b27/randomgen-1.26.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:16759e6fdf5c0c1a04ae528af9f54b2863a5fd92c16baf9a6bf4d14b6b984b59", size = 3533011, upload-time = "2023-09-21T09:23:37.817Z" }, - { url = "https://files.pythonhosted.org/packages/41/30/f18b82985e21cf3d5e3d44471bbcde81a98af90fe5fa2fd6947ec7268819/randomgen-1.26.0-cp39-cp39-win_amd64.whl", hash = "sha256:62bc2f397f8e9e5380400937f878cd92c3f773beb91491daccc266e867d6d1e0", size = 6494569, upload-time = "2023-09-21T08:43:39.809Z" }, ] [[package]] @@ -1975,11 +1544,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/d9/b514140f3d0f8c31e1c40145feb1cd14e3dfb15ab475af8449b70a3b7caf/randomgen-1.26.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8996f3bfd5a55e7e03bfb13c8d1d1b572a32f01fab5c83f333aedcd7e993f1c2", size = 3453844, upload-time = "2024-01-05T14:40:17.186Z" }, { url = 
"https://files.pythonhosted.org/packages/0d/d5/ad01bdc62cdf4b72f89e3e313209ffbf1ba2e112b4bb4c0b7d05cb898b76/randomgen-1.26.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:77457fae1ba660d59e20ab6a0794aef8df07a96b1f66969ae06de7ec8a154c24", size = 3481966, upload-time = "2024-01-05T14:40:19.711Z" }, { url = "https://files.pythonhosted.org/packages/39/a2/7f66047953e89e9bf4fb2891c242f6a1732aa686f01ddffd6030ef76663e/randomgen-1.26.1-cp312-cp312-win_amd64.whl", hash = "sha256:481af861f913cbb83f7f95b9ee2ec9aafe3b7cca763be54ec0d228cb0f68c3e3", size = 6472926, upload-time = "2024-01-05T14:06:53.61Z" }, - { url = "https://files.pythonhosted.org/packages/61/09/b315e18acd549f71a9c42f5d2caeecadc571dd8211dfbf3e28a73277af31/randomgen-1.26.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a0ab718f7f8b6f276d1aa83d4620d3cd9af0389c80e89e48ab77175d71645f16", size = 3346123, upload-time = "2024-01-05T14:12:17.077Z" }, - { url = "https://files.pythonhosted.org/packages/35/3f/53aab0f237e00e905380d42a73fcc8e9111cf5c664086427b440a698f942/randomgen-1.26.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6c133886ac3e4eef320f7ae7b87b45b62d3976f131af0e75e0ac93817d339ea2", size = 3211059, upload-time = "2024-01-05T14:12:18.643Z" }, - { url = "https://files.pythonhosted.org/packages/53/aa/70e8b75842ab58513614b691826f98cafcf2eb5a2201ece9929eb7b48d0a/randomgen-1.26.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e5cb83bceae5fdf685e14506eeab59ed8df777f4fdf2aa5a98f4bfb4f7162b", size = 3407113, upload-time = "2024-01-05T14:36:03.194Z" }, - { url = "https://files.pythonhosted.org/packages/7e/b3/b6fad296a6609a86a564f76ae3333a5741ce34203cf21a631b309d84847f/randomgen-1.26.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06ae8f1fb5693025e63bef8ab58dbebfd2a231ab6d08cc9842e67cc42e5b81e2", size = 3524846, upload-time = "2024-01-05T14:36:05.873Z" }, - { url = "https://files.pythonhosted.org/packages/1d/48/2834556351459d3edc93ae4f7fad346d49a72554b9d1b2d0b16e47416dde/randomgen-1.26.1-cp39-cp39-win_amd64.whl", hash = "sha256:47ba579b4112275a07c74932fbb068f3eaea6ae592f6f8c8bbc091b3444a94c3", size = 6490683, upload-time = "2024-01-05T14:09:40.805Z" }, ] [[package]] @@ -2011,55 +1575,16 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0d/9b/63f4c7ebc259242c89b3acafdb37b41d1185c07ff0011164674e9076b491/rich-14.0.0-py3-none-any.whl", hash = "sha256:1c9491e1951aac09caffd42f448ee3d04e58923ffe14993f6e83068dc395d7e0", size = 243229, upload-time = "2025-03-30T14:15:12.283Z" }, ] -[[package]] -name = "scipy" -version = "1.13.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10' and sys_platform != 'darwin'", - "python_full_version < '3.10' and sys_platform == 'darwin'", -] -dependencies = [ - { name = "numpy", marker = "python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ae/00/48c2f661e2816ccf2ecd77982f6605b2950afe60f60a52b4cbbc2504aa8f/scipy-1.13.1.tar.gz", hash = "sha256:095a87a0312b08dfd6a6155cbbd310a8c51800fc931b8c0b84003014b874ed3c", size = 57210720, upload-time = "2024-05-23T03:29:26.079Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/33/59/41b2529908c002ade869623b87eecff3e11e3ce62e996d0bdcb536984187/scipy-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:20335853b85e9a49ff7572ab453794298bcf0354d8068c5f6775a0eabf350aca", size = 39328076, upload-time = "2024-05-23T03:19:01.687Z" }, - { url = 
"https://files.pythonhosted.org/packages/d5/33/f1307601f492f764062ce7dd471a14750f3360e33cd0f8c614dae208492c/scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d605e9c23906d1994f55ace80e0125c587f96c020037ea6aa98d01b4bd2e222f", size = 30306232, upload-time = "2024-05-23T03:19:09.089Z" }, - { url = "https://files.pythonhosted.org/packages/c0/66/9cd4f501dd5ea03e4a4572ecd874936d0da296bd04d1c45ae1a4a75d9c3a/scipy-1.13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfa31f1def5c819b19ecc3a8b52d28ffdcc7ed52bb20c9a7589669dd3c250989", size = 33743202, upload-time = "2024-05-23T03:19:15.138Z" }, - { url = "https://files.pythonhosted.org/packages/a3/ba/7255e5dc82a65adbe83771c72f384d99c43063648456796436c9a5585ec3/scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26264b282b9da0952a024ae34710c2aff7d27480ee91a2e82b7b7073c24722f", size = 38577335, upload-time = "2024-05-23T03:19:21.984Z" }, - { url = "https://files.pythonhosted.org/packages/49/a5/bb9ded8326e9f0cdfdc412eeda1054b914dfea952bda2097d174f8832cc0/scipy-1.13.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:eccfa1906eacc02de42d70ef4aecea45415f5be17e72b61bafcfd329bdc52e94", size = 38820728, upload-time = "2024-05-23T03:19:28.225Z" }, - { url = "https://files.pythonhosted.org/packages/12/30/df7a8fcc08f9b4a83f5f27cfaaa7d43f9a2d2ad0b6562cced433e5b04e31/scipy-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:2831f0dc9c5ea9edd6e51e6e769b655f08ec6db6e2e10f86ef39bd32eb11da54", size = 46210588, upload-time = "2024-05-23T03:19:35.661Z" }, - { url = "https://files.pythonhosted.org/packages/b4/15/4a4bb1b15bbd2cd2786c4f46e76b871b28799b67891f23f455323a0cdcfb/scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:27e52b09c0d3a1d5b63e1105f24177e544a222b43611aaf5bc44d4a0979e32f9", size = 39333805, upload-time = "2024-05-23T03:19:43.081Z" }, - { url = "https://files.pythonhosted.org/packages/ba/92/42476de1af309c27710004f5cdebc27bec62c204db42e05b23a302cb0c9a/scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:54f430b00f0133e2224c3ba42b805bfd0086fe488835effa33fa291561932326", size = 30317687, upload-time = "2024-05-23T03:19:48.799Z" }, - { url = "https://files.pythonhosted.org/packages/80/ba/8be64fe225360a4beb6840f3cbee494c107c0887f33350d0a47d55400b01/scipy-1.13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89369d27f9e7b0884ae559a3a956e77c02114cc60a6058b4e5011572eea9299", size = 33694638, upload-time = "2024-05-23T03:19:55.104Z" }, - { url = "https://files.pythonhosted.org/packages/36/07/035d22ff9795129c5a847c64cb43c1fa9188826b59344fee28a3ab02e283/scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a78b4b3345f1b6f68a763c6e25c0c9a23a9fd0f39f5f3d200efe8feda560a5fa", size = 38569931, upload-time = "2024-05-23T03:20:01.82Z" }, - { url = "https://files.pythonhosted.org/packages/d9/10/f9b43de37e5ed91facc0cfff31d45ed0104f359e4f9a68416cbf4e790241/scipy-1.13.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:45484bee6d65633752c490404513b9ef02475b4284c4cfab0ef946def50b3f59", size = 38838145, upload-time = "2024-05-23T03:20:09.173Z" }, - { url = "https://files.pythonhosted.org/packages/4a/48/4513a1a5623a23e95f94abd675ed91cfb19989c58e9f6f7d03990f6caf3d/scipy-1.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:5713f62f781eebd8d597eb3f88b8bf9274e79eeabf63afb4a737abc6c84ad37b", size = 46196227, upload-time = "2024-05-23T03:20:16.433Z" }, - { url = 
"https://files.pythonhosted.org/packages/f2/7b/fb6b46fbee30fc7051913068758414f2721003a89dd9a707ad49174e3843/scipy-1.13.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5d72782f39716b2b3509cd7c33cdc08c96f2f4d2b06d51e52fb45a19ca0c86a1", size = 39357301, upload-time = "2024-05-23T03:20:23.538Z" }, - { url = "https://files.pythonhosted.org/packages/dc/5a/2043a3bde1443d94014aaa41e0b50c39d046dda8360abd3b2a1d3f79907d/scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:017367484ce5498445aade74b1d5ab377acdc65e27095155e448c88497755a5d", size = 30363348, upload-time = "2024-05-23T03:20:29.885Z" }, - { url = "https://files.pythonhosted.org/packages/e7/cb/26e4a47364bbfdb3b7fb3363be6d8a1c543bcd70a7753ab397350f5f189a/scipy-1.13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:949ae67db5fa78a86e8fa644b9a6b07252f449dcf74247108c50e1d20d2b4627", size = 33406062, upload-time = "2024-05-23T03:20:36.012Z" }, - { url = "https://files.pythonhosted.org/packages/88/ab/6ecdc526d509d33814835447bbbeedbebdec7cca46ef495a61b00a35b4bf/scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de3ade0e53bc1f21358aa74ff4830235d716211d7d077e340c7349bc3542e884", size = 38218311, upload-time = "2024-05-23T03:20:42.086Z" }, - { url = "https://files.pythonhosted.org/packages/0b/00/9f54554f0f8318100a71515122d8f4f503b1a2c4b4cfab3b4b68c0eb08fa/scipy-1.13.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2ac65fb503dad64218c228e2dc2d0a0193f7904747db43014645ae139c8fad16", size = 38442493, upload-time = "2024-05-23T03:20:48.292Z" }, - { url = "https://files.pythonhosted.org/packages/3e/df/963384e90733e08eac978cd103c34df181d1fec424de383cdc443f418dd4/scipy-1.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:cdd7dacfb95fea358916410ec61bbc20440f7860333aee6d882bb8046264e949", size = 45910955, upload-time = "2024-05-23T03:20:55.091Z" }, - { url = "https://files.pythonhosted.org/packages/7f/29/c2ea58c9731b9ecb30b6738113a95d147e83922986b34c685b8f6eefde21/scipy-1.13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:436bbb42a94a8aeef855d755ce5a465479c721e9d684de76bf61a62e7c2b81d5", size = 39352927, upload-time = "2024-05-23T03:21:01.95Z" }, - { url = "https://files.pythonhosted.org/packages/5c/c0/e71b94b20ccf9effb38d7147c0064c08c622309fd487b1b677771a97d18c/scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:8335549ebbca860c52bf3d02f80784e91a004b71b059e3eea9678ba994796a24", size = 30324538, upload-time = "2024-05-23T03:21:07.634Z" }, - { url = "https://files.pythonhosted.org/packages/6d/0f/aaa55b06d474817cea311e7b10aab2ea1fd5d43bc6a2861ccc9caec9f418/scipy-1.13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d533654b7d221a6a97304ab63c41c96473ff04459e404b83275b60aa8f4b7004", size = 33732190, upload-time = "2024-05-23T03:21:14.41Z" }, - { url = "https://files.pythonhosted.org/packages/35/f5/d0ad1a96f80962ba65e2ce1de6a1e59edecd1f0a7b55990ed208848012e0/scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637e98dcf185ba7f8e663e122ebf908c4702420477ae52a04f9908707456ba4d", size = 38612244, upload-time = "2024-05-23T03:21:21.827Z" }, - { url = "https://files.pythonhosted.org/packages/8d/02/1165905f14962174e6569076bcc3315809ae1291ed14de6448cc151eedfd/scipy-1.13.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a014c2b3697bde71724244f63de2476925596c24285c7a637364761f8710891c", size = 38845637, upload-time = "2024-05-23T03:21:28.729Z" }, - { url = 
"https://files.pythonhosted.org/packages/3e/77/dab54fe647a08ee4253963bcd8f9cf17509c8ca64d6335141422fe2e2114/scipy-1.13.1-cp39-cp39-win_amd64.whl", hash = "sha256:392e4ec766654852c25ebad4f64e4e584cf19820b980bc04960bca0b0cd6eaa2", size = 46227440, upload-time = "2024-05-23T03:21:35.888Z" }, -] - [[package]] name = "scipy" version = "1.15.3" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*' and sys_platform != 'darwin'", - "python_full_version == '3.10.*' and sys_platform == 'darwin'", + "python_full_version < '3.11' and sys_platform != 'darwin'", + "python_full_version < '3.11' and sys_platform == 'darwin'", ] dependencies = [ - { name = "numpy", marker = "python_full_version == '3.10.*'" }, + { name = "numpy", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0f/37/6964b830433e654ec7485e45a00fc9a27cf868d622838f6b6d9c5ec0d532/scipy-1.15.3.tar.gz", hash = "sha256:eae3cf522bc7df64b42cad3925c876e1b0b6c35c1337c93e12c0f366f55b0eaf", size = 59419214, upload-time = "2025-05-08T16:13:05.955Z" } wheels = [ @@ -2132,8 +1657,7 @@ name = "seaborn" version = "0.13.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "matplotlib", version = "3.9.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "matplotlib", version = "3.10.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "matplotlib" }, { name = "numpy" }, { name = "pandas", version = "1.5.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" }, { name = "pandas", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" }, @@ -2207,7 +1731,6 @@ dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, { name = "docutils" }, { name = "imagesize" }, - { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, { name = "jinja2" }, { name = "packaging" }, { name = "pygments" }, @@ -2235,7 +1758,6 @@ dependencies = [ { name = "jinja2" }, { name = "pyyaml" }, { name = "sphinx" }, - { name = "stdlib-list", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/7f/a8/22b379a2a75ccb881217d3d4ae56d7d35f2d1bb4c8c0c51d0253676746a1/sphinx_autoapi-3.6.0.tar.gz", hash = "sha256:c685f274e41d0842ae7e199460c322c4bd7fec816ccc2da8d806094b4f64af06", size = 55417, upload-time = "2025-02-18T01:50:55.241Z" } wheels = [ @@ -2317,7 +1839,6 @@ version = "2.6.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "docutils" }, - { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, { name = "pybtex" }, { name = "pybtex-docutils" }, { name = "sphinx" }, @@ -2385,15 +1906,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072, upload-time = "2024-07-29T01:10:08.203Z" }, ] -[[package]] -name = "stdlib-list" -version = "0.11.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5d/09/8d5c564931ae23bef17420a6c72618463a59222ca4291a7dd88de8a0d490/stdlib_list-0.11.1.tar.gz", hash = 
"sha256:95ebd1d73da9333bba03ccc097f5bac05e3aa03e6822a0c0290f87e1047f1857", size = 60442, upload-time = "2025-02-18T15:39:38.769Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/88/c7/4102536de33c19d090ed2b04e90e7452e2e3dc653cf3323208034eaaca27/stdlib_list-0.11.1-py3-none-any.whl", hash = "sha256:9029ea5e3dfde8cd4294cfd4d1797be56a67fc4693c606181730148c3fd1da29", size = 83620, upload-time = "2025-02-18T15:39:37.02Z" }, -] - [[package]] name = "sympy" version = "1.12.1" @@ -2454,8 +1966,7 @@ docs = [ { name = "sphinxcontrib-images" }, ] docs-examples = [ - { name = "matplotlib", version = "3.9.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "matplotlib", version = "3.10.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "matplotlib" }, { name = "seaborn" }, ] isort = [ @@ -2546,8 +2057,7 @@ dependencies = [ { name = "pytest" }, { name = "randomgen", version = "1.26.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "randomgen", version = "1.26.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "scipy", version = "1.13.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scipy", version = "1.16.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "setuptools" }, { name = "sympy" }, @@ -2564,9 +2074,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/ce/24e78350d598b738a79d6a626bfcaebcbafa63fca8ffb128c16186372389/tmlt_core-0.18.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:99578c1aad56a03a3cfaba019cc90de797571e18e37fc8de4711e8d44edc37ce", size = 8051360, upload-time = "2025-04-02T01:22:21.403Z" }, { url = "https://files.pythonhosted.org/packages/62/9f/82be8e0b61e23eb69c0d418c068ea82b73cd0bfa3df15a19e588dc4f4c9e/tmlt_core-0.18.2-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:764c1467d26419e415e2fa4c2aafbe72626d385b42e0ee486d2039b9a400c8db", size = 9712790, upload-time = "2025-04-02T01:22:23.012Z" }, { url = "https://files.pythonhosted.org/packages/13/67/e2d95cc47391c314d50c163ff8c06aaf8f01e98fa99dd5e6833808f7d555/tmlt_core-0.18.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f898ddd7dbf4064d0d4c117f001e34d95918a3ec4f982b478b82bb91c39d75f", size = 43275203, upload-time = "2025-04-02T01:22:26.459Z" }, - { url = "https://files.pythonhosted.org/packages/ce/df/c0aa952172da103acdde941c52ef529c87b6da781f3a7388ab0ae40068d9/tmlt_core-0.18.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:9b275520740cc8357cf471d779cefd7f967db87d0662bea57da9684e4dab3388", size = 8051362, upload-time = "2025-04-02T01:22:29.559Z" }, - { url = "https://files.pythonhosted.org/packages/1c/91/f91dfa3297d258e5d02fe2e60b1c283dbc172d5b834e2dab16531e1905b8/tmlt_core-0.18.2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:8c653c4508b62938664b1b467b29fdcdb924e8ef26f627507d0bd4873d0354af", size = 9712788, upload-time = "2025-04-02T01:22:31.428Z" }, - { url = 
"https://files.pythonhosted.org/packages/fb/44/cd151db098a01c6b9e93934889dd6187b9a6137f2ed5204a5771a46a8fc3/tmlt_core-0.18.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db96f578c6998a98ae568925fe582a0f6a6e4a9c2615b9935f583514ba0ecca2", size = 43275202, upload-time = "2025-04-02T01:22:34.55Z" }, ] [[package]] @@ -2652,7 +2159,6 @@ name = "typeguard" version = "4.4.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/c7/68/71c1a15b5f65f40e91b65da23b8224dad41349894535a97f63a52e462196/typeguard-4.4.4.tar.gz", hash = "sha256:3a7fd2dffb705d4d0efaed4306a704c89b9dee850b688f060a8b1615a79e5f74", size = 75203, upload-time = "2025-06-18T09:56:07.624Z" } @@ -2714,7 +2220,6 @@ version = "0.8.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "dunamai" }, - { name = "eval-type-backport", marker = "python_full_version < '3.10'" }, { name = "hatchling" }, { name = "jinja2" }, { name = "pydantic" }, @@ -2738,12 +2243,3 @@ sdist = { url = "https://files.pythonhosted.org/packages/56/2c/444f465fb2c65f40c wheels = [ { url = "https://files.pythonhosted.org/packages/f3/40/b1c265d4b2b62b58576588510fc4d1fe60a86319c8de99fd8e9fec617d2c/virtualenv-20.31.2-py3-none-any.whl", hash = "sha256:36efd0d9650ee985f0cad72065001e66d49a6f24eb44d98980f630686243cf11", size = 6057982, upload-time = "2025-05-08T17:58:21.15Z" }, ] - -[[package]] -name = "zipp" -version = "3.23.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, -] From d763cf611e15c88494c2859c57ee280327dae3b6 Mon Sep 17 00:00:00 2001 From: dasm Date: Fri, 10 Oct 2025 15:41:59 -0700 Subject: [PATCH 05/25] Switch to bar syntax for unions in our docs. (#59) --- doc/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/conf.py b/doc/conf.py index 91fa7f5f..0c9a899e 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -97,6 +97,9 @@ "show-inheritance": True, } +# Autodoc typehints settings +always_use_bars_union = True + # General settings master_doc = "index" exclude_patterns = ["templates"] From 599ae310ce456439190cdf01e8798ae32ef67857 Mon Sep 17 00:00:00 2001 From: Tom Magerlein Date: Tue, 14 Oct 2025 07:32:48 -0400 Subject: [PATCH 06/25] Remove Plausible script from docs (#58) --- doc/_templates/layout.html | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 doc/_templates/layout.html diff --git a/doc/_templates/layout.html b/doc/_templates/layout.html deleted file mode 100644 index d038aa30..00000000 --- a/doc/_templates/layout.html +++ /dev/null @@ -1,11 +0,0 @@ - - -{% extends "!layout.html" %} -{# pydata-sphinx-theme currently only supports Google Analytics. 
- This is a workaround until pydata-sphinx-theme v0.10.0 is released, - which adds Plausible support -#} -{% block extrahead %} - - {{ super() }} -{% endblock %} \ No newline at end of file From b967845749b005e4b3f76370bbdb9949884c44a5 Mon Sep 17 00:00:00 2001 From: Ted Date: Thu, 16 Oct 2025 13:52:48 +0200 Subject: [PATCH 07/25] Remove OutputSchemaVisitor, add schema() as a QueryExpr method (#74) * Removes OutputSchemaVisitor, add schema() as a QueryExpr method * make linters happy * first batch of comments * review comments, mostly splitting validation * actually perform validation. otherwise validation is not performed. * ah it was just because get_bounds has no transformation! doing it this way makes a lot more sense * check the schema for *both* transformations and measurements * lint --------- Co-authored-by: Damien Desfontaines --- src/tmlt/analytics/_query_expr.py | 936 +++++++++++++++++- .../_base_measurement_visitor.py | 31 +- .../_base_transformation_visitor.py | 5 +- .../_query_expr_compiler/_compiler.py | 19 +- .../_measurement_visitor.py | 8 +- .../_output_schema_visitor.py | 933 ----------------- .../session/ids/test_id_col_operations.py | 4 +- test/system/session/rows/test_add_max_rows.py | 11 +- .../test_measurement_visitor.py | 7 +- .../transformation_visitor/test_add_keys.py | 13 +- .../transformation_visitor/test_add_rows.py | 15 +- ...tor.py => test_query_expression_schema.py} | 163 ++- 12 files changed, 1052 insertions(+), 1093 deletions(-) delete mode 100644 src/tmlt/analytics/_query_expr_compiler/_output_schema_visitor.py rename test/unit/{query_expr_compiler/test_output_schema_visitor.py => test_query_expression_schema.py} (94%) diff --git a/src/tmlt/analytics/_query_expr.py b/src/tmlt/analytics/_query_expr.py index 46085b23..dbcee359 100644 --- a/src/tmlt/analytics/_query_expr.py +++ b/src/tmlt/analytics/_query_expr.py @@ -12,19 +12,33 @@ # Copyright Tumult Labs 2025 import datetime +import warnings from abc import ABC, abstractmethod -from dataclasses import dataclass +from collections.abc import Collection +from dataclasses import dataclass, replace from enum import Enum, auto from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from pyspark.sql import DataFrame +from pyspark.sql import DataFrame, SparkSession +from tmlt.core.domains.spark_domains import SparkDataFrameDomain +from tmlt.core.utils.join import domain_after_join from typeguard import check_type from tmlt.analytics import AnalyticsInternalError +from tmlt.analytics._catalog import Catalog, PrivateTable, PublicTable from tmlt.analytics._coerce_spark_schema import coerce_spark_schema_or_fail -from tmlt.analytics._schema import FrozenDict, Schema +from tmlt.analytics._schema import ( + ColumnDescriptor, + ColumnType, + FrozenDict, + Schema, + analytics_to_py_types, + analytics_to_spark_columns_descriptor, + analytics_to_spark_schema, + spark_schema_to_analytics_columns, +) from tmlt.analytics.config import config -from tmlt.analytics.constraints import Constraint +from tmlt.analytics.constraints import Constraint, MaxGroupsPerID, MaxRowsPerGroupPerID from tmlt.analytics.keyset import KeySet from tmlt.analytics.truncation_strategy import TruncationStrategy @@ -169,6 +183,11 @@ class QueryExpr(ABC): returns a relation. 
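    The new schema() method turns schema computation into recursive dispatch on
    the query tree itself, instead of a separate visitor. A self-contained
    sketch of the pattern, using toy stand-ins rather than the real QueryExpr
    and Catalog API:

        from dataclasses import dataclass
        from typing import Dict

        ToyCatalog = Dict[str, Dict[str, str]]

        @dataclass(frozen=True)
        class ToySource:
            name: str

            def schema(self, catalog: ToyCatalog) -> Dict[str, str]:
                # Leaf node: the schema comes straight from the catalog.
                return dict(catalog[self.name])

        @dataclass(frozen=True)
        class ToyRename:
            child: ToySource
            mapper: Dict[str, str]

            def schema(self, catalog: ToyCatalog) -> Dict[str, str]:
                # Inner node: recurse into the child, then transform its schema.
                child_schema = self.child.schema(catalog)
                return {self.mapper.get(c, c): t for c, t in child_schema.items()}

        catalog = {"members": {"id": "INTEGER", "age": "INTEGER"}}
        query = ToyRename(ToySource("members"), {"age": "age_years"})
        assert query.schema(catalog) == {"id": "INTEGER", "age_years": "INTEGER"}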
""" + @abstractmethod + def schema(self, catalog: Catalog) -> Any: + """Returns the schema resulting from evaluating this QueryExpr.""" + raise NotImplementedError() + @abstractmethod def accept(self, visitor: "QueryExprVisitor") -> Any: """Dispatch methods on a visitor based on the QueryExpr type.""" @@ -193,8 +212,23 @@ def __post_init__(self): " (_), and it cannot start with a number, or contain any spaces." ) + def _validate(self, catalog: Catalog): + """Validation checks for this QueryExpr.""" + if self.source_id not in catalog.tables: + raise ValueError(f"Query references nonexistent table '{self.source_id}'") + if not isinstance(catalog.tables[self.source_id], PrivateTable): + raise ValueError( + f"Attempted query on table '{self.source_id}', which is " + "not a private table." + ) + + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + self._validate(catalog) + return catalog.tables[self.source_id].schema + def accept(self, visitor: "QueryExprVisitor") -> Any: - """Visit this QueryExpr with visitor.""" + """Visits this QueryExpr with visitor.""" return visitor.visit_private_source(self) @@ -217,6 +251,30 @@ def __post_init__(self): check_type(self.child, QueryExpr) check_type(self.columns, Optional[Tuple[str, ...]]) + def _validate(self, input_schema: Schema): + """Validation checks for this QueryExpr.""" + if self.columns: + nonexistent_columns = set(self.columns) - set(input_schema) + if nonexistent_columns: + raise ValueError( + f"Nonexistent columns in get_groups query: {nonexistent_columns}" + ) + + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + self._validate(input_schema) + + if self.columns: + return Schema({column: input_schema[column] for column in self.columns}) + return Schema( + { + column: input_schema[column] + for column in input_schema + if column != input_schema.id_column + } + ) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_get_groups(self) @@ -247,6 +305,12 @@ def __post_init__(self): check_type(self.lower_bound_column, str) check_type(self.upper_bound_column, str) + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + _validate_groupby(self, input_schema) + return _schema_for_groupby(self, input_schema) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_get_bounds(self) @@ -279,6 +343,42 @@ def __post_init__(self): ' "" are not allowed' ) + def _validate(self, input_schema: Schema): + """Validation checks for this QueryExpr.""" + nonexistent_columns = set(self.column_mapper) - set(input_schema) + if nonexistent_columns: + raise ValueError( + f"Nonexistent columns in rename query: {nonexistent_columns}" + ) + for old, new in self.column_mapper.items(): + if new in input_schema and new != old: + raise ValueError( + f"Cannot rename '{old}' to '{new}': column '{new}' already exists" + ) + + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + self._validate(input_schema) + + grouping_column = input_schema.grouping_column + if grouping_column in self.column_mapper: + grouping_column = self.column_mapper[grouping_column] + + id_column = 
input_schema.id_column + if id_column in self.column_mapper: + id_column = self.column_mapper[id_column] + + return Schema( + { + self.column_mapper.get(column, column): input_schema[column] + for column in input_schema + }, + grouping_column=grouping_column, + id_column=id_column, + id_space=input_schema.id_space, + ) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_rename(self) @@ -302,6 +402,23 @@ def __post_init__(self): check_type(self.child, QueryExpr) check_type(self.condition, str) + def _validate(self, input_schema: Schema): + """Validation checks for this QueryExpr.""" + spark = SparkSession.builder.getOrCreate() + test_df = spark.createDataFrame( + [], schema=analytics_to_spark_schema(input_schema) + ) + try: + test_df.filter(self.condition) + except Exception as e: + raise ValueError(f"Invalid filter condition '{self.condition}': {e}") from e + + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + self._validate(input_schema) + return input_schema + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_filter(self) @@ -323,6 +440,36 @@ def __post_init__(self): if len(self.columns) != len(set(self.columns)): raise ValueError(f"Column name appears more than once in {self.columns}") + def _validate(self, input_schema: Schema): + """Validation checks for this QueryExpr.""" + grouping_column = input_schema.grouping_column + id_column = input_schema.id_column + if grouping_column is not None and grouping_column not in self.columns: + raise ValueError( + f"Grouping column '{grouping_column}' may not " + "be dropped by select query" + ) + if id_column is not None and id_column not in self.columns: + raise ValueError( + f"ID column '{id_column}' may not be dropped by select query" + ) + nonexistent_columns = set(self.columns) - set(input_schema) + if nonexistent_columns: + raise ValueError( + f"Nonexistent columns in select query: {nonexistent_columns}" + ) + + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + self._validate(input_schema) + return Schema( + {column: input_schema[column] for column in self.columns}, + grouping_column=input_schema.grouping_column, + id_column=input_schema.id_column, + id_space=input_schema.id_space, + ) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_select(self) @@ -354,6 +501,48 @@ def __post_init__(self): if self.schema_new_columns.grouping_column is not None: raise ValueError("Map cannot be be used to create grouping columns") + def _validate(self, input_schema: Schema): + """Validation checks for this QueryExpr.""" + new_columns = self.schema_new_columns.column_descs + if self.augment: + overlapping_columns = set(input_schema.keys()) & set(new_columns.keys()) + if overlapping_columns: + raise ValueError( + "New columns in augmenting map must not overwrite " + "existing columns, but found new columns that " + f"already exist: {', '.join(overlapping_columns)}" + ) + return + if input_schema.grouping_column: + raise ValueError( + "Map must set augment=True to ensure that " + f"grouping column '{input_schema.grouping_column}' is not lost." 
+ ) + if input_schema.id_column: + raise ValueError( + "Map must set augment=True to ensure that " + f"ID column '{input_schema.id_column}' is not lost." + ) + + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + self._validate(input_schema) + new_columns = self.schema_new_columns.column_descs + # Any column created by Map could contain a null value + for name in list(new_columns.keys()): + new_columns[name] = replace(new_columns[name], allow_null=True) + + if self.augment: + return Schema( + {**input_schema, **new_columns}, + grouping_column=input_schema.grouping_column, + id_column=input_schema.id_column, + id_space=input_schema.id_space, + ) + # If augment=False, there is no grouping column nor ID column + return Schema(new_columns) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_map(self) @@ -420,6 +609,66 @@ def __post_init__(self): "columns, grouping flat map can only result in 1 new column" ) + def _validate(self, input_schema): + """Validation checks for this QueryExpr.""" + if self.schema_new_columns.grouping_column is not None: + if input_schema.grouping_column: + raise ValueError( + "Multiple grouping transformations are used in this query. " + "Only one grouping transformation is allowed." + ) + if input_schema.id_column: + raise ValueError( + "Grouping flat map cannot be used on tables with " + "the AddRowsWithID protected change." + ) + + new_columns = self.schema_new_columns.column_descs + if self.augment: + overlapping_columns = set(input_schema.keys()) & set(new_columns.keys()) + if overlapping_columns: + raise ValueError( + "New columns in augmenting map must not overwrite " + "existing columns, but found new columns that " + f"already exist: {', '.join(overlapping_columns)}" + ) + return + if input_schema.grouping_column: + raise ValueError( + "Flat map must set augment=True to ensure that " + f"grouping column '{input_schema.grouping_column}' is not lost." + ) + if input_schema.id_column: + raise ValueError( + "Flat map must set augment=True to ensure that " + f"ID column '{input_schema.id_column}' is not lost." 
+ ) + + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + self._validate(input_schema) + + grouping_column = ( + self.schema_new_columns.grouping_column + if self.schema_new_columns.grouping_column is not None + else input_schema.grouping_column + ) + new_columns = self.schema_new_columns.column_descs + # Any column created by the FlatMap could contain a null value + for name in list(new_columns.keys()): + new_columns[name] = replace(new_columns[name], allow_null=True) + + if self.augment: + return Schema( + {**input_schema, **new_columns}, + grouping_column=grouping_column, + id_column=input_schema.id_column, + id_space=input_schema.id_space, + ) + # If augment=False, there is no grouping column nor ID column + return Schema(new_columns) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_flat_map(self) @@ -470,6 +719,37 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_flat_map_by_id(self) + def _validate(self, input_schema: Schema): + """Validation checks for this QueryExpr.""" + if not input_schema.id_column: + raise ValueError( + "Flat-map-by-ID may only be used on tables with ID columns." + ) + if input_schema.grouping_column: + raise AnalyticsInternalError( + "Encountered table with both an ID column and a grouping column." + ) + if input_schema.id_column in self.schema_new_columns.column_descs: + raise ValueError( + "Flat-map-by-ID mapping function output cannot include ID column." + ) + + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + self._validate(input_schema) + + id_column = input_schema.id_column + new_columns = self.schema_new_columns.column_descs + + for name in list(new_columns.keys()): + new_columns[name] = replace(new_columns[name], allow_null=True) + return Schema( + {id_column: input_schema[id_column], **new_columns}, + id_column=id_column, + id_space=input_schema.id_space, + ) + def __eq__(self, other: object) -> bool: """Returns true iff self == other. @@ -486,6 +766,107 @@ def __eq__(self, other: object) -> bool: ) +def _schema_for_join( + left_schema: Schema, + right_schema: Schema, + join_columns: Optional[Tuple[str, ...]], + join_id_space: Optional[str] = None, + how: str = "inner", +) -> Schema: + """Return the schema resulting from joining two tables. + + It is assumed that if either schema has an ID column, the one from left_schema + should be used, because this is true for both public and private joins. With private + joins, the ID columns must be compatible; this check must happen outside this + function. + + Args: + left_schema: Schema for the left table. + right_schema: Schema for the right table. + join_columns: The set of columns to join on. + join_id_space: The ID space of the resulting join. + how: The type of join to perform. Default is "inner". 
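    The column ordering and suffixing rule this helper implements (join columns
    first, then left-only columns, then right-only columns, with _left/_right
    suffixes on the remaining shared columns) can be sketched with plain column
    lists; this is a simplified model, without the null handling:

        def joined_column_names(left, right, join_columns):
            # Shared non-join columns appear twice, disambiguated by suffixes.
            common = set(left) & set(right)
            names = list(join_columns)
            names += [c + ("_left" if c in common else "")
                      for c in left if c not in join_columns]
            names += [c + ("_right" if c in common else "")
                      for c in right if c not in join_columns]
            return names

        assert joined_column_names(["k", "a", "x"], ["k", "b", "x"], ["k"]) == [
            "k", "a", "x_left", "b", "x_right"
        ]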
+ """ + if left_schema.grouping_column is None: + grouping_column = right_schema.grouping_column + elif right_schema.grouping_column is None: + grouping_column = left_schema.grouping_column + elif left_schema.grouping_column == right_schema.grouping_column: + grouping_column = left_schema.grouping_column + else: + raise ValueError( + "Joining tables which both have grouping columns is only supported " + "if they have the same grouping column" + ) + common_columns = set(left_schema) & set(right_schema) + if join_columns is None and not common_columns: + raise ValueError("Tables have no common columns to join on") + if join_columns is not None and not join_columns: + # This error case should be caught when constructing the query + # expression, so it should never get here. + raise AnalyticsInternalError("Empty list of join columns provided.") + + join_columns = ( + join_columns + if join_columns + else tuple(sorted(common_columns, key=list(left_schema).index)) + ) + + if not set(join_columns) <= common_columns: + raise ValueError("Join columns must be common to both tables") + + for column in join_columns: + if left_schema[column].column_type != right_schema[column].column_type: + raise ValueError( + "Join columns must have identical types on both tables, " + f"but column '{column}' does not: {left_schema[column]} and " + f"{right_schema[column]} are incompatible" + ) + + join_column_schemas = {column: left_schema[column] for column in join_columns} + output_schema = { + **join_column_schemas, + **{ + column + ("_left" if column in common_columns else ""): left_schema[column] + for column in left_schema + if column not in join_columns + }, + **{ + column + + ("_right" if column in common_columns else ""): right_schema[column] + for column in right_schema + if column not in join_columns + }, + } + # Use Core's join utilities for determining whether a column can be null + # TODO: This could potentially be used more in this function + output_domain = domain_after_join( + left_domain=SparkDataFrameDomain( + analytics_to_spark_columns_descriptor(left_schema) + ), + right_domain=SparkDataFrameDomain( + analytics_to_spark_columns_descriptor(right_schema) + ), + on=list(join_columns), + how=how, + nulls_are_equal=True, + ) + for column in output_schema: + col_schema = output_schema[column] + output_schema[column] = ColumnDescriptor( + column_type=col_schema.column_type, + allow_null=output_domain.schema[column].allow_null, + allow_nan=col_schema.allow_nan, + allow_inf=col_schema.allow_inf, + ) + return Schema( + output_schema, + grouping_column=grouping_column, + id_column=left_schema.id_column, + id_space=join_id_space, + ) + + @dataclass(frozen=True) class JoinPrivate(QueryExpr): """Returns the join of two private tables. 
@@ -527,6 +908,46 @@ def __post_init__(self): if len(self.join_columns) != len(set(self.join_columns)): raise ValueError("Join columns must be distinct") + def _validate(self, left_schema: Schema, right_schema: Schema): + """Validation checks for this QueryExpr.""" + if left_schema.id_column != right_schema.id_column: + if left_schema.id_column is None or right_schema.id_column is None: + raise ValueError( + "Private joins can only be performed between two tables " + "with the same type of protected change" + ) + raise ValueError( + "Private joins between tables with the AddRowsWithID " + "protected change are only possible when the ID columns of " + "the two tables have the same name" + ) + if left_schema.id_space != right_schema.id_space: + raise ValueError( + "Private joins between tables with the AddRowsWithID protected change" + " are only possible when both tables are in the same ID space" + ) + + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr. + + The ordering of output columns are: + + 1. The join columns + 2. Columns that are only in the left table + 3. Columns that are only in the right table + 4. Columns that are in both tables, but not included in the join columns. These + columns are included with _left and _right suffixes. + """ + left_schema = self.child.schema(catalog) + right_schema = self.right_operand_expr.schema(catalog) + self._validate(left_schema, right_schema) + return _schema_for_join( + left_schema=left_schema, + right_schema=right_schema, + join_columns=self.join_columns, + join_id_space=left_schema.id_space, + ) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_join_private(self) @@ -568,6 +989,37 @@ def __post_init__(self): f"Invalid join type '{self.how}': must be 'inner' or 'left'" ) + def _validate(self, catalog: Catalog): + """Validation checks for this QueryExpr.""" + if isinstance(self.public_table, str): + if not isinstance(catalog.tables[self.public_table], PublicTable): + raise ValueError( + f"Attempted public join on table '{self.public_table}', " + "which is not a public table" + ) + + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr. + + Has analogous behavior to :meth:`JoinPrivate.schema`, where the private + table is the left table. + """ + input_schema = self.child.schema(catalog) + self._validate(catalog) + if isinstance(self.public_table, str): + right_schema = catalog.tables[self.public_table].schema + else: + right_schema = Schema( + spark_schema_to_analytics_columns(self.public_table.schema) + ) + return _schema_for_join( + left_schema=input_schema, + right_schema=right_schema, + join_columns=self.join_columns, + join_id_space=input_schema.id_space, + how=self.how, + ) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_join_public(self) @@ -671,6 +1123,72 @@ def __post_init__(self): FrozenDict, ) + def _validate(self, input_schema: Schema): + """Validation checks for this QueryExpr.""" + if ( + input_schema.grouping_column + and input_schema.grouping_column in self.replace_with + ): + raise ValueError( + "Cannot replace null values in column " + f"'{input_schema.grouping_column}', as it is a grouping column." 
+ ) + if input_schema.id_column and input_schema.id_column in self.replace_with: + raise ValueError( + f"Cannot replace null values in column '{input_schema.id_column}', " + "as it is an ID column." + ) + if input_schema.id_column and (len(self.replace_with) == 0): + warnings.warn( + f"Replacing null values in the ID column '{input_schema.id_column}' " + "is not allowed, so the ID column may still contain null values.", + RuntimeWarning, + ) + + pytypes = analytics_to_py_types(input_schema) + for col, val in self.replace_with.items(): + if col not in input_schema.keys(): + raise ValueError( + f"Column '{col}' does not exist in this table, " + f"available columns are {list(input_schema.keys())}" + ) + if not isinstance(val, pytypes[col]): + # Using an int as a float is OK + if not (isinstance(val, int) and pytypes[col] == float): + raise ValueError( + f"Column '{col}' cannot have nulls replaced with " + f"{repr(val)}, as that value's type does not match the " + f"column type {input_schema[col].column_type.name}" + ) + + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + self._validate(input_schema) + + columns_to_change = list(dict(self.replace_with).keys()) + if len(columns_to_change) == 0: + columns_to_change = [ + name + for name, cd in input_schema.column_descs.items() + if (cd.allow_null or cd.allow_nan) + and not (name in [input_schema.grouping_column, input_schema.id_column]) + ] + return Schema( + { + name: ColumnDescriptor( + column_type=cd.column_type, + allow_null=(cd.allow_null and not name in columns_to_change), + allow_nan=(cd.allow_nan and not name in columns_to_change), + allow_inf=cd.allow_inf, + ) + for name, cd in input_schema.column_descs.items() + }, + grouping_column=input_schema.grouping_column, + id_column=input_schema.id_column, + id_space=input_schema.id_space, + ) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_replace_null_and_nan(self) @@ -710,6 +1228,65 @@ def __init__( object.__setattr__(self, "replace_with", FrozenDict.from_dict(updated_dict)) object.__setattr__(self, "child", child) + def _validate(self, input_schema: Schema): + """Validation checks for this QueryExpr.""" + if ( + input_schema.grouping_column + and input_schema.grouping_column in self.replace_with + ): + raise ValueError( + "Cannot replace infinite values in column " + f"'{input_schema.grouping_column}', as it is a grouping column" + ) + if input_schema.id_column and input_schema.id_column in self.replace_with: + raise ValueError( + f"Cannot replace infinite values in column '{input_schema.id_column}', " + "as it is an ID column" + ) + + for name in self.replace_with: + if name not in input_schema.keys(): + raise ValueError( + f"Column '{name}' does not exist in this table, " + f"available columns are {list(input_schema.keys())}" + ) + if input_schema[name].column_type != ColumnType.DECIMAL: + raise ValueError( + f"Column '{name}' has a replacement value provided, but it is " + f"of type {input_schema[name].column_type.name} (not " + f"{ColumnType.DECIMAL.name}) and so cannot " + "contain infinite values" + ) + + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + self._validate(input_schema) + + columns_to_change = list(self.replace_with.keys()) + if len(columns_to_change) == 0: + columns_to_change = [ + 
name
+                for name, cd in input_schema.column_descs.items()
+                if cd.column_type == ColumnType.DECIMAL
+                and cd.allow_inf
+                and not (name in [input_schema.grouping_column, input_schema.id_column])
+            ]
+        return Schema(
+            {
+                name: ColumnDescriptor(
+                    column_type=cd.column_type,
+                    allow_null=cd.allow_null,
+                    allow_nan=cd.allow_nan,
+                    allow_inf=(cd.allow_inf and not name in columns_to_change),
+                )
+                for name, cd in input_schema.column_descs.items()
+            },
+            grouping_column=input_schema.grouping_column,
+            id_column=input_schema.id_column,
+            id_space=input_schema.id_space,
+        )
+
     def accept(self, visitor: "QueryExprVisitor") -> Any:
         """Visit this QueryExpr with visitor."""
         return visitor.visit_replace_infinity(self)
@@ -740,6 +1317,63 @@ def __post_init__(self) -> None:
         check_type(self.child, QueryExpr)
         check_type(self.columns, Tuple[str, ...])
 
+    def _validate(self, input_schema: Schema):
+        """Validation checks for this QueryExpr."""
+        if (
+            input_schema.grouping_column
+            and input_schema.grouping_column in self.columns
+        ):
+            raise ValueError(
+                f"Cannot drop null values in column '{input_schema.grouping_column}', "
+                "as it is a grouping column"
+            )
+        if input_schema.id_column and input_schema.id_column in self.columns:
+            raise ValueError(
+                f"Cannot drop null values in column '{input_schema.id_column}', "
+                "as it is an ID column."
+            )
+        if input_schema.id_column and len(self.columns) == 0:
+            warnings.warn(
+                f"Dropping null values in the ID column '{input_schema.id_column}' "
+                "is not allowed, so the ID column may still contain null values.",
+                RuntimeWarning,
+            )
+        for name in self.columns:
+            if name not in input_schema.keys():
+                raise ValueError(
+                    f"Column '{name}' does not exist in this table, "
+                    f"available columns are {list(input_schema.keys())}"
+                )
+
+    def schema(self, catalog: Catalog) -> Schema:
+        """Returns the schema resulting from evaluating this QueryExpr."""
+        input_schema = self.child.schema(catalog)
+        self._validate(input_schema)
+
+        columns = self.columns
+        if len(columns) == 0:
+            columns = tuple(
+                name
+                for name, cd in input_schema.column_descs.items()
+                if (cd.allow_null or cd.allow_nan)
+                and not name in [input_schema.grouping_column, input_schema.id_column]
+            )
+
+        return Schema(
+            {
+                name: ColumnDescriptor(
+                    column_type=cd.column_type,
+                    allow_null=(cd.allow_null and not name in columns),
+                    allow_nan=(cd.allow_nan and not name in columns),
+                    allow_inf=(cd.allow_inf),
+                )
+                for name, cd in input_schema.column_descs.items()
+            },
+            grouping_column=input_schema.grouping_column,
+            id_column=input_schema.id_column,
+            id_space=input_schema.id_space,
+        )
+
     def accept(self, visitor: "QueryExprVisitor") -> Any:
         """Visit this QueryExpr with visitor."""
         return visitor.visit_drop_null_and_nan(self)
@@ -763,6 +1397,67 @@ def __post_init__(self) -> None:
         check_type(self.child, QueryExpr)
         check_type(self.columns, Tuple[str, ...])
 
+    def _validate(self, input_schema: Schema):
+        """Validation checks for this QueryExpr."""
+        if (
+            input_schema.grouping_column
+            and input_schema.grouping_column in self.columns
+        ):
+            raise ValueError(
+                "Cannot drop infinite values in column "
+                f"'{input_schema.grouping_column}', as it is a grouping column"
+            )
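        # The Replace*/Drop* schema() methods in this file share one pattern:
        # default to every eligible column when none are listed, then clear
        # that column's allow_null/allow_nan/allow_inf flag. A toy version of
        # the flag update, with plain booleans standing in for the
        # ColumnDescriptor fields:
        def toy_clear_flag(allow_flag: bool, name: str, cleaned: tuple) -> bool:
            # A column keeps its flag only if it was not cleaned.
            return allow_flag and name not in cleaned

        assert toy_clear_flag(True, "x", ("x",)) is False  # cleaned column
        assert toy_clear_flag(True, "y", ("x",)) is True   # untouched column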
+        # Float-valued columns cannot be ID columns, but include this to be safe.
+        if input_schema.id_column and input_schema.id_column in self.columns:
+            raise ValueError(
+                f"Cannot drop infinite values in column '{input_schema.id_column}', "
+                "as it is an ID column"
+            )
+        for name in self.columns:
+            if name not in input_schema.keys():
+                raise ValueError(
+                    f"Column '{name}' does not exist in this table, "
+                    f"available columns are {list(input_schema.keys())}"
+                )
+            if input_schema[name].column_type != ColumnType.DECIMAL:
+                raise ValueError(
+                    f"Column '{name}' was given as a column to drop "
+                    "infinite values from, but it is of type "
+                    f"{input_schema[name].column_type.name} (not "
+                    f"{ColumnType.DECIMAL.name}) and so cannot "
+                    "contain infinite values"
+                )
+
+    def schema(self, catalog: Catalog) -> Schema:
+        """Returns the schema resulting from evaluating this QueryExpr."""
+        input_schema = self.child.schema(catalog)
+        self._validate(input_schema)
+
+        columns = self.columns
+        if len(columns) == 0:
+            columns = tuple(
+                name
+                for name, cd in input_schema.column_descs.items()
+                if cd.column_type == ColumnType.DECIMAL
+                and cd.allow_inf
+                and not name in (input_schema.grouping_column, input_schema.id_column)
+            )
+
+        return Schema(
+            {
+                name: ColumnDescriptor(
+                    column_type=cd.column_type,
+                    allow_null=cd.allow_null,
+                    allow_nan=cd.allow_nan,
+                    allow_inf=(cd.allow_inf and not name in columns),
+                )
+                for name, cd in input_schema.column_descs.items()
+            },
+            grouping_column=input_schema.grouping_column,
+            id_column=input_schema.id_column,
+            id_space=input_schema.id_space,
+        )
+
     def accept(self, visitor: "QueryExprVisitor") -> Any:
         """Visit this QueryExpr with visitor."""
         return visitor.visit_drop_infinity(self)
@@ -782,11 +1477,196 @@ class EnforceConstraint(QueryExpr):
         Appropriate values here vary depending on the constraint. These options
         are to support advanced use cases, and generally should not be used."""
 
+    def _validate(self, input_schema: Schema):
+        """Validation checks for this QueryExpr."""
+        if not input_schema.id_column:
+            raise ValueError(
+                f"Constraint {self.constraint} can only be applied to tables"
+                " with the AddRowsWithID protected change"
+            )
+        if isinstance(self.constraint, (MaxGroupsPerID, MaxRowsPerGroupPerID)):
+            grouping_column = self.constraint.grouping_column
+            if grouping_column not in input_schema:
+                raise ValueError(
+                    f"The grouping column of constraint {self.constraint}"
+                    " does not exist in this table; available columns"
+                    f" are: {', '.join(input_schema.keys())}"
+                )
+            if grouping_column == input_schema.id_column:
+                raise ValueError(
+                    f"The grouping column of constraint {self.constraint} cannot be"
+                    " the ID column of the table it is applied to"
+                )
+
+    def schema(self, catalog: Catalog) -> Schema:
+        """Returns the schema resulting from evaluating this QueryExpr."""
+        input_schema = self.child.schema(catalog)
+        self._validate(input_schema)
+        return input_schema
+
     def accept(self, visitor: "QueryExprVisitor") -> Any:
         """Visit this QueryExpr with visitor."""
         return visitor.visit_enforce_constraint(self)
 
 
+def _validate_groupby(
+    query: Union[
+        "GetBounds",
+        "GroupByBoundedAverage",
+        "GroupByBoundedSTDEV",
+        "GroupByBoundedSum",
+        "GroupByBoundedVariance",
+        "GroupByCount",
+        "GroupByCountDistinct",
+        "GroupByQuantile",
+    ],
+    input_schema: Schema,
+):
+    """Validates the arguments of a group-by QueryExpr."""
+    # Validating group-by columns
+    if isinstance(query.groupby_keys, KeySet):
+        # Checks that the KeySet is valid
+        schema = query.groupby_keys.schema()
+        groupby_columns: Collection[str] = schema.keys()
+
+        for column_name, column_desc in schema.items():
+            try:
+                input_column_desc = input_schema[column_name]
+            except KeyError as e:
+                raise KeyError(
+                    f"Groupby column '{column_name}' is not in the input schema."
+                ) from e
+            if column_desc.column_type != input_column_desc.column_type:
+                raise ValueError(
+                    f"Groupby column '{column_name}' has type"
+                    f" '{column_desc.column_type.name}', but the column with the same "
+                    f"name in the input data has type "
+                    f"'{input_column_desc.column_type.name}' instead."
+                )
+    elif isinstance(query.groupby_keys, tuple):
+        # Checks that the listed groupby columns exist in the schema
+        for col in query.groupby_keys:
+            if col not in input_schema:
+                raise ValueError(f"Groupby column '{col}' is not in the input schema.")
+        groupby_columns = query.groupby_keys
+    else:
+        raise AnalyticsInternalError(
+            f"Unexpected groupby_keys type: {type(query.groupby_keys)}."
+        )
+
+    # Validating compatibility between grouping columns and group-by columns
+    grouping_column = input_schema.grouping_column
+    if grouping_column is not None and grouping_column not in groupby_columns:
+        raise ValueError(
+            f"Column '{grouping_column}' produced by grouping transformation "
+            f"is not in groupby columns {list(groupby_columns)}."
+        )
+    if (
+        not isinstance(query, (GroupByCount, GroupByCountDistinct))
+        and query.measure_column in groupby_columns
+    ):
+        raise ValueError(
+            "Column to aggregate must be a non-grouped column, not "
+            f"'{query.measure_column}'."
+        )
+
+    # Validating the measure column
+    if isinstance(
+        query,
+        (
+            GetBounds,
+            GroupByQuantile,
+            GroupByBoundedSum,
+            GroupByBoundedSTDEV,
+            GroupByBoundedAverage,
+            GroupByBoundedVariance,
+        ),
+    ):
+        if query.measure_column not in input_schema:
+            raise ValueError(
+                f"{type(query).__name__} query's measure column "
+                f"'{query.measure_column}' does not exist in the table."
+            )
+        if input_schema[query.measure_column].column_type not in [
+            ColumnType.INTEGER,
+            ColumnType.DECIMAL,
+        ]:
+            raise ValueError(
+                f"{type(query).__name__} query's measure column "
+                f"'{query.measure_column}' has invalid type "
+                f"'{input_schema[query.measure_column].column_type.name}'. "
+                "Expected types: 'INTEGER' or 'DECIMAL'."
+            )
+        if input_schema.id_column and (input_schema.id_column == query.measure_column):
+            raise ValueError(
+                f"{type(query).__name__} query's measure column is the same as the "
+                f"privacy ID column ({input_schema.id_column}) on a table with the "
+                "AddRowsWithID protected change."
+ ) + + +def _schema_for_groupby( + query: Union[ + "GetBounds", + "GroupByBoundedAverage", + "GroupByBoundedSTDEV", + "GroupByBoundedSum", + "GroupByBoundedVariance", + "GroupByCount", + "GroupByCountDistinct", + "GroupByQuantile", + ], + input_schema: Schema, +) -> Schema: + """Returns the schema of a group-by QueryExpr.""" + groupby_columns = ( + query.groupby_keys.schema().keys() + if isinstance(query.groupby_keys, KeySet) + else query.groupby_keys + ) + + # Determining the output column types & names + if isinstance(query, (GroupByCount, GroupByCountDistinct)): + output_column_type = ColumnType.INTEGER + elif isinstance(query, (GetBounds, GroupByBoundedSum)): + output_column_type = input_schema[query.measure_column].column_type + elif isinstance( + query, + ( + GroupByQuantile, + GroupByBoundedSum, + GroupByBoundedSTDEV, + GroupByBoundedAverage, + GroupByBoundedVariance, + ), + ): + output_column_type = ColumnType.DECIMAL + else: + raise AnalyticsInternalError(f"Unexpected QueryExpr type: {type(query)}.") + if isinstance(query, GetBounds): + output_columns = { + query.lower_bound_column: ColumnDescriptor( + output_column_type, allow_null=False + ), + query.upper_bound_column: ColumnDescriptor( + output_column_type, allow_null=False + ), + } + else: + output_columns = { + query.output_column: ColumnDescriptor(output_column_type, allow_null=False), + } + + return Schema( + { + **{column: input_schema[column] for column in groupby_columns}, + **output_columns, + }, + grouping_column=None, + id_column=None, + ) + + @dataclass(frozen=True) class GroupByCount(QueryExpr): """Returns the count of each combination of the groupby domains.""" @@ -813,6 +1693,12 @@ def __post_init__(self): check_type(self.output_column, str) check_type(self.mechanism, CountMechanism) + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + _validate_groupby(self, input_schema) + return _schema_for_groupby(self, input_schema) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_groupby_count(self) @@ -849,6 +1735,12 @@ def __post_init__(self): check_type(self.output_column, str) check_type(self.mechanism, CountDistinctMechanism) + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + _validate_groupby(self, input_schema) + return _schema_for_groupby(self, input_schema) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_groupby_count_distinct(self) @@ -910,6 +1802,12 @@ def __post_init__(self): f"the upper bound '{self.high}'." ) + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + _validate_groupby(self, input_schema) + return _schema_for_groupby(self, input_schema) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_groupby_quantile(self) @@ -971,6 +1869,12 @@ def __post_init__(self): f"the upper bound '{self.high}'." 
) + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + _validate_groupby(self, input_schema) + return _schema_for_groupby(self, input_schema) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_groupby_bounded_sum(self) @@ -1032,6 +1936,12 @@ def __post_init__(self): f"the upper bound '{self.high}'." ) + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + _validate_groupby(self, input_schema) + return _schema_for_groupby(self, input_schema) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_groupby_bounded_average(self) @@ -1093,6 +2003,12 @@ def __post_init__(self): f"the upper bound '{self.high}'." ) + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + _validate_groupby(self, input_schema) + return _schema_for_groupby(self, input_schema) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_groupby_bounded_variance(self) @@ -1155,6 +2071,12 @@ def __post_init__(self): f"the upper bound '{self.high}'." ) + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + input_schema = self.child.schema(catalog) + _validate_groupby(self, input_schema) + return _schema_for_groupby(self, input_schema) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_groupby_bounded_stdev(self) @@ -1187,6 +2109,10 @@ def __post_init__(self) -> None: check_type(self.column, str) check_type(self.threshold, float) + def schema(self, catalog: Catalog) -> Schema: + """Returns the schema resulting from evaluating this QueryExpr.""" + return self.child.schema(catalog) + def accept(self, visitor: "QueryExprVisitor") -> Any: """Visit this QueryExpr with visitor.""" return visitor.visit_suppress_aggregates(self) diff --git a/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py b/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py index 19dee7d3..4849e4bc 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py +++ b/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py @@ -101,9 +101,6 @@ SuppressAggregates, VarianceMechanism, ) -from tmlt.analytics._query_expr_compiler._output_schema_visitor import ( - OutputSchemaVisitor, -) from tmlt.analytics._schema import ColumnType, FrozenDict, Schema from tmlt.analytics._table_identifier import Identifier from tmlt.analytics._table_reference import TableReference @@ -708,7 +705,7 @@ def _pick_noise_for_non_count( GroupByQuantile and GetBounds only supports one noise mechanism, so it is not included here. """ - measure_column_type = query.child.accept(OutputSchemaVisitor(self.catalog))[ + measure_column_type = query.child.schema(self.catalog)[ query.measure_column ].column_type requested_mechanism: NoiseMechanism @@ -802,7 +799,7 @@ def _add_special_value_handling_to_query( These changes are added immediately before the groupby aggregation in the query. 
""" - expected_schema = query.child.accept(OutputSchemaVisitor(self.catalog)) + expected_schema = query.child.schema(self.catalog) # You can't perform these queries on nulls, NaNs, or infinite values # so check for those @@ -1046,7 +1043,7 @@ def visit_groupby_count(self, expr: GroupByCount) -> Tuple[Measurement, NoiseInf self._validate_approxDP_and_adjust_budget(expr) # Peek at the schema, to see if there are errors there - expr.accept(OutputSchemaVisitor(self.catalog)) + expr.schema(self.catalog) if isinstance(expr.groupby_keys, KeySet): groupby_cols = tuple(expr.groupby_keys.dataframe().columns) @@ -1130,7 +1127,7 @@ def visit_groupby_count_distinct( self._validate_approxDP_and_adjust_budget(expr) # Peek at the schema, to see if there are errors there - expr.accept(OutputSchemaVisitor(self.catalog)) + expr.schema(self.catalog) if isinstance(expr.groupby_keys, KeySet): groupby_cols = tuple(expr.groupby_keys.dataframe().columns) @@ -1150,7 +1147,7 @@ def visit_groupby_count_distinct( ) = self._visit_child_transformation(expr.child, mechanism) constrained_query = _generate_constrained_count_distinct( expr, - expr.child.accept(OutputSchemaVisitor(self.catalog)), + expr.child.schema(self.catalog), child_constraints, ) if constrained_query is not None: @@ -1251,7 +1248,7 @@ def visit_groupby_quantile( self._validate_approxDP_and_adjust_budget(expr) # Peek at the schema, to see if there are errors there - expr.accept(OutputSchemaVisitor(self.catalog)) + expr.schema(self.catalog) expr = self._add_special_value_handling_to_query(expr) if isinstance(expr.groupby_keys, KeySet): @@ -1264,7 +1261,7 @@ def visit_groupby_quantile( self.adjusted_budget ) # Peek at the schema, to see if there are errors there - expr.accept(OutputSchemaVisitor(self.catalog)) + expr.schema(self.catalog) child_transformation, child_ref = self._truncate_table( *self._visit_child_transformation(expr.child, self.default_mechanism), @@ -1346,7 +1343,7 @@ def visit_groupby_bounded_sum( self._validate_approxDP_and_adjust_budget(expr) # Peek at the schema, to see if there are errors there - expr.accept(OutputSchemaVisitor(self.catalog)) + expr.schema(self.catalog) expr = self._add_special_value_handling_to_query(expr) if isinstance(expr.groupby_keys, KeySet): @@ -1442,7 +1439,7 @@ def visit_groupby_bounded_average( self._validate_approxDP_and_adjust_budget(expr) # Peek at the schema, to see if there are errors there - expr.accept(OutputSchemaVisitor(self.catalog)) + expr.schema(self.catalog) expr = self._add_special_value_handling_to_query(expr) if isinstance(expr.groupby_keys, KeySet): @@ -1538,7 +1535,7 @@ def visit_groupby_bounded_variance( self._validate_approxDP_and_adjust_budget(expr) # Peek at the schema, to see if there are errors there - expr.accept(OutputSchemaVisitor(self.catalog)) + expr.schema(self.catalog) expr = self._add_special_value_handling_to_query(expr) if isinstance(expr.groupby_keys, KeySet): @@ -1634,7 +1631,7 @@ def visit_groupby_bounded_stdev( self._validate_approxDP_and_adjust_budget(expr) # Peek at the schema, to see if there are errors there - expr.accept(OutputSchemaVisitor(self.catalog)) + expr.schema(self.catalog) expr = self._add_special_value_handling_to_query(expr) if isinstance(expr.groupby_keys, KeySet): @@ -1726,7 +1723,7 @@ def visit_get_bounds(self, expr: GetBounds) -> Tuple[Measurement, NoiseInfo]: self._validate_approxDP_and_adjust_budget(expr) # Peek at the schema, to see if there are errors there - expr.accept(OutputSchemaVisitor(self.catalog)) + expr.schema(self.catalog) expr = 
self._add_special_value_handling_to_query(expr) if isinstance(expr.groupby_keys, KeySet): @@ -1740,7 +1737,7 @@ def visit_get_bounds(self, expr: GetBounds) -> Tuple[Measurement, NoiseInfo]: ) # Peek at the schema, to see if there are errors there - expr.accept(OutputSchemaVisitor(self.catalog)) + expr.schema(self.catalog) child_transformation, child_ref = self._truncate_table( *self._visit_child_transformation(expr.child, NoiseMechanism.GEOMETRIC), @@ -1823,7 +1820,7 @@ def visit_suppress_aggregates( self, expr: SuppressAggregates ) -> Tuple[Measurement, NoiseInfo]: """Create a measurement from a SuppressAggregates query expression.""" - expr.accept(OutputSchemaVisitor(self.catalog)) + expr.schema(self.catalog) child_measurement, noise_info = expr.child.accept(self) if not isinstance(child_measurement, Measurement): diff --git a/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py b/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py index 599530af..1180d04c 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py +++ b/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py @@ -157,9 +157,6 @@ propagate_select, propagate_unmodified, ) -from tmlt.analytics._query_expr_compiler._output_schema_visitor import ( - OutputSchemaVisitor, -) from tmlt.analytics._schema import ( ColumnDescriptor, ColumnType, @@ -245,7 +242,7 @@ def validate_transformation( catalog: Catalog, ): """Ensure that a query's transformation is valid on a given catalog.""" - expected_schema = query.accept(OutputSchemaVisitor(catalog)) + expected_schema = query.schema(catalog) expected_output_domain = SparkDataFrameDomain( analytics_to_spark_columns_descriptor(expected_schema) ) diff --git a/src/tmlt/analytics/_query_expr_compiler/_compiler.py b/src/tmlt/analytics/_query_expr_compiler/_compiler.py index 01d0fffe..37475d60 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_compiler.py +++ b/src/tmlt/analytics/_query_expr_compiler/_compiler.py @@ -22,9 +22,6 @@ from tmlt.analytics._noise_info import NoiseInfo from tmlt.analytics._query_expr import QueryExpr from tmlt.analytics._query_expr_compiler._measurement_visitor import MeasurementVisitor -from tmlt.analytics._query_expr_compiler._output_schema_visitor import ( - OutputSchemaVisitor, -) from tmlt.analytics._query_expr_compiler._transformation_visitor import ( TransformationVisitor, ) @@ -108,13 +105,13 @@ def output_measure(self) -> Union[PureDP, ApproxDP, RhoZCDP]: @staticmethod def query_schema(query: QueryExpr, catalog: Catalog) -> Schema: """Return the schema created by a given query.""" - result = query.accept(OutputSchemaVisitor(catalog=catalog)) - if not isinstance(result, Schema): + schema = query.schema(catalog) + if not isinstance(schema, Schema): raise AnalyticsInternalError( "Schema for this query is not a Schema but is instead a(n) " - f"{type(result)}." + f"{type(schema)}." ) - return result + return schema def __call__( self, @@ -139,6 +136,9 @@ def __call__( catalog: The catalog, used only for query validation. table_constraints: A mapping of tables to the existing constraints on them. """ + # Computing the schema validates that the query is well-formed. + query.schema(catalog) + visitor = MeasurementVisitor( privacy_budget=privacy_budget, stability=stability, @@ -207,7 +207,10 @@ def build_transformation( catalog: The catalog, used only for query validation. table_constraints: A mapping of tables to the existing constraints on them. 
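        A toy demonstration of the fail-fast behavior this gives views (the
        error message mirrors the one raised by PrivateSource validation):

            class ToyBadQuery:
                def schema(self, catalog):
                    # Schema computation raises on malformed queries, for
                    # example references to nonexistent tables.
                    raise ValueError("Query references nonexistent table 'nope'")

            try:
                ToyBadQuery().schema(catalog={})
            except ValueError as err:
                print(f"rejected at view creation: {err}")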
""" - query.accept(OutputSchemaVisitor(catalog)) + # Computing the schema validates that the query is well-formed. It's useful to + # perform this check here in addition to __call__ so validation errors can be + # raised at view creation, not just query evaluation. + query.schema(catalog) transformation_visitor = TransformationVisitor( input_domain=input_domain, diff --git a/src/tmlt/analytics/_query_expr_compiler/_measurement_visitor.py b/src/tmlt/analytics/_query_expr_compiler/_measurement_visitor.py index 5118a1e0..43e6c60e 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_measurement_visitor.py +++ b/src/tmlt/analytics/_query_expr_compiler/_measurement_visitor.py @@ -26,9 +26,6 @@ from tmlt.analytics._query_expr_compiler._base_measurement_visitor import ( BaseMeasurementVisitor, ) -from tmlt.analytics._query_expr_compiler._output_schema_visitor import ( - OutputSchemaVisitor, -) from tmlt.analytics._query_expr_compiler._transformation_visitor import ( TransformationVisitor, ) @@ -75,10 +72,7 @@ def visit_get_groups(self, expr: GetGroups) -> Tuple[Measurement, NoiseInfo]: if not isinstance(self.budget, ApproxDPBudget): raise ValueError("GetGroups is only supported with ApproxDPBudgets.") - # Peek at the schema, to see if there are errors there - expr.accept(OutputSchemaVisitor(self.catalog)) - - schema = expr.child.accept(OutputSchemaVisitor(self.catalog)) + schema = expr.child.schema(self.catalog) # Set the columns if no columns were provided. if expr.columns: diff --git a/src/tmlt/analytics/_query_expr_compiler/_output_schema_visitor.py b/src/tmlt/analytics/_query_expr_compiler/_output_schema_visitor.py deleted file mode 100644 index 51f1ad9d..00000000 --- a/src/tmlt/analytics/_query_expr_compiler/_output_schema_visitor.py +++ /dev/null @@ -1,933 +0,0 @@ -"""Defines a visitor for determining the output schemas of query expressions.""" - -# SPDX-License-Identifier: Apache-2.0 -# Copyright Tumult Labs 2025 - -from collections.abc import Collection -from dataclasses import replace -from typing import Optional, Tuple, Union - -from pyspark.sql import SparkSession -from tmlt.core.domains.spark_domains import SparkDataFrameDomain -from tmlt.core.utils.join import domain_after_join - -from tmlt.analytics import AnalyticsInternalError -from tmlt.analytics._catalog import Catalog, PrivateTable, PublicTable -from tmlt.analytics._query_expr import ( - DropInfinity, - DropNullAndNan, - EnforceConstraint, - Filter, - FlatMap, - FlatMapByID, - GetBounds, - GetGroups, - GroupByBoundedAverage, - GroupByBoundedSTDEV, - GroupByBoundedSum, - GroupByBoundedVariance, - GroupByCount, - GroupByCountDistinct, - GroupByQuantile, - JoinPrivate, - JoinPublic, - Map, - PrivateSource, - QueryExprVisitor, - Rename, - ReplaceInfinity, - ReplaceNullAndNan, - Select, - SuppressAggregates, -) -from tmlt.analytics._schema import ( - ColumnDescriptor, - ColumnType, - Schema, - analytics_to_py_types, - analytics_to_spark_columns_descriptor, - analytics_to_spark_schema, - spark_schema_to_analytics_columns, -) -from tmlt.analytics.constraints import MaxGroupsPerID, MaxRowsPerGroupPerID -from tmlt.analytics.keyset import KeySet - - -def _output_schema_for_join( - left_schema: Schema, - right_schema: Schema, - join_columns: Optional[Tuple[str, ...]], - join_id_space: Optional[str] = None, - how: str = "inner", -) -> Schema: - """Return the resulting schema from joining two tables. - - It is assumed that if either schema has an ID column, the one from - left_schema should be used. 
This is because the appropriate behavior here - depends on the type of join being performed, so checks for compatibility of - ID columns must happen outside this function. - - Args: - left_schema: Schema for the left table. - right_schema: Schema for the right table. - join_columns: The set of columns to join on. - join_id_space: The ID space of the resulting join. - how: The type of join to perform. Default is "inner". - """ - if left_schema.grouping_column is None: - grouping_column = right_schema.grouping_column - elif right_schema.grouping_column is None: - grouping_column = left_schema.grouping_column - elif left_schema.grouping_column == right_schema.grouping_column: - grouping_column = left_schema.grouping_column - else: - raise ValueError( - "Joining tables which both have grouping columns is only supported " - "if they have the same grouping column" - ) - common_columns = set(left_schema) & set(right_schema) - if join_columns is None and not common_columns: - raise ValueError("Tables have no common columns to join on") - if join_columns is not None and not join_columns: - # This error case should be caught when constructing the query - # expression, so it should never get here. - raise AnalyticsInternalError("Empty list of join columns provided.") - - join_columns = ( - join_columns - if join_columns - else tuple(sorted(common_columns, key=list(left_schema).index)) - ) - - if not set(join_columns) <= common_columns: - raise ValueError("Join columns must be common to both tables") - - for column in join_columns: - if left_schema[column].column_type != right_schema[column].column_type: - raise ValueError( - "Join columns must have identical types on both tables, " - f"but column '{column}' does not: {left_schema[column]} and " - f"{right_schema[column]} are incompatible" - ) - - join_column_schemas = {column: left_schema[column] for column in join_columns} - output_schema = { - **join_column_schemas, - **{ - column + ("_left" if column in common_columns else ""): left_schema[column] - for column in left_schema - if column not in join_columns - }, - **{ - column - + ("_right" if column in common_columns else ""): right_schema[column] - for column in right_schema - if column not in join_columns - }, - } - # Use Core's join utilities for determining whether a column can be null - # TODO: This could potentially be used more in this function - output_domain = domain_after_join( - left_domain=SparkDataFrameDomain( - analytics_to_spark_columns_descriptor(left_schema) - ), - right_domain=SparkDataFrameDomain( - analytics_to_spark_columns_descriptor(right_schema) - ), - on=list(join_columns), - how=how, - nulls_are_equal=True, - ) - for column in output_schema: - col_schema = output_schema[column] - output_schema[column] = ColumnDescriptor( - column_type=col_schema.column_type, - allow_null=output_domain.schema[column].allow_null, - allow_nan=col_schema.allow_nan, - allow_inf=col_schema.allow_inf, - ) - return Schema( - output_schema, - grouping_column=grouping_column, - id_column=left_schema.id_column, - id_space=join_id_space, - ) - - -def _validate_groupby( - query: Union[ - GroupByBoundedAverage, - GroupByBoundedSTDEV, - GroupByBoundedSum, - GroupByBoundedVariance, - GroupByCount, - GroupByCountDistinct, - GroupByQuantile, - GetBounds, - ], - output_schema_visitor: "OutputSchemaVisitor", -) -> Schema: - """Validate groupby aggregate query. - - Args: - query: Query expression to be validated. - output_schema_visitor: A visitor to get the output schema of an expression. 
-
-
-    Returns:
-        Output schema of the current QueryExpr.
-    """
-    input_schema = query.child.accept(output_schema_visitor)
-
-    if isinstance(query.groupby_keys, KeySet):
-        # Checks that the KeySet is valid
-        schema = query.groupby_keys.schema()
-        groupby_columns: Collection[str] = schema.keys()
-
-        for column_name, column_desc in schema.items():
-            try:
-                input_column_desc = input_schema[column_name]
-            except KeyError as e:
-                raise KeyError(
-                    f"Groupby column '{column_name}' is not in the input schema."
-                ) from e
-            if column_desc.column_type != input_column_desc.column_type:
-                raise ValueError(
-                    f"Groupby column '{column_name}' has type"
-                    f" '{column_desc.column_type.name}', but the column with the same "
-                    f"name in the input data has type "
-                    f"'{input_column_desc.column_type.name}' instead."
-                )
-    elif isinstance(query.groupby_keys, tuple):
-        # Checks that the listed groupby columns exist in the schema
-        for col in query.groupby_keys:
-            if col not in input_schema:
-                raise ValueError(f"Groupby column '{col}' is not in the input schema.")
-        groupby_columns = query.groupby_keys
-    else:
-        raise AnalyticsInternalError(
-            f"Unexpected groupby_keys type: {type(query.groupby_keys)}."
-        )
-
-    grouping_column = input_schema.grouping_column
-    if grouping_column is not None and grouping_column not in groupby_columns:
-        raise ValueError(
-            f"Column '{grouping_column}' produced by grouping transformation "
-            f"is not in groupby columns {list(groupby_columns)}."
-        )
-    if (
-        not isinstance(query, (GroupByCount, GroupByCountDistinct))
-        and query.measure_column in groupby_columns
-    ):
-        raise ValueError(
-            "Column to aggregate must be a non-grouped column, not "
-            f"'{query.measure_column}'."
-        )
-
-    if isinstance(query, (GroupByCount, GroupByCountDistinct)):
-        output_column_type = ColumnType.INTEGER
-    elif isinstance(query, GetBounds):
-        # Measure column type check not needed, since we check it early in
-        # OutputSchemaVisitor.visit_get_bounds
-        output_column_type = input_schema[query.measure_column].column_type
-    elif isinstance(query, GroupByQuantile):
-        if input_schema[query.measure_column].column_type not in [
-            ColumnType.INTEGER,
-            ColumnType.DECIMAL,
-        ]:
-            raise ValueError(
-                f"Quantile query's measure column '{query.measure_column}' has invalid"
-                f" type '{input_schema[query.measure_column].column_type.name}'."
-                " Expected types: 'INTEGER' or 'DECIMAL'."
-            )
-        output_column_type = ColumnType.DECIMAL
-    elif isinstance(
-        query,
-        (
-            GroupByBoundedSum,
-            GroupByBoundedSTDEV,
-            GroupByBoundedAverage,
-            GroupByBoundedVariance,
-        ),
-    ):
-        if input_schema[query.measure_column].column_type not in [
-            ColumnType.INTEGER,
-            ColumnType.DECIMAL,
-        ]:
-            raise ValueError(
-                f"{type(query).__name__} query's measure column "
-                f"'{query.measure_column}' has invalid type "
-                f"'{input_schema[query.measure_column].column_type.name}'. "
-                "Expected types: 'INTEGER' or 'DECIMAL'."
-            )
-        output_column_type = (
-            input_schema[query.measure_column].column_type
-            if isinstance(query, GroupByBoundedSum)
-            else ColumnType.DECIMAL
-        )
-    else:
-        raise AssertionError(
-            "Unexpected QueryExpr type. This should not happen and is "
-            "probably a bug; please let us know so we can fix it!"
- ) - if isinstance(query, GetBounds): - output_schema = Schema( - { - **{column: input_schema[column] for column in groupby_columns}, - **{ - query.lower_bound_column: ColumnDescriptor( - output_column_type, allow_null=False - ) - }, - **{ - query.upper_bound_column: ColumnDescriptor( - output_column_type, allow_null=False - ) - }, - }, - grouping_column=None, - id_column=None, - ) - else: - output_schema = Schema( - { - **{column: input_schema[column] for column in groupby_columns}, - **{ - query.output_column: ColumnDescriptor( - output_column_type, allow_null=False - ) - }, - }, - grouping_column=None, - id_column=None, - ) - return output_schema - - -class OutputSchemaVisitor(QueryExprVisitor): - """A visitor to get the output schema of a query expression.""" - - def __init__(self, catalog: Catalog): - """Visitor constructor. - - Args: - catalog: The catalog defining schemas and relations between tables. - """ - self._catalog = catalog - - def visit_private_source(self, expr: PrivateSource) -> Schema: - """Return the resulting schema from evaluating a PrivateSource.""" - if expr.source_id not in self._catalog.tables: - raise ValueError(f"Query references nonexistent table '{expr.source_id}'") - table = self._catalog.tables[expr.source_id] - if not isinstance(table, PrivateTable): - raise ValueError( - f"Attempted query on table '{expr.source_id}', which is " - "not a private table" - ) - return table.schema - - def visit_rename(self, expr: Rename) -> Schema: - """Returns the resulting schema from evaluating a Rename.""" - input_schema = expr.child.accept(self) - grouping_column = input_schema.grouping_column - id_column = input_schema.id_column - id_space = input_schema.id_space - nonexistent_columns = set(expr.column_mapper) - set(input_schema) - if nonexistent_columns: - raise ValueError( - f"Nonexistent columns in rename query: {nonexistent_columns}" - ) - for old, new in expr.column_mapper.items(): - if new in input_schema and new != old: - raise ValueError( - f"Cannot rename '{old}' to '{new}': column '{new}' already exists" - ) - if old == grouping_column: - grouping_column = new - if old == id_column: - id_column = new - - return Schema( - { - expr.column_mapper.get(column, column): input_schema[column] - for column in input_schema - }, - grouping_column=grouping_column, - id_column=id_column, - id_space=id_space, - ) - - def visit_filter(self, expr: Filter) -> Schema: - """Returns the resulting schema from evaluating a Filter.""" - input_schema = expr.child.accept(self) - spark = SparkSession.builder.getOrCreate() - test_df = spark.createDataFrame( - [], schema=analytics_to_spark_schema(input_schema) - ) - try: - test_df.filter(expr.condition) - except Exception as e: - raise ValueError(f"Invalid filter condition '{expr.condition}': {e}") from e - return input_schema - - def visit_select(self, expr: Select) -> Schema: - """Returns the resulting schema from evaluating a Select.""" - input_schema = expr.child.accept(self) - - grouping_column = input_schema.grouping_column - id_column = input_schema.id_column - if grouping_column is not None and grouping_column not in expr.columns: - raise ValueError( - f"Grouping column '{grouping_column}' may not " - "be dropped by select query" - ) - if id_column is not None and id_column not in expr.columns: - raise ValueError( - f"ID column '{id_column}' may not be dropped by select query" - ) - - nonexistent_columns = set(expr.columns) - set(input_schema) - if nonexistent_columns: - raise ValueError( - f"Nonexistent columns in select query: 
{nonexistent_columns}" - ) - - return Schema( - {column: input_schema[column] for column in expr.columns}, - grouping_column=grouping_column, - id_column=id_column, - id_space=input_schema.id_space, - ) - - def visit_map(self, expr: Map) -> Schema: - """Returns the resulting schema from evaluating a Map.""" - input_schema = expr.child.accept(self) - new_columns = expr.schema_new_columns.column_descs - # Any column created by Map could contain a null value - for name in list(new_columns.keys()): - new_columns[name] = replace(new_columns[name], allow_null=True) - - if expr.augment: - overlapping_columns = set(input_schema.keys()) & set(new_columns.keys()) - if overlapping_columns: - raise ValueError( - "New columns in augmenting map must not overwrite " - "existing columns, but found new columns that " - f"already exist: {', '.join(overlapping_columns)}" - ) - return Schema( - {**input_schema, **new_columns}, - grouping_column=input_schema.grouping_column, - id_column=input_schema.id_column, - id_space=input_schema.id_space, - ) - elif input_schema.grouping_column: - raise ValueError( - "Map must set augment=True to ensure that " - f"grouping column '{input_schema.grouping_column}' is not lost." - ) - elif input_schema.id_column: - raise ValueError( - "Map must set augment=True to ensure that " - f"ID column '{input_schema.id_column}' is not lost." - ) - return Schema( - new_columns, - grouping_column=expr.schema_new_columns.grouping_column, - id_column=expr.schema_new_columns.id_column, - id_space=expr.schema_new_columns.id_space, - ) - - def visit_flat_map(self, expr: FlatMap) -> Schema: - """Returns the resulting schema from evaluating a FlatMap.""" - input_schema = expr.child.accept(self) - if expr.schema_new_columns.grouping_column is not None: - if input_schema.grouping_column: - raise ValueError( - "Multiple grouping transformations are used in this query. " - "Only one grouping transformation is allowed." - ) - if input_schema.id_column: - raise ValueError( - "Grouping flat map cannot be used on tables with " - "the AddRowsWithID protected change." - ) - grouping_column = expr.schema_new_columns.grouping_column - else: - grouping_column = input_schema.grouping_column - - new_columns = expr.schema_new_columns.column_descs - # Any column created by the FlatMap could contain a null value - for name in list(new_columns.keys()): - new_columns[name] = replace(new_columns[name], allow_null=True) - if expr.augment: - overlapping_columns = set(input_schema.keys()) & set(new_columns.keys()) - if overlapping_columns: - raise ValueError( - "New columns in augmenting map must not overwrite " - "existing columns, but found new columns that " - f"already exist: {', '.join(overlapping_columns)}" - ) - return Schema( - {**input_schema, **new_columns}, - grouping_column=grouping_column, - id_column=input_schema.id_column, - id_space=input_schema.id_space, - ) - elif input_schema.grouping_column: - raise ValueError( - "Flat map must set augment=True to ensure that " - f"grouping column '{input_schema.grouping_column}' is not lost." - ) - elif input_schema.id_column: - raise ValueError( - "Flat map must set augment=True to ensure that " - f"ID column '{input_schema.id_column}' is not lost." 
-            )
-
-        return Schema(
-            new_columns,
-            grouping_column=grouping_column,
-            id_column=expr.schema_new_columns.id_column,
-            id_space=expr.schema_new_columns.id_space,
-        )
-
-    def visit_flat_map_by_id(self, expr: FlatMapByID) -> Schema:
-        """Returns the resulting schema from evaluating a FlatMapByID."""
-        input_schema = expr.child.accept(self)
-        id_column = input_schema.id_column
-        new_columns = expr.schema_new_columns.column_descs
-
-        if not id_column:
-            raise ValueError(
-                "Flat-map-by-ID may only be used on tables with ID columns."
-            )
-        if input_schema.grouping_column:
-            raise AnalyticsInternalError(
-                "Encountered table with both an ID column and a grouping column."
-            )
-        if id_column in new_columns:
-            raise ValueError(
-                "Flat-map-by-ID mapping function output cannot include ID column."
-            )
-
-        for name in list(new_columns.keys()):
-            new_columns[name] = replace(new_columns[name], allow_null=True)
-        return Schema(
-            {id_column: input_schema[id_column], **new_columns},
-            grouping_column=None,
-            id_column=id_column,
-            id_space=input_schema.id_space,
-        )
-
-    def visit_join_private(self, expr: JoinPrivate) -> Schema:
-        """Returns the resulting schema from evaluating a JoinPrivate.
-
-        The ordering of output columns is:
-
-        1. The join columns
-        2. Columns that are only in the left table
-        3. Columns that are only in the right table
-        4. Columns that are in both tables, but not included in the join columns. These
-           columns are included with _left and _right suffixes.
-        """
-        left_schema = expr.child.accept(self)
-        right_schema = expr.right_operand_expr.accept(self)
-        if left_schema.id_column != right_schema.id_column:
-            if left_schema.id_column is None or right_schema.id_column is None:
-                raise ValueError(
-                    "Private joins can only be performed between two tables "
-                    "with the same type of protected change"
-                )
-            raise ValueError(
-                "Private joins between tables with the AddRowsWithID "
-                "protected change are only possible when the ID columns of "
-                "the two tables have the same name"
-            )
-        if (
-            left_schema.id_space
-            and right_schema.id_space
-            and left_schema.id_space != right_schema.id_space
-        ):
-            raise ValueError(
-                "Private joins between tables with the AddRowsWithID protected change"
-                " are only possible when both tables are in the same ID space"
-            )
-        join_id_space: Optional[str] = None
-        if left_schema.id_space and right_schema.id_space:
-            join_id_space = left_schema.id_space
-        return _output_schema_for_join(
-            left_schema=left_schema,
-            right_schema=right_schema,
-            join_columns=expr.join_columns,
-            join_id_space=join_id_space,
-        )
-
-    def visit_join_public(self, expr: JoinPublic) -> Schema:
-        """Returns the resulting schema from evaluating a JoinPublic.
-
-        Has analogous behavior to :meth:`OutputSchemaVisitor.visit_join_private`,
-        where the private table is the left table. 
- """ - input_schema = expr.child.accept(self) - if isinstance(expr.public_table, str): - public_table = self._catalog.tables[expr.public_table] - if not isinstance(public_table, PublicTable): - raise ValueError( - f"Attempted public join on table '{expr.public_table}', " - "which is not a public table" - ) - right_schema = public_table.schema - else: - right_schema = Schema( - spark_schema_to_analytics_columns(expr.public_table.schema) - ) - return _output_schema_for_join( - left_schema=input_schema, - right_schema=right_schema, - join_columns=expr.join_columns, - join_id_space=input_schema.id_space, - how=expr.how, - ) - - def visit_replace_null_and_nan(self, expr: ReplaceNullAndNan) -> Schema: - """Returns the resulting schema from evaluating a ReplaceNullAndNan.""" - input_schema = expr.child.accept(self) - if ( - input_schema.grouping_column - and input_schema.grouping_column in expr.replace_with - ): - raise ValueError( - "Cannot replace null values in column " - f"'{input_schema.grouping_column}', as it is a grouping column." - ) - if input_schema.id_column and input_schema.id_column in expr.replace_with: - raise ValueError( - f"Cannot replace null values in column '{input_schema.id_column}', " - "as it is an ID column." - ) - if input_schema.id_column and (len(expr.replace_with) == 0): - raise RuntimeWarning( - f"Replacing null values in the ID column '{input_schema.id_column}' " - "is not allowed, so the ID column may still contain null values." - ) - - if len(expr.replace_with) != 0: - pytypes = analytics_to_py_types(input_schema) - for col, val in expr.replace_with.items(): - if col not in input_schema.keys(): - raise ValueError( - f"Column '{col}' does not exist in this table, " - f"available columns are {list(input_schema.keys())}" - ) - if not isinstance(val, pytypes[col]): - # it's okay to use an int as a float - # so don't raise an error in that case - if not (isinstance(val, int) and pytypes[col] == float): - raise ValueError( - f"Column '{col}' cannot have nulls replaced with " - f"{repr(val)}, as that value's type does not match the " - f"column type {input_schema[col].column_type.name}" - ) - - columns_to_change = list(dict(expr.replace_with).keys()) - if len(columns_to_change) == 0: - columns_to_change = [ - col - for col in input_schema.column_descs.keys() - if (input_schema[col].allow_null or input_schema[col].allow_nan) - and not (col in [input_schema.grouping_column, input_schema.id_column]) - ] - return Schema( - { - name: ColumnDescriptor( - column_type=cd.column_type, - allow_null=(cd.allow_null and not name in columns_to_change), - allow_nan=(cd.allow_nan and not name in columns_to_change), - allow_inf=cd.allow_inf, - ) - for name, cd in input_schema.column_descs.items() - }, - grouping_column=input_schema.grouping_column, - id_column=input_schema.id_column, - id_space=input_schema.id_space, - ) - - def visit_replace_infinity(self, expr: ReplaceInfinity) -> Schema: - """Returns the resulting schema from evaluating a ReplaceInfinity.""" - input_schema = expr.child.accept(self) - - if ( - input_schema.grouping_column - and input_schema.grouping_column in expr.replace_with - ): - raise ValueError( - "Cannot replace infinite values in column " - f"'{input_schema.grouping_column}', as it is a grouping column" - ) - # Float-valued columns cannot be ID columns, but include this to be safe. 
- if input_schema.id_column and input_schema.id_column in expr.replace_with: - raise ValueError( - f"Cannot replace infinite values in column '{input_schema.id_column}', " - "as it is an ID column" - ) - - columns_to_change = list(expr.replace_with.keys()) - if len(columns_to_change) == 0: - columns_to_change = [ - col - for col in input_schema.column_descs.keys() - if input_schema[col].column_type == ColumnType.DECIMAL - ] - else: - for name in expr.replace_with: - if name not in input_schema.keys(): - raise ValueError( - f"Column '{name}' does not exist in this table, " - f"available columns are {list(input_schema.keys())}" - ) - if input_schema[name].column_type != ColumnType.DECIMAL: - raise ValueError( - f"Column '{name}' has a replacement value provided, but it is " - f"of type {input_schema[name].column_type.name} (not " - f"{ColumnType.DECIMAL.name}) and so cannot " - "contain infinite values" - ) - return Schema( - { - name: ColumnDescriptor( - column_type=cd.column_type, - allow_null=cd.allow_null, - allow_nan=cd.allow_nan, - allow_inf=(cd.allow_inf and not name in columns_to_change), - ) - for name, cd in input_schema.column_descs.items() - }, - grouping_column=input_schema.grouping_column, - id_column=input_schema.id_column, - id_space=input_schema.id_space, - ) - - def visit_drop_null_and_nan(self, expr: DropNullAndNan) -> Schema: - """Returns the resulting schema from evaluating a DropNullAndNan.""" - input_schema = expr.child.accept(self) - if ( - input_schema.grouping_column - and input_schema.grouping_column in expr.columns - ): - raise ValueError( - f"Cannot drop null values in column '{input_schema.grouping_column}', " - "as it is a grouping column" - ) - if input_schema.id_column and input_schema.id_column in expr.columns: - raise ValueError( - f"Cannot drop null values in column '{input_schema.id_column}', " - "as it is an ID column." - ) - if input_schema.id_column and len(expr.columns) == 0: - raise RuntimeWarning( - f"Replacing null values in the ID column '{input_schema.id_column}' " - "is not allowed, so the ID column may still contain null values." - ) - columns = expr.columns - if len(columns) == 0: - columns = tuple( - name - for name, cd in input_schema.column_descs.items() - if (cd.allow_null or cd.allow_nan) - and not name in [input_schema.grouping_column, input_schema.id_column] - ) - else: - for name in columns: - if name not in input_schema.keys(): - raise ValueError( - f"Column '{name}' does not exist in this table, " - f"available columns are {list(input_schema.keys())}" - ) - return Schema( - { - name: ColumnDescriptor( - column_type=cd.column_type, - allow_null=(cd.allow_null and not name in columns), - allow_nan=(cd.allow_nan and not name in columns), - allow_inf=(cd.allow_inf), - ) - for name, cd in input_schema.column_descs.items() - }, - grouping_column=input_schema.grouping_column, - id_column=input_schema.id_column, - id_space=input_schema.id_space, - ) - - def visit_drop_infinity(self, expr: DropInfinity) -> Schema: - """Returns the resulting schema from evaluating a DropInfinity.""" - input_schema = expr.child.accept(self) - - if ( - input_schema.grouping_column - and input_schema.grouping_column in expr.columns - ): - raise ValueError( - "Cannot drop infinite values in column " - f"'{input_schema.grouping_column}', as it is a grouping column" - ) - # Float-valued columns cannot be ID columns, but include this to be safe. 
-        if input_schema.id_column and input_schema.id_column in expr.columns:
-            raise ValueError(
-                f"Cannot drop infinite values in column '{input_schema.id_column}', "
-                "as it is an ID column"
-            )
-
-        columns = expr.columns
-        if len(columns) == 0:
-            columns = tuple(
-                name
-                for name, cd in input_schema.column_descs.items()
-                if (cd.allow_inf) and not name == input_schema.grouping_column
-            )
-        else:
-            for name in columns:
-                if name not in input_schema.keys():
-                    raise ValueError(
-                        f"Column '{name}' does not exist in this table, "
-                        f"available columns are {list(input_schema.keys())}"
-                    )
-                if input_schema[name].column_type != ColumnType.DECIMAL:
-                    raise ValueError(
-                        f"Column '{name}' was given as a column to drop "
-                        "infinite values from, but it is of type "
-                        f"{input_schema[name].column_type.name} (not "
-                        f"{ColumnType.DECIMAL.name}) and so cannot "
-                        "contain infinite values"
-                    )
-
-        return Schema(
-            {
-                name: ColumnDescriptor(
-                    column_type=cd.column_type,
-                    allow_null=cd.allow_null,
-                    allow_nan=cd.allow_nan,
-                    allow_inf=(cd.allow_inf and not name in columns),
-                )
-                for name, cd in input_schema.column_descs.items()
-            },
-            grouping_column=input_schema.grouping_column,
-            id_column=input_schema.id_column,
-            id_space=input_schema.id_space,
-        )
-
-    def visit_enforce_constraint(self, expr: EnforceConstraint) -> Schema:
-        """Returns the resulting schema from evaluating an EnforceConstraint."""
-        input_schema = expr.child.accept(self)
-        constraint = expr.constraint
-
-        if not input_schema.id_column:
-            raise ValueError(
-                f"Constraint {expr.constraint} can only be applied to tables"
-                " with the AddRowsWithID protected change"
-            )
-        if isinstance(constraint, (MaxGroupsPerID, MaxRowsPerGroupPerID)):
-            grouping_column = constraint.grouping_column
-            if grouping_column not in input_schema:
-                raise ValueError(
-                    f"The grouping column of constraint {constraint}"
-                    " does not exist in this table; available columns"
-                    f" are: {', '.join(input_schema.keys())}"
-                )
-            if grouping_column == input_schema.id_column:
-                raise ValueError(
-                    f"The grouping column of constraint {constraint} cannot be"
-                    " the ID column of the table it is applied to"
-                )
-
-        # No current constraints modify the schema. If that changes in the
-        # future, the logic for it may have to be pushed into the Constraint
-        # type (like how constraint._enforce() works), but for now this works. 
- return input_schema - - def visit_get_groups(self, expr: GetGroups) -> Schema: - """Returns the resulting schema from GetGroups.""" - input_schema = expr.child.accept(self) - - if expr.columns: - nonexistent_columns = set(expr.columns) - set(input_schema) - if nonexistent_columns: - raise ValueError( - f"Nonexistent columns in get_groups query: {nonexistent_columns}" - ) - input_schema = Schema( - {column: input_schema[column] for column in expr.columns} - ) - - else: - input_schema = Schema( - { - column: input_schema[column] - for column in input_schema - if column != input_schema.id_column - } - ) - - return input_schema - - def visit_get_bounds(self, expr: GetBounds) -> Schema: - """Returns the resulting schema from GetBounds.""" - input_schema = expr.child.accept(self) - - if expr.measure_column not in set(input_schema): - raise ValueError( - f"Cannot get bounds for column '{expr.measure_column}', which " - "does not exist" - ) - - column = input_schema[expr.measure_column] - if column.column_type not in [ - ColumnType.INTEGER, - ColumnType.DECIMAL, - ]: - raise ValueError( - f"Cannot get bounds for column '{expr.measure_column}'," - f" which is of type {column.column_type.name}; only columns of" - f" numerical type are supported." - ) - - # Check if we're trying to get the bounds of the ID column. - if input_schema.id_column and (input_schema.id_column == expr.measure_column): - raise ValueError( - "get_bounds cannot be used on the privacy ID column" - f" ({input_schema.id_column}) of a table with the AddRowsWithID" - " protected change." - ) - return _validate_groupby(expr, self) - - def visit_groupby_count(self, expr: GroupByCount) -> Schema: - """Returns the resulting schema from evaluating a GroupByCount.""" - return _validate_groupby(expr, self) - - def visit_groupby_count_distinct(self, expr: GroupByCountDistinct) -> Schema: - """Returns the resulting schema from evaluating a GroupByCountDistinct.""" - return _validate_groupby(expr, self) - - def visit_groupby_quantile(self, expr: GroupByQuantile) -> Schema: - """Returns the resulting schema from evaluating a GroupByQuantile.""" - return _validate_groupby(expr, self) - - def visit_groupby_bounded_sum(self, expr: GroupByBoundedSum) -> Schema: - """Returns the resulting schema from evaluating a GroupByBoundedSum.""" - return _validate_groupby(expr, self) - - def visit_groupby_bounded_average(self, expr: GroupByBoundedAverage) -> Schema: - """Returns the resulting schema from evaluating a GroupByBoundedAverage.""" - return _validate_groupby(expr, self) - - def visit_groupby_bounded_variance(self, expr: GroupByBoundedVariance) -> Schema: - """Returns the resulting schema from evaluating a GroupByBoundedVariance.""" - return _validate_groupby(expr, self) - - def visit_groupby_bounded_stdev(self, expr: GroupByBoundedSTDEV) -> Schema: - """Returns the resulting schema from evaluating a GroupByBoundedSTDEV.""" - return _validate_groupby(expr, self) - - def visit_suppress_aggregates(self, expr: SuppressAggregates) -> Schema: - """Returns the resulting schema from evaluating a SuppressAggregates.""" - return expr.child.accept(self) diff --git a/test/system/session/ids/test_id_col_operations.py b/test/system/session/ids/test_id_col_operations.py index f2963144..a0069deb 100644 --- a/test/system/session/ids/test_id_col_operations.py +++ b/test/system/session/ids/test_id_col_operations.py @@ -104,7 +104,7 @@ def test_replace_null_and_nan_raises_error( ) def test_replace_null_and_nan_raises_warning(session, query: QueryBuilder): """Tests 
that replace nulls/nans raises warning on IDs table with empty mapping.""" - with pytest.raises( + with pytest.warns( RuntimeWarning, match="the ID column may still contain null values." ): session.evaluate( @@ -121,7 +121,7 @@ def test_replace_null_and_nan_raises_warning(session, query: QueryBuilder): ) def test_drop_null_and_nan_raises_warning(session, query: QueryBuilder): """Tests that replace nulls/nans raises warning on IDs table with empty list.""" - with pytest.raises( + with pytest.warns( RuntimeWarning, match="the ID column may still contain null values." ): session.evaluate( diff --git a/test/system/session/rows/test_add_max_rows.py b/test/system/session/rows/test_add_max_rows.py index c606bbd8..252cc6ba 100644 --- a/test/system/session/rows/test_add_max_rows.py +++ b/test/system/session/rows/test_add_max_rows.py @@ -529,8 +529,8 @@ def test_get_bounds_inf_budget_sum(self, spark, data): column="str_column", protected_change=AddOneRow(), error_type=ValueError, - message="Cannot get bounds for column 'str_column'," - " which is of type VARCHAR", + message="GetBounds query's measure column 'str_column' has invalid type" + " 'VARCHAR'. Expected types: 'INTEGER' or 'DECIMAL'", ), Case("missing_column")( data=pd.DataFrame( @@ -540,8 +540,8 @@ def test_get_bounds_inf_budget_sum(self, spark, data): column="column_does_not_exist", protected_change=AddOneRow(), error_type=ValueError, - message="Cannot get bounds for column 'column_does_not_exist'," - " which does not exist", + message="GetBounds query's measure column 'column_does_not_exist'" + " does not exist", ), Case("id_column")( data=pd.DataFrame( @@ -551,7 +551,8 @@ def test_get_bounds_inf_budget_sum(self, spark, data): column="id_column", protected_change=AddRowsWithID("id_column"), error_type=ValueError, - message="get_bounds cannot be used on the privacy ID column", + message="GetBounds query's measure column is the same as the privacy ID " + "column\\(id_column\\)", ), ) def test_get_bounds_invalid_columns( diff --git a/test/unit/query_expr_compiler/test_measurement_visitor.py b/test/unit/query_expr_compiler/test_measurement_visitor.py index 321271fc..8b51779a 100644 --- a/test/unit/query_expr_compiler/test_measurement_visitor.py +++ b/test/unit/query_expr_compiler/test_measurement_visitor.py @@ -77,9 +77,6 @@ _get_query_bounds, ) from tmlt.analytics._query_expr_compiler._measurement_visitor import MeasurementVisitor -from tmlt.analytics._query_expr_compiler._output_schema_visitor import ( - OutputSchemaVisitor, -) from tmlt.analytics._schema import ( ColumnDescriptor, ColumnType, @@ -329,9 +326,7 @@ def run_with_empty_data_and_check_schema( self, query: QueryExpr, output_measure: Union[PureDP, RhoZCDP] ): """Run a query and check the schema of the result.""" - expected_column_types = query.accept( - OutputSchemaVisitor(self.catalog) - ).column_types + expected_column_types = query.schema(self.catalog).column_types self.visitor.output_measure = output_measure measurement, _ = query.accept(self.visitor) empty_data = create_empty_input(measurement.input_domain) diff --git a/test/unit/query_expr_compiler/transformation_visitor/test_add_keys.py b/test/unit/query_expr_compiler/transformation_visitor/test_add_keys.py index 796043cc..09de96a4 100644 --- a/test/unit/query_expr_compiler/transformation_visitor/test_add_keys.py +++ b/test/unit/query_expr_compiler/transformation_visitor/test_add_keys.py @@ -35,9 +35,6 @@ ReplaceNullAndNan, Select, ) -from tmlt.analytics._query_expr_compiler._output_schema_visitor import ( - 
OutputSchemaVisitor, -) from tmlt.analytics._query_expr_compiler._transformation_visitor import ( TransformationVisitor, ) @@ -80,7 +77,7 @@ def _validate_transform_basics( first_transform = chain_to_list(transformation)[0] assert isinstance(first_transform, IdentityTransformation) - expected_schema = query.accept(OutputSchemaVisitor(self.catalog)) + expected_schema = query.schema(self.catalog) assert expected_schema.grouping_column == grouping_column expected_output_domain = SparkDataFrameDomain( @@ -485,11 +482,11 @@ def test_visit_replace_null_and_nan( self._validate_result(transformation, reference, expected_df) assert constraints == [] - expected_output_schema = query.accept(OutputSchemaVisitor(self.catalog)) + expected_output_schema = query.schema(self.catalog) expected_output_domain = SparkDataFrameDomain( schema=analytics_to_spark_columns_descriptor(expected_output_schema) ) - expected_output_schema = query.accept(OutputSchemaVisitor(self.catalog)) + expected_output_schema = query.schema(self.catalog) expected_output_domain = SparkDataFrameDomain( schema=analytics_to_spark_columns_descriptor(expected_output_schema) ) @@ -528,7 +525,7 @@ def test_visit_replace_infinity( self._validate_result(transformation, reference, expected_df) assert constraints == [] - expected_output_schema = query.accept(OutputSchemaVisitor(self.catalog)) + expected_output_schema = query.schema(self.catalog) expected_output_domain = SparkDataFrameDomain( schema=analytics_to_spark_columns_descriptor(expected_output_schema) ) @@ -556,7 +553,7 @@ def _validate_transform_basics( first_transform = chain_to_list(transformation)[0] assert isinstance(first_transform, IdentityTransformation) - expected_schema = query.accept(OutputSchemaVisitor(self.catalog)) + expected_schema = query.schema(self.catalog) assert expected_schema.grouping_column == "id" expected_output_domain = SparkDataFrameDomain( diff --git a/test/unit/query_expr_compiler/transformation_visitor/test_add_rows.py b/test/unit/query_expr_compiler/transformation_visitor/test_add_rows.py index 31efba60..3d4fcc92 100644 --- a/test/unit/query_expr_compiler/transformation_visitor/test_add_rows.py +++ b/test/unit/query_expr_compiler/transformation_visitor/test_add_rows.py @@ -44,9 +44,6 @@ ReplaceNullAndNan, Select, ) -from tmlt.analytics._query_expr_compiler._output_schema_visitor import ( - OutputSchemaVisitor, -) from tmlt.analytics._query_expr_compiler._transformation_visitor import ( TransformationVisitor, ) @@ -86,7 +83,7 @@ def _validate_transform_basics( first_transform = chain_to_list(t)[0] assert isinstance(first_transform, IdentityTransformation) - expected_schema = query.accept(OutputSchemaVisitor(self.catalog)) + expected_schema = query.schema(self.catalog) expected_output_domain = SparkDataFrameDomain( analytics_to_spark_columns_descriptor(expected_schema) ) @@ -396,7 +393,7 @@ def test_visit_join_private( assert transformation.input_domain == self.visitor.input_domain assert transformation.input_metric == self.visitor.input_metric - expected_schema = query.accept(OutputSchemaVisitor(self.catalog)) + expected_schema = query.schema(self.catalog) expected_output_domain = SparkDataFrameDomain( analytics_to_spark_columns_descriptor(expected_schema) ) @@ -585,11 +582,11 @@ def test_visit_replace_null_and_nan( self._validate_transform_basics(transformation, reference, query) assert isinstance(transformation, ChainTT) assert isinstance(transformation.transformation2, AugmentDictTransformation) - expected_output_schema = 
query.accept(OutputSchemaVisitor(self.catalog)) + expected_output_schema = query.schema(self.catalog) expected_output_domain = SparkDataFrameDomain( schema=analytics_to_spark_columns_descriptor(expected_output_schema) ) - expected_output_schema = query.accept(OutputSchemaVisitor(self.catalog)) + expected_output_schema = query.schema(self.catalog) expected_output_domain = SparkDataFrameDomain( schema=analytics_to_spark_columns_descriptor(expected_output_schema) ) @@ -672,7 +669,7 @@ def test_visit_replace_infinity( transformation, reference, constraints = query.accept(self.visitor) self._validate_transform_basics(transformation, reference, query) - expected_output_schema = query.accept(OutputSchemaVisitor(self.catalog)) + expected_output_schema = query.schema(self.catalog) expected_output_domain = SparkDataFrameDomain( schema=analytics_to_spark_columns_descriptor(expected_output_schema) ) @@ -847,7 +844,7 @@ def _validate_transform_basics( assert t.input_domain == self.visitor.input_domain assert t.input_metric == self.visitor.input_metric - expected_schema = query.accept(OutputSchemaVisitor(self.catalog)) + expected_schema = query.schema(self.catalog) expected_output_domain = SparkDataFrameDomain( analytics_to_spark_columns_descriptor(expected_schema) ) diff --git a/test/unit/query_expr_compiler/test_output_schema_visitor.py b/test/unit/test_query_expression_schema.py similarity index 94% rename from test/unit/query_expr_compiler/test_output_schema_visitor.py rename to test/unit/test_query_expression_schema.py index 80a47200..2d13f3f1 100644 --- a/test/unit/query_expr_compiler/test_output_schema_visitor.py +++ b/test/unit/test_query_expression_schema.py @@ -1,4 +1,4 @@ -"""Tests for OutputSchemaVisitor.""" +"""Tests for QueryExpression schema determination.""" # SPDX-License-Identifier: Apache-2.0 # Copyright Tumult Labs 2025 @@ -37,9 +37,6 @@ Select, SuppressAggregates, ) -from tmlt.analytics._query_expr_compiler._output_schema_visitor import ( - OutputSchemaVisitor, -) from tmlt.analytics._schema import ( ColumnDescriptor, ColumnType, @@ -325,7 +322,7 @@ ###TESTS FOR QUERY VALIDATION### -@pytest.fixture(name="validation_visitor", scope="class") +@pytest.fixture(name="validation_catalog", scope="class") def setup_validation(request): """Set up test data.""" catalog = Catalog() @@ -361,15 +358,14 @@ def setup_validation(request): catalog.add_private_table( "groupby_one_column_private", {"A": ColumnDescriptor(ColumnType.VARCHAR)} ) - visitor = OutputSchemaVisitor(catalog) - request.cls.visitor = visitor + request.cls.catalog = catalog -@pytest.mark.usefixtures("validation_visitor") +@pytest.mark.usefixtures("validation_catalog") class TestValidation: - """Test Validation with Visitor.""" + """Test Validation with Catalog.""" - visitor: OutputSchemaVisitor + catalog: Catalog @pytest.mark.parametrize( "query_expr,expected_error_msg", OUTPUT_SCHEMA_INVALID_QUERY_TESTS @@ -379,7 +375,7 @@ def test_invalid_query_expr( ) -> None: """Check that appropriate exceptions are raised on invalid queries.""" with pytest.raises(ValueError, match=expected_error_msg): - query_expr.accept(self.visitor) + query_expr.schema(self.catalog) @pytest.mark.parametrize( "groupby_keys,exception_type,expected_error_msg", @@ -428,7 +424,7 @@ def test_invalid_group_by_count( ) -> None: """Test invalid measurement QueryExpr.""" with pytest.raises(exception_type, match=expected_error_msg): - GroupByCount(PrivateSource("private"), groupby_keys).accept(self.visitor) + GroupByCount(PrivateSource("private"), 
groupby_keys).schema(self.catalog) @pytest.mark.parametrize( "groupby_keys,exception_type,expected_error_msg", @@ -485,13 +481,13 @@ def test_invalid_group_by_aggregations( GroupByBoundedVariance, ]: with pytest.raises(exception_type, match=expected_error_msg): - DataClass(PrivateSource("private"), groupby_keys, "B", 1.0, 5.0).accept( - self.visitor + DataClass(PrivateSource("private"), groupby_keys, "B", 1.0, 5.0).schema( + self.catalog ) with pytest.raises(exception_type, match=expected_error_msg): GroupByQuantile( PrivateSource("private"), groupby_keys, "B", 0.5, 1.0, 5.0 - ).accept(self.visitor) + ).schema(self.catalog) with pytest.raises(exception_type, match=expected_error_msg): GetBounds( PrivateSource("private"), @@ -499,12 +495,12 @@ def test_invalid_group_by_aggregations( "B", "lower_bound", "upper_bound", - ).accept(self.visitor) + ).schema(self.catalog) ###QUERY VALIDATION WITH NULLS### @pytest.fixture(name="test_data_nulls", scope="class") -def setup_visitor_with_nulls(request) -> None: +def setup_catalog_with_nulls(request) -> None: """Set up test data.""" catalog = Catalog() catalog.add_private_table( @@ -543,15 +539,14 @@ def setup_visitor_with_nulls(request) -> None: "groupby_one_column_private", {"A": ColumnDescriptor(ColumnType.VARCHAR, allow_null=True)}, ) - visitor = OutputSchemaVisitor(catalog) - request.cls.visitor = visitor + request.cls.catalog = catalog @pytest.mark.usefixtures("test_data_nulls") class TestValidationWithNulls: """Test Validation with Nulls.""" - visitor: OutputSchemaVisitor + catalog: Catalog @pytest.mark.parametrize( "query_expr,expected_error_msg", OUTPUT_SCHEMA_INVALID_QUERY_TESTS @@ -561,7 +556,7 @@ def test_invalid_query_expr_null( ) -> None: """Check that appropriate exceptions are raised on invalid queries.""" with pytest.raises(ValueError, match=expected_error_msg): - query_expr.accept(self.visitor) + query_expr.schema(self.catalog) @pytest.mark.parametrize( "groupby_keys,exception_type,expected_error_msg", @@ -602,7 +597,7 @@ def test_invalid_group_by_count_null( ) -> None: """Test invalid measurement QueryExpr.""" with pytest.raises(exception_type, match=expected_error_msg): - GroupByCount(PrivateSource("private"), groupby_keys).accept(self.visitor) + GroupByCount(PrivateSource("private"), groupby_keys).schema(self.catalog) @pytest.mark.parametrize( "groupby_keys,exception_type,expected_error_msg", @@ -659,13 +654,13 @@ def test_invalid_group_by_aggregations_null( GroupByBoundedVariance, ]: with pytest.raises(exception_type, match=expected_error_msg): - DataClass(PrivateSource("private"), groupby_keys, "B", 1.0, 5.0).accept( - self.visitor + DataClass(PrivateSource("private"), groupby_keys, "B", 1.0, 5.0).schema( + self.catalog ) with pytest.raises(exception_type, match=expected_error_msg): GroupByQuantile( PrivateSource("private"), groupby_keys, "B", 0.5, 1.0, 5.0 - ).accept(self.visitor) + ).schema(self.catalog) with pytest.raises(exception_type, match=expected_error_msg): GetBounds( PrivateSource("private"), @@ -673,18 +668,13 @@ def test_invalid_group_by_aggregations_null( "B", "lower_bound", "upper_bound", - ).accept(self.visitor) + ).schema(self.catalog) - def test_visit_private_source(self) -> None: - """Test visit_private_source.""" + def test_schema_private_source(self) -> None: + """Test schema for private_source.""" query = PrivateSource("private") - schema = self.visitor.visit_private_source(query) - assert ( - schema - == self.visitor._catalog.tables[ # pylint: disable=protected-access - "private" - ].schema - ) + schema = 
query.schema(self.catalog) + assert schema == self.catalog.tables["private"].schema @pytest.mark.parametrize( "column_mapper,expected_schema", @@ -753,28 +743,23 @@ def test_visit_private_source(self) -> None: ), ], ) - def test_visit_rename( + def test_schema_rename( self, column_mapper: Dict[str, str], expected_schema: Schema ) -> None: - """Test visit_rename.""" + """Test schema for rename.""" query = Rename( child=PrivateSource("private"), column_mapper=FrozenDict.from_dict(column_mapper), ) - schema = self.visitor.visit_rename(query) + schema = query.schema(self.catalog) assert schema == expected_schema @pytest.mark.parametrize("condition", ["B > X", "X < 500", "NOTNULL < 30"]) - def test_visit_filter(self, condition: str) -> None: - """Test visit_filter.""" + def test_schema_filter(self, condition: str) -> None: + """Test schema for filter.""" query = Filter(child=PrivateSource("private"), condition=condition) - schema = self.visitor.visit_filter(query) - assert ( - schema - == self.visitor._catalog.tables[ # pylint: disable=protected-access - "private" - ].schema - ) + schema = query.schema(self.catalog) + assert schema == self.catalog.tables["private"].schema @pytest.mark.parametrize( "columns,expected_schema", @@ -796,10 +781,10 @@ def test_visit_filter(self, condition: str) -> None: ), ], ) - def test_visit_select(self, columns: List[str], expected_schema: Schema) -> None: - """Test visit_select.""" + def test_schema_select(self, columns: List[str], expected_schema: Schema) -> None: + """Test schema for select.""" query = Select(child=PrivateSource("private"), columns=tuple(columns)) - schema = self.visitor.visit_select(query) + schema = query.schema(self.catalog) assert schema == expected_schema @pytest.mark.parametrize( @@ -936,9 +921,9 @@ def test_visit_select(self, columns: List[str], expected_schema: Schema) -> None ), ], ) - def test_visit_map(self, query: Map, expected_schema: Schema) -> None: - """Test visit_map.""" - schema = self.visitor.visit_map(query) + def test_schema_map(self, query: Map, expected_schema: Schema) -> None: + """Test schema for map.""" + schema = query.schema(self.catalog) assert schema == expected_schema @pytest.mark.parametrize( @@ -987,9 +972,9 @@ def test_visit_map(self, query: Map, expected_schema: Schema) -> None: ), ], ) - def test_visit_flat_map(self, query: FlatMap, expected_schema: Schema) -> None: - """Test visit_flat_map.""" - schema = self.visitor.visit_flat_map(query) + def test_schema_flat_map(self, query: FlatMap, expected_schema: Schema) -> None: + """Test schema for flat_map.""" + schema = query.schema(self.catalog) assert schema == expected_schema @pytest.mark.parametrize( @@ -1022,11 +1007,11 @@ def test_visit_flat_map(self, query: FlatMap, expected_schema: Schema) -> None: ) ], ) - def test_visit_join_private( + def test_schema_join_private( self, query: JoinPrivate, expected_schema: Schema ) -> None: - """Test visit_join_private.""" - schema = self.visitor.visit_join_private(query) + """Test schema for join_private.""" + schema = query.schema(self.catalog) assert schema == expected_schema @pytest.mark.parametrize( @@ -1081,11 +1066,11 @@ def test_visit_join_private( ), ], ) - def test_visit_join_public( + def test_schema_join_public( self, query: JoinPublic, expected_schema: Schema ) -> None: - """Test visit_join_public.""" - schema = self.visitor.visit_join_public(query) + """Test schema for join_public.""" + schema = query.schema(self.catalog) assert schema == expected_schema @parametrize( @@ -1134,19 +1119,20 @@ def 
test_visit_join_public( ), ), ) - def test_visit_join_private_nulls(self, left_schema, right_schema, expected_schema): - """Test that OutputSchemaVisitor correctly propagates nulls through a join.""" + def test_schema_join_private_nulls( + self, left_schema, right_schema, expected_schema + ): + """Test that schema correctly propagates nulls through a join.""" catalog = Catalog() catalog.add_private_table("left", left_schema) catalog.add_private_table("right", right_schema) - visitor = OutputSchemaVisitor(catalog) query = JoinPrivate( child=PrivateSource("left"), right_operand_expr=PrivateSource("right"), truncation_strategy_left=TruncationStrategy.DropExcess(1), truncation_strategy_right=TruncationStrategy.DropExcess(1), ) - result_schema = visitor.visit_join_private(query) + result_schema = query.schema(catalog) assert result_schema == expected_schema @parametrize( @@ -1195,16 +1181,15 @@ def test_visit_join_private_nulls(self, left_schema, right_schema, expected_sche ), ), ) - def test_visit_join_public_nulls( + def test_schema_join_public_nulls( self, private_schema, public_schema, expected_schema ): - """Test that OutputSchemaVisitor correctly propagates nulls through a join.""" + """Test that schema correctly propagates nulls through a join.""" catalog = Catalog() catalog.add_private_table("private", private_schema) catalog.add_public_table("public", public_schema) - visitor = OutputSchemaVisitor(catalog) query = JoinPublic(child=PrivateSource("private"), public_table="public") - result_schema = visitor.visit_join_public(query) + result_schema = query.schema(catalog) assert result_schema == expected_schema @pytest.mark.parametrize( @@ -1289,11 +1274,11 @@ def test_visit_join_public_nulls( ), ], ) - def test_visit_replace_null_and_nan( + def test_schema_replace_null_and_nan( self, query: ReplaceNullAndNan, expected_schema: Schema ) -> None: - """Test visit_replace_null_and_nan.""" - schema = self.visitor.visit_replace_null_and_nan(query) + """Test schema for replace_null_and_nan.""" + schema = query.schema(self.catalog) assert schema == expected_schema @pytest.mark.parametrize( @@ -1376,11 +1361,11 @@ def test_visit_replace_null_and_nan( ), ], ) - def test_visit_drop_null_and_nan( + def test_schema_drop_null_and_nan( self, query: DropNullAndNan, expected_schema: Schema ) -> None: - """Test visit_drop_null_and_nan.""" - schema = self.visitor.visit_drop_null_and_nan(query) + """Test schema for drop_null_and_nan.""" + schema = query.schema(self.catalog) assert schema == expected_schema @pytest.mark.parametrize( @@ -1461,11 +1446,11 @@ def test_visit_drop_null_and_nan( ), ], ) - def test_visit_drop_infinity( + def test_schema_drop_infinity( self, query: DropInfinity, expected_schema: Schema ) -> None: - """Test visit_drop_infinity.""" - schema = self.visitor.visit_drop_infinity(query) + """Test schema for drop_infinity.""" + schema = query.schema(self.catalog) assert schema == expected_schema @pytest.mark.parametrize( @@ -1623,15 +1608,15 @@ def test_visit_drop_infinity( ), ], ) - def test_visit_groupby_queries( + def test_schema_groupby_queries( self, query: QueryExpr, expected_schema: Schema ) -> None: - """Test visit_groupby_*.""" - schema = query.accept(self.visitor) + """Test schema for groupby_*.""" + schema = query.schema(self.catalog) assert schema == expected_schema - def test_visit_groupby_get_bounds_partition_selection(self) -> None: - """Test visit_get_bounds with auto partition selection enabled.""" + def test_schema_groupby_get_bounds_partition_selection(self) -> None: + 
"""Test schema for get_bounds with auto partition selection enabled.""" expected_schema = Schema( { "A": ColumnDescriptor(ColumnType.VARCHAR, allow_null=True), @@ -1647,7 +1632,7 @@ def test_visit_groupby_get_bounds_partition_selection(self) -> None: lower_bound_column="lower_bound", upper_bound_column="upper_bound", ) - schema = query.accept(self.visitor) + schema = query.schema(self.catalog) assert schema == expected_schema @pytest.mark.parametrize( @@ -1678,8 +1663,8 @@ def test_visit_groupby_get_bounds_partition_selection(self) -> None: ), ], ) - def test_visit_suppress_aggregates(self, query: SuppressAggregates) -> None: - """Test visit_suppress_aggregates.""" - expected_schema = query.child.accept(self.visitor) - got_schema = query.accept(self.visitor) + def test_schema_suppress_aggregates(self, query: SuppressAggregates) -> None: + """Test schema for suppress_aggregates.""" + expected_schema = query.child.schema(self.catalog) + got_schema = query.schema(self.catalog) assert expected_schema == got_schema From 44f20d8f1f270c86ee3b57f6dcda8460602ded6f Mon Sep 17 00:00:00 2001 From: Ted Date: Sat, 18 Oct 2025 15:20:40 +0200 Subject: [PATCH 08/25] Make the merge queue fail immediately if linting fails (#90) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This only has a minor impact on dev experience (if both lint and tests/docs fail, you only hear about the latter when you re-try merging). But one Should™ check all that locally to avoid spending too many CI minutes, so linting failures indicate "you should re-check stuff locally", and there's no need to then run more expensive processes like tests and docs. --- .github/workflows/merge_queue.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/merge_queue.yml b/.github/workflows/merge_queue.yml index 25eb3fdf..26b032ef 100644 --- a/.github/workflows/merge_queue.yml +++ b/.github/workflows/merge_queue.yml @@ -42,7 +42,9 @@ jobs: - run: uv run nox -t lint Test: runs-on: ubuntu-latest - needs: Package + needs: + - Package + - Lint steps: - name: Checkout repository uses: actions/checkout@v4 @@ -56,7 +58,9 @@ jobs: - run: uv run nox -s smoketest test-doctest test-fast Docs: runs-on: ubuntu-latest - needs: Package + needs: + - Package + - Lint steps: - name: Checkout repository uses: actions/checkout@v4 From 23ddea7b60ee773c5aeb6ff65b91bb5a8436c0ea Mon Sep 17 00:00:00 2001 From: dasm Date: Sat, 18 Oct 2025 17:52:23 -0700 Subject: [PATCH 09/25] Delete old gitlab-specific files (#91) * Delete gitlab CI definitions. * Delete nightly pipeline handler/slack webhook code. 
* Delete gitlab issue template --- .ci/common.yml | 44 ----- .ci/pipeline-handler.py | 185 ------------------- .ci/pipeline-handlers.bash | 54 ------ .gitlab-ci.yml | 279 ----------------------------- .gitlab/issue_templates/Default.md | 21 --- 5 files changed, 583 deletions(-) delete mode 100644 .ci/common.yml delete mode 100644 .ci/pipeline-handler.py delete mode 100644 .ci/pipeline-handlers.bash delete mode 100644 .gitlab-ci.yml delete mode 100644 .gitlab/issue_templates/Default.md diff --git a/.ci/common.yml b/.ci/common.yml deleted file mode 100644 index 3986035b..00000000 --- a/.ci/common.yml +++ /dev/null @@ -1,44 +0,0 @@ -variables: - PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip" - POETRY_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pypoetry" - POETRY_VIRTUALENVS_IN_PROJECT: "true" - # Force nox to produce colorful logs: - FORCE_COLOR: "true" - # Enable feature flags - # https://docs.gitlab.com/runner/configuration/feature-flags.html - FF_SCRIPT_SECTIONS: "true" - FF_USE_FASTZIP: "true" - -.base: - image: registry.gitlab.com/tumult-labs/ops/ci/linux:python3.10 - before_script: - - java -version - - python --version - - poetry self show - # Use same abbreviated SHA of first 7 characters across different CI jobs for the same commit. - # Without this there are random failures due to different abbreviations while installing apro package. - # https://github.com/mtkennerly/dunamai/issues/89 - - git config core.abbrev 7 - # Set up SSH config so that the runner can `pip install` out of GitLab - # repositories. - - mkdir -p ~/.ssh/ - - cp $GITLAB_SSH_KNOWN_HOSTS ~/.ssh/known_hosts - - chmod 600 "$CI_SSH_KEY" - - cp -pv "$CI_SSH_KEY" ~/.ssh/id_ed25519 - - poetry install --only scripting - - source .venv/bin/activate - artifacts: - when: always - expire_in: 1 week - cache: - # Cache the pip cache. While the cache could be persisted across changes to - # the Poetry lock file, clearing it when that changes provides a good way to - # keep the cache from growing too large due to old packages. 
- - key: - files: ["poetry.lock"] - paths: [".cache/pip", ".cache/pypoetry"] - tags: [aws-small] - interruptible: true - after_script: - # Just to be safe, remove the key after the job finishes - - rm -v ~/.ssh/id_ed25519 diff --git a/.ci/pipeline-handler.py b/.ci/pipeline-handler.py deleted file mode 100644 index 680f3918..00000000 --- a/.ci/pipeline-handler.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python -"""Various CI pipeline handlers.""" - -import requests -import sys -import os -import logging -import argparse - -_log = logging.getLogger() -RESULTS_PER_PAGE = 100 - -def _send_slack_webhook(url: str, content: str): - """Create a Slack post with the given content using a webhook.""" - body = { - "blocks": [{"type": "section", "text": {"type": "mrkdwn", "text": content}}] - } - - _log.info("Sending Slack webhook") - _log.debug(f"Body: {body}") - resp = requests.post(url, json=body) - if resp.status_code > 399: - _log.critical( - f"Slack webhook failed (HTTP {resp.status_code}), response: {resp.text}" - ) - raise RuntimeError("Error sending webhook") - - _log.info(f"Slack webhook sent, status {resp.status_code}") - - -def _gitlab_api_get(path: str, token: str): - """Query the GitLab API at the given path, using the given token for auth.""" - api_v4_url = os.environ["CI_API_V4_URL"] - headers = {"Authorization": f"Bearer {token}"} - return requests.get( - f"{api_v4_url}/{path}?per_page={RESULTS_PER_PAGE}", headers=headers - ) - - -def _get_pipeline_jobs(project_id: int, pipeline_id: int, token: str): - resp = _gitlab_api_get( - f"projects/{project_id}/pipelines/{pipeline_id}/jobs", token=token - ) - if resp.status_code > 399: - _log.critical( - f"Unable to list pipeline {pipeline_id} jobs (HTTP {resp.status_code}), " - f"response: {resp.text}" - ) - raise RuntimeError("Error making GitLab API requests") - return resp - - -def _nightly_handler(args): - token = os.environ["NIGHTLY_HANDLER_TOKEN"] - webhook_url = os.environ["NIGHTLY_SLACK_WEBHOOK_URL"] - project_id = int(os.environ["CI_PROJECT_ID"]) - pipeline_id = int(os.environ["CI_PIPELINE_ID"]) - pipeline_url = os.environ["CI_PIPELINE_URL"] - pipeline_branch = os.environ["CI_COMMIT_BRANCH"] - - _log.info(f"Handling pipeline {pipeline_id} as nightly pipeline...") - - jobs_json = _get_pipeline_jobs(project_id, pipeline_id, token).json() - jobs = {j["name"]: j for j in jobs_json} - - bridges_resp = _gitlab_api_get( - f"projects/{project_id}/pipelines/{pipeline_id}/bridges", token=token - ) - if bridges_resp.status_code > 399: - _log.critical( - f"Unable to list downstream pipelines (HTTP {bridges_resp.status_code}), " - f"response: {bridges_resp.text}" - ) - raise RuntimeError("Error making GitLab API requests") - bridges_json = bridges_resp.json() - for bridge in bridges_json: - downstream_pipeline = bridge["downstream_pipeline"] - bridge_jobs_json = _get_pipeline_jobs( - downstream_pipeline["project_id"], downstream_pipeline["id"], token - ).json() - for job in bridge_jobs_json: - jobs[f"{bridge['name']}:{job['name']}"] = job - - jobs = {j.replace(" ", ""): body for j, body in jobs.items()} - # The handler job is obviously still running when this script runs, which - # makes the script consider it as failed. So, we ignore it. 
- jobs.pop("nightly_handler") - - unknown_allow_failures = set(args.allow_failure) - set(jobs.keys()) - if unknown_allow_failures: - _log.warning( - "Jobs were set to allow failure, but they do not exist: " - + " ".join(unknown_allow_failures) - ) - - passed_jobs = {j: body for j, body in jobs.items() if body["status"] == "success"} - failed_jobs = { - j: body for j, body in jobs.items() - if body["status"] not in {"success", "manual", "skipped"} - and j not in args.allow_failure - } - allowed_failed_jobs = { - j: body for j, body in jobs.items() - if body["status"] not in {"success", "manual", "skipped"} - and j in args.allow_failure - } - - def format_job_status(jobs): - return [f"{j} ({b['status']})" for j, b in jobs.items()] - - _log.info(f"Passed jobs: {', '.join(passed_jobs.keys())}") - _log.info(f"Failed jobs: {', '.join(format_job_status(failed_jobs))}") - _log.info( - f"Allowed failed jobs: {', '.join(format_job_status(allowed_failed_jobs))}" - ) - - if failed_jobs: - status_text = ( - f"Nightly <{pipeline_url}|pipeline> for `{pipeline_branch}`: :x: Failed" - ) - job_links = [f"<{body['web_url']}|{j}>" for j, body in failed_jobs.items()] - body_text = f"Failed jobs: {', '.join(job_links)}\n@channel" - else: - trigger_release_url = jobs["trigger_release"]["web_url"] - status_text = ( - f"Nightly <{pipeline_url}|pipeline> for `{pipeline_branch}`: " - ":white_check_mark: Passed" - ) - body_text = f"Trigger a release with <{trigger_release_url}|this job>" - - if allowed_failed_jobs: - job_links = [f"<{body['web_url']}|{j}>" for j, body in failed_jobs.items()] - body_text += ( - "\nSome jobs which are allowed to fail are failing: " - + ", ".join(job_links) - + "\nThis may be because of a bug, or just the result of intentional " - "backwards-incompatible changes. If you are making a release based on " - "this commit, ensure that we know the cause of all of these failures " - "and that they are expected." 
- ) - - _send_slack_webhook(webhook_url, f"*{status_text}*\n{body_text}") - - -def _release_handler(args): - raise NotImplementedError( - "Continue using the Bash version of the release handler for now" - ) - - -def _arg_parser_nightly_handler(subparsers): - sp = subparsers.add_parser("nightly", help="Run the nightly pipeline handler") - sp.add_argument("--allow-failure", action="append", default=[]) - sp.set_defaults(func=_nightly_handler) - - -def _arg_parser_release_handler(subparsers): - sp = subparsers.add_parser("release", help="Run the release handler") - sp.set_defaults(func=_release_handler) - - -def _arg_parser(): - parser = argparse.ArgumentParser(prog="pipeline-handler") - subparsers = parser.add_subparsers(dest="cmd", required=True) - _arg_parser_nightly_handler(subparsers) - _arg_parser_release_handler(subparsers) - - return parser - - -def main(argv): - parser = _arg_parser() - parser.add_argument("-v", "--verbose", action="store_true") - args = parser.parse_args(argv) - logging.basicConfig( - level=logging.DEBUG if args.verbose else logging.INFO, - format="%(levelname)s: %(message)s", - ) - _log.info(f"Arguments: {args}") - - args.func(args) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/.ci/pipeline-handlers.bash b/.ci/pipeline-handlers.bash deleted file mode 100644 index 0c0c4400..00000000 --- a/.ci/pipeline-handlers.bash +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -function send_slack_webhook () { - if [[ -z "$webhook_url" ]]; then - echo "webhook_url unset" - return 1 - fi - if [[ -z "$message_content" ]]; then - echo "message_content unset" - return 1 - fi - cat > body.json < Date: Sat, 18 Oct 2025 19:12:24 -0700 Subject: [PATCH 10/25] Use the latest version of the push-docs action. (#93) --- .github/workflows/commit.yml | 2 +- .github/workflows/release.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/commit.yml b/.github/workflows/commit.yml index 8bd8afbd..b3f36b26 100644 --- a/.github/workflows/commit.yml +++ b/.github/workflows/commit.yml @@ -45,7 +45,7 @@ jobs: name: dist path: dist - name: Push docs - uses: opendp/tumult-tools/actions/push_docs@1c7b4b3cfa98ab75f3c5b0b91858c897a81629ad + uses: opendp/tumult-tools/actions/push_docs@df58d56705007d8b7ad1b66280df7465afa15a49 with: docs-repository: opendp/tumult-docs docs-repository-token: ${{ secrets.DOCS_REPO_PAT }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 72c51f06..37eee132 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -144,7 +144,7 @@ jobs: echo "MAJOR_VERSION=${BASH_REMATCH[1]}" >> $GITHUB_ENV echo "MINOR_VERSION=${BASH_REMATCH[2]}" >> $GITHUB_ENV - name: Push docs - uses: opendp/tumult-tools/actions/push_docs@1c7b4b3cfa98ab75f3c5b0b91858c897a81629ad + uses: opendp/tumult-tools/actions/push_docs@df58d56705007d8b7ad1b66280df7465afa15a49 with: docs-repository: opendp/tumult-docs docs-repository-token: ${{ secrets.DOCS_REPO_PAT }} From 1706945cb9ba26dee50ac5eaca1f104e95f4508e Mon Sep 17 00:00:00 2001 From: Ted Date: Sun, 19 Oct 2025 10:26:34 +0200 Subject: [PATCH 11/25] Make ReplaceInfinity work similarly to all other QueryExpr dataclasses (#92) * make ReplaceInfinity work similarly to all other QueryExpr dataclasses * no not those imports * also remove useless line --------- Co-authored-by: Damien Desfontaines --- src/tmlt/analytics/_query_expr.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/tmlt/analytics/_query_expr.py 
b/src/tmlt/analytics/_query_expr.py index dbcee359..6307bc9a 100644 --- a/src/tmlt/analytics/_query_expr.py +++ b/src/tmlt/analytics/_query_expr.py @@ -1194,7 +1194,7 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: return visitor.visit_replace_null_and_nan(self) -@dataclass(frozen=True, init=False, eq=True) +@dataclass(frozen=True) class ReplaceInfinity(QueryExpr): """Returns data with +inf and -inf expressions replaced by defaults.""" @@ -1211,22 +1211,19 @@ class ReplaceInfinity(QueryExpr): :class:`~.AnalyticsDefault` class variables). """ - def __init__( - self, child: QueryExpr, replace_with: FrozenDict = FrozenDict.from_dict({}) - ) -> None: + def __post_init__(self) -> None: """Checks arguments to constructor.""" - check_type(child, QueryExpr) - check_type(replace_with, FrozenDict) - check_type(dict(replace_with), Dict[str, Tuple[float, float]]) + check_type(self.child, QueryExpr) + check_type(self.replace_with, FrozenDict) + check_type(dict(self.replace_with), Dict[str, Tuple[float, float]]) # Ensure that the values in replace_with are tuples of floats updated_dict = {} - for col, val in replace_with.items(): + for col, val in self.replace_with.items(): updated_dict[col] = (float(val[0]), float(val[1])) # Subverting the frozen dataclass to update the replace_with attribute object.__setattr__(self, "replace_with", FrozenDict.from_dict(updated_dict)) - object.__setattr__(self, "child", child) def _validate(self, input_schema: Schema): """Validation checks for this QueryExpr.""" From 94d6732d011b92e4e8f1569392db92f99b469309 Mon Sep 17 00:00:00 2001 From: Ted Date: Sun, 19 Oct 2025 20:18:18 +0200 Subject: [PATCH 12/25] remove cleanup.py (#95) Co-authored-by: Damien Desfontaines --- src/tmlt/analytics/cleanup.py | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 src/tmlt/analytics/cleanup.py diff --git a/src/tmlt/analytics/cleanup.py b/src/tmlt/analytics/cleanup.py deleted file mode 100644 index 338df5c3..00000000 --- a/src/tmlt/analytics/cleanup.py +++ /dev/null @@ -1,17 +0,0 @@ -"""Cleanup functions for Analytics. - -@nodoc. -""" -# pylint: disable=unused-import -# SPDX-License-Identifier: Apache-2.0 -# Copyright Tumult Labs 2025 - -import warnings - -from tmlt.analytics.utils import cleanup, remove_all_temp_tables - -warnings.warn( - "The contents of the cleanup module have been moved to tmlt.analytics.utils.", - DeprecationWarning, - stacklevel=2, -) From 39ba1aeba72d8105542b90011d6442ea9b73f884 Mon Sep 17 00:00:00 2001 From: Ted Date: Sun, 19 Oct 2025 22:24:36 +0200 Subject: [PATCH 13/25] remove flat_map.max_num_rows (#96) * remove flat_map.max_num_rows * lint (and by that I mean fix tests) --------- Co-authored-by: Damien Desfontaines --- src/tmlt/analytics/query_builder.py | 12 ------------ test/system/session/rows/test_add_max_rows.py | 2 +- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/src/tmlt/analytics/query_builder.py b/src/tmlt/analytics/query_builder.py index 8eb802c8..15b053eb 100644 --- a/src/tmlt/analytics/query_builder.py +++ b/src/tmlt/analytics/query_builder.py @@ -1247,7 +1247,6 @@ def flat_map( augment: bool = False, grouping: bool = False, max_rows: Optional[int] = None, - max_num_rows: Optional[int] = None, ) -> "QueryBuilder": """Applies a mapping function to each row, returning zero or more rows. @@ -1352,7 +1351,6 @@ def flat_map( max_rows: The enforced limit on the number of rows from each ``f(row)``. If ``f`` produces more rows than this, only the first ``max_rows`` rows will be in the output. 
- max_num_rows: Deprecated synonym for ``max_rows``. """ grouping_column: Optional[str] if grouping: @@ -1365,16 +1363,6 @@ def flat_map( (grouping_column,) = new_column_types else: grouping_column = None - if max_num_rows is not None: - if max_rows is not None: - raise ValueError( - "You must use either max_rows or max_num_rows, not both" - ) - warnings.warn( - "max_num_rows is deprecated and will be removed in a future release", - DeprecationWarning, - ) - max_rows = max_num_rows self._query_expr = FlatMap( child=self._query_expr, f=f, diff --git a/test/system/session/rows/test_add_max_rows.py b/test/system/session/rows/test_add_max_rows.py index 252cc6ba..cfee3af1 100644 --- a/test/system/session/rows/test_add_max_rows.py +++ b/test/system/session/rows/test_add_max_rows.py @@ -402,7 +402,7 @@ def duplicate_rows(_: Row) -> List[Row]: duplicate_rows, new_column_types={"C": "VARCHAR"}, augment=True, - max_num_rows=2, + max_rows=2, ) .get_groups(["A", "B", "C"]) ) From 642ec90213d958555fb10c1dd7b1e734b553a372 Mon Sep 17 00:00:00 2001 From: Ted Date: Thu, 23 Oct 2025 23:45:46 +0200 Subject: [PATCH 14/25] remove EnforceConstraints.options (#97) Co-authored-by: Damien Desfontaines --- src/tmlt/analytics/_query_expr.py | 5 -- .../_base_transformation_visitor.py | 4 +- src/tmlt/analytics/constraints/_truncation.py | 12 +-- src/tmlt/analytics/query_builder.py | 4 +- .../test_constraints.py | 75 ------------------- test/unit/test_query_expression_visitor.py | 7 +- 6 files changed, 9 insertions(+), 98 deletions(-) diff --git a/src/tmlt/analytics/_query_expr.py b/src/tmlt/analytics/_query_expr.py index 6307bc9a..8314a540 100644 --- a/src/tmlt/analytics/_query_expr.py +++ b/src/tmlt/analytics/_query_expr.py @@ -1468,11 +1468,6 @@ class EnforceConstraint(QueryExpr): """The QueryExpr to which the constraint will be applied.""" constraint: Constraint """A constraint to be enforced.""" - options: FrozenDict = FrozenDict.from_dict({}) - """Options to be used when enforcing the constraint. - - Appropriate values here vary depending on the constraint. 
These options are - to support advanced use cases, and generally should not be used.""" def _validate(self, input_schema: Schema): """Validation checks for this QueryExpr.""" diff --git a/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py b/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py index 1180d04c..9da9c97f 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py +++ b/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py @@ -1556,9 +1556,7 @@ def visit_enforce_constraint(self, expr: EnforceConstraint) -> Output: expr.child ) # pylint: disable=protected-access - transformation, ref = expr.constraint._enforce( - child_transformation, child_ref, *expr.options - ) + transformation, ref = expr.constraint._enforce(child_transformation, child_ref) # pylint: enable=protected-access return self.Output( transformation, diff --git a/src/tmlt/analytics/constraints/_truncation.py b/src/tmlt/analytics/constraints/_truncation.py index bd312501..0d7b3c56 100644 --- a/src/tmlt/analytics/constraints/_truncation.py +++ b/src/tmlt/analytics/constraints/_truncation.py @@ -146,7 +146,7 @@ def _enforce( else: - def gen_tranformation_ark(parent_domain, parent_metric, target): + def gen_transformation_ark(parent_domain, parent_metric, target): return LimitRowsPerGroupValue( parent_domain, parent_metric, child_ref.identifier, target, self.max ) @@ -154,7 +154,7 @@ def gen_tranformation_ark(parent_domain, parent_metric, target): return generate_nested_transformation( child_transformation, child_ref.parent, - {AddRemoveKeys: gen_tranformation_ark}, + {AddRemoveKeys: gen_transformation_ark}, ) @@ -254,7 +254,7 @@ def _enforce( else: - def gen_tranformation_ark(parent_domain, parent_metric, target): + def gen_transformation_ark(parent_domain, parent_metric, target): return LimitKeysPerGroupValue( parent_domain, parent_metric, @@ -267,7 +267,7 @@ def gen_tranformation_ark(parent_domain, parent_metric, target): return generate_nested_transformation( child_transformation, child_ref.parent, - {AddRemoveKeys: gen_tranformation_ark}, + {AddRemoveKeys: gen_transformation_ark}, ) @@ -358,7 +358,7 @@ def _enforce( " with the AddRowsWithID protected change." ) - def gen_tranformation_ark(parent_domain, parent_metric, target): + def gen_transformation_ark(parent_domain, parent_metric, target): return LimitRowsPerKeyPerGroupValue( parent_domain, parent_metric, @@ -371,5 +371,5 @@ def gen_tranformation_ark(parent_domain, parent_metric, target): return generate_nested_transformation( child_transformation, child_ref.parent, - {AddRemoveKeys: gen_tranformation_ark}, + {AddRemoveKeys: gen_transformation_ark}, ) diff --git a/src/tmlt/analytics/query_builder.py b/src/tmlt/analytics/query_builder.py index 15b053eb..b3c1976a 100644 --- a/src/tmlt/analytics/query_builder.py +++ b/src/tmlt/analytics/query_builder.py @@ -1722,9 +1722,7 @@ def enforce(self, constraint: Constraint) -> "QueryBuilder": Args: constraint: The constraint to enforce. 
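
        Example (an illustrative sketch; it assumes a Session with a private
        table named ``"private"`` that uses the ``AddRowsWithID`` protected
        change)::

            from tmlt.analytics import MaxRowsPerID, QueryBuilder

            query = QueryBuilder("private").enforce(MaxRowsPerID(5)).count()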
""" - self._query_expr = EnforceConstraint( - self._query_expr, constraint, options=FrozenDict.from_dict({}) - ) + self._query_expr = EnforceConstraint(self._query_expr, constraint) return self def get_groups(self, columns: Optional[List[str]] = None) -> Query: diff --git a/test/unit/query_expr_compiler/transformation_visitor/test_constraints.py b/test/unit/query_expr_compiler/transformation_visitor/test_constraints.py index 37d7118b..e52a7868 100644 --- a/test/unit/query_expr_compiler/transformation_visitor/test_constraints.py +++ b/test/unit/query_expr_compiler/transformation_visitor/test_constraints.py @@ -8,7 +8,6 @@ import pandas as pd import pytest from pyspark.sql import DataFrame -from tmlt.core.metrics import SymmetricDifference from tmlt.analytics import MaxGroupsPerID, MaxRowsPerGroupPerID, MaxRowsPerID from tmlt.analytics._catalog import Catalog @@ -16,9 +15,7 @@ from tmlt.analytics._query_expr_compiler._transformation_visitor import ( TransformationVisitor, ) -from tmlt.analytics._schema import FrozenDict from tmlt.analytics._table_identifier import Identifier -from tmlt.analytics._transformation_utils import get_table_from_ref from .conftest import TestTransformationVisitor @@ -110,75 +107,3 @@ def test_max_rows_per_group_per_id(self, constraint_max: int, grouping_col: str) ) self._test_is_subset(input_df, result_df) - - @pytest.mark.parametrize("constraint_max", [1, 2, 3]) - def test_l1_update_metric(self, constraint_max: int): - """Test L1 truncation with updating metric.""" - constraint = MaxRowsPerID(constraint_max) - query = EnforceConstraint( - PrivateSource("ids_duplicates"), - constraint, - options=FrozenDict.from_dict({"update_metric": True}), - ) - transformation, ref, constraints = query.accept(self.visitor) - assert len(constraints) == 1 - assert constraints[0] == constraint - assert ( - get_table_from_ref(transformation, ref).output_metric - == SymmetricDifference() - ) - - input_df: pd.DataFrame = self.dataframes["ids_duplicates"].toPandas() - result_df = self._get_result(transformation, ref) - - # Check that each ID doesn't appear more times than the constraint bound. - rows_per_id = result_df.groupby("id")["id"].count() - assert all( - rows_per_id <= constraint_max - ), f"MaxRowsPerID constraint violated, counts were:\n{str(rows_per_id)}" - - self._test_is_subset(input_df, result_df) - - @pytest.mark.parametrize( - "group_max,row_max,grouping_col", [(1, 1, "St"), (1, 2, "St"), (2, 1, "St")] - ) - def test_l0_linf_update_metric( - self, group_max: int, row_max: int, grouping_col: str - ): - """Test L0 + L-inf truncation with updating metric.""" - query = EnforceConstraint( - EnforceConstraint( - PrivateSource("ids_duplicates"), - MaxGroupsPerID(grouping_col, group_max), - options=FrozenDict.from_dict({"update_metric": True}), - ), - MaxRowsPerGroupPerID(grouping_col, row_max), - options=FrozenDict.from_dict({"update_metric": True}), - ) - transformation, ref, constraints = query.accept(self.visitor) - assert len(constraints) == 2 - assert ( - get_table_from_ref(transformation, ref).output_metric - == SymmetricDifference() - ) - - input_df: pd.DataFrame = self.dataframes["ids_duplicates"].toPandas() - result_df = self._get_result(transformation, ref) - - # Check that each no ID has more groups associated with it than the - # truncation bound. 
- groups_per_id = result_df.groupby("id").nunique()[grouping_col] - assert all( - groups_per_id <= group_max - ), f"MaxGroupsPerID constraint violated, counts were:\n{str(groups_per_id)}" - - # Check that each (ID, grouping_column) pair doesn't appear more - # times than the constraint bound. - rows_per_group_per_id = result_df.value_counts(["id", grouping_col]) - assert all( - rows_per_group_per_id <= row_max - ), "MaxRowsPerGroupPerID constraint violated, counts were:\n" + str( - rows_per_group_per_id - ) - - self._test_is_subset(input_df, result_df) diff --git a/test/unit/test_query_expression_visitor.py b/test/unit/test_query_expression_visitor.py index b48ee30f..466dc4e3 100644 --- a/test/unit/test_query_expression_visitor.py +++ b/test/unit/test_query_expression_visitor.py @@ -155,12 +155,7 @@ def visit_suppress_aggregates(self, expr): ), (DropInfinity(PrivateSource("P"), tuple("column")), "DropInfinity"), (DropNullAndNan(PrivateSource("P"), tuple("column")), "DropNullAndNan"), - ( - EnforceConstraint( - PrivateSource("P"), MaxRowsPerID(5), FrozenDict.from_dict({}) - ), - "EnforceConstraint", - ), + (EnforceConstraint(PrivateSource("P"), MaxRowsPerID(5)), "EnforceConstraint"), (GetGroups(PrivateSource("P"), tuple("column")), "GetGroups"), ( GetBounds(PrivateSource("P"), KeySet.from_dict({}), "A", "lower", "upper"), From 440dfc9207a6e000786623e737d22bb495f8538a Mon Sep 17 00:00:00 2001 From: Ted Date: Fri, 24 Oct 2025 12:15:48 +0200 Subject: [PATCH 15/25] Make noise selection logic a rewriting rule (#94) * main code * new files, not finished * tests! * compilation ordering & comments * arg I have to do kinda gross things * review comments * review comments, review comments * note to self: when adding tests, ensure they pass * review comments --------- Co-authored-by: Damien Desfontaines --- src/tmlt/analytics/_query_expr.py | 117 ++-- .../_base_measurement_visitor.py | 157 +---- .../_query_expr_compiler/_compiler.py | 9 + .../_query_expr_compiler/_rewrite_rules.py | 203 ++++++ .../test_measurement_visitor.py | 604 +----------------- .../query_expr_compiler/test_rewrite_rules.py | 393 ++++++++++++ 6 files changed, 711 insertions(+), 772 deletions(-) create mode 100644 src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py create mode 100644 test/unit/query_expr_compiler/test_rewrite_rules.py diff --git a/src/tmlt/analytics/_query_expr.py b/src/tmlt/analytics/_query_expr.py index 8314a540..d5c58323 100644 --- a/src/tmlt/analytics/_query_expr.py +++ b/src/tmlt/analytics/_query_expr.py @@ -21,6 +21,7 @@ from pyspark.sql import DataFrame, SparkSession from tmlt.core.domains.spark_domains import SparkDataFrameDomain +from tmlt.core.measurements.aggregations import NoiseMechanism from tmlt.core.utils.join import domain_after_join from typeguard import check_type @@ -194,6 +195,18 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: raise NotImplementedError() +@dataclass(frozen=True) +class SingleChildQueryExpr(QueryExpr): + """A QueryExpr that has a single child. + + This is used in the compilation step, to make it easier for rewrite rules to + automatically recurse along the QueryExpr tree. 
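+
+    For example, ``Filter`` is a ``SingleChildQueryExpr``, so a rewrite rule
+    can rebuild it with ``dataclasses.replace(expr, child=new_child)`` without
+    knowing the concrete subtype.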
+ """ + + child: QueryExpr + """The QueryExpr used to generate the input table to this QueryExpr.""" + + @dataclass(frozen=True) class PrivateSource(QueryExpr): """Loads the private source.""" @@ -233,12 +246,9 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class GetGroups(QueryExpr): +class GetGroups(SingleChildQueryExpr): """Returns groups based on the geometric partition selection for these columns.""" - child: QueryExpr - """The QueryExpr to get groups for.""" - columns: Optional[Tuple[str, ...]] = None """The columns used for geometric partition selection. @@ -281,11 +291,9 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class GetBounds(QueryExpr): +class GetBounds(SingleChildQueryExpr): """Returns approximate upper and lower bounds of a column.""" - child: QueryExpr - """The QueryExpr to get groups for.""" groupby_keys: Union[KeySet, Tuple[str, ...]] """The keys, or columns list to collect keys from, to be grouped on.""" measure_column: str @@ -317,13 +325,10 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class Rename(QueryExpr): +class Rename(SingleChildQueryExpr): """Returns the dataframe with columns renamed.""" - child: QueryExpr - """The QueryExpr to apply Rename to.""" column_mapper: FrozenDict - """The mapping of old column names to new column names. This mapping can contain all column names or just a subset. If it @@ -385,11 +390,9 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class Filter(QueryExpr): +class Filter(SingleChildQueryExpr): """Returns the subset of the rows that satisfy the condition.""" - child: QueryExpr - """The QueryExpr to filter.""" condition: str """A string of SQL expression specifying the filter to apply to the data. @@ -425,11 +428,9 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class Select(QueryExpr): +class Select(SingleChildQueryExpr): """Returns a subset of the columns.""" - child: QueryExpr - """The QueryExpr to apply the select on.""" columns: Tuple[str, ...] 
"""The columns to select.""" @@ -476,11 +477,9 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class Map(QueryExpr): +class Map(SingleChildQueryExpr): """Applies a map function to each row of a relation.""" - child: QueryExpr - """The QueryExpr to apply the map on.""" f: Callable[[Row], Row] """The map function.""" schema_new_columns: Schema @@ -565,11 +564,9 @@ def __eq__(self, other: object) -> bool: @dataclass(frozen=True) -class FlatMap(QueryExpr): +class FlatMap(SingleChildQueryExpr): """Applies a flat map function to each row of a relation.""" - child: QueryExpr - """The QueryExpr to apply the flat map on.""" f: Callable[[Row], List[Row]] """The flat map function.""" schema_new_columns: Schema @@ -692,11 +689,9 @@ def __eq__(self, other: object) -> bool: @dataclass(frozen=True) -class FlatMapByID(QueryExpr): +class FlatMapByID(SingleChildQueryExpr): """Applies a flat map function to each group of rows with a common ID.""" - child: QueryExpr - """The QueryExpr to apply the flat map on.""" f: Callable[[List[Row]], List[Row]] """The flat map function.""" schema_new_columns: Schema @@ -954,11 +949,9 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class JoinPublic(QueryExpr): +class JoinPublic(SingleChildQueryExpr): """Returns the join of a private and public table.""" - child: QueryExpr - """The QueryExpr to join with public_df.""" public_table: Union[DataFrame, str] """A DataFrame or public source to join with.""" join_columns: Optional[Tuple[str, ...]] = None @@ -1094,7 +1087,7 @@ class AnalyticsDefault: @dataclass(frozen=True) -class ReplaceNullAndNan(QueryExpr): +class ReplaceNullAndNan(SingleChildQueryExpr): """Returns data with null and NaN expressions replaced by a default. .. warning:: @@ -1104,9 +1097,6 @@ class ReplaceNullAndNan(QueryExpr): that contains null values. """ - child: QueryExpr - """The QueryExpr to replace null/NaN values in.""" - replace_with: FrozenDict = FrozenDict.from_dict({}) """New values to replace with, by column. @@ -1195,12 +1185,9 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class ReplaceInfinity(QueryExpr): +class ReplaceInfinity(SingleChildQueryExpr): """Returns data with +inf and -inf expressions replaced by defaults.""" - child: QueryExpr - """The QueryExpr to replace +inf and -inf values in.""" - replace_with: FrozenDict = FrozenDict.from_dict({}) """New values to replace with, by column. The first value for each column will be used to replace -infinity, and the second value will be used to @@ -1290,7 +1277,7 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class DropNullAndNan(QueryExpr): +class DropNullAndNan(SingleChildQueryExpr): """Returns data with rows that contain null or NaN value dropped. .. warning:: @@ -1300,9 +1287,6 @@ class DropNullAndNan(QueryExpr): that contains null values. """ - child: QueryExpr - """The QueryExpr in which to drop nulls/NaNs.""" - columns: Tuple[str, ...] = tuple() """Columns in which to look for nulls and NaNs. @@ -1377,12 +1361,9 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class DropInfinity(QueryExpr): +class DropInfinity(SingleChildQueryExpr): """Returns data with rows that contain +inf/-inf dropped.""" - child: QueryExpr - """The QueryExpr in which to drop +inf/-inf.""" - columns: Tuple[str, ...] = tuple() """Columns in which to look for and infinite values. 
@@ -1461,11 +1442,9 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class EnforceConstraint(QueryExpr): +class EnforceConstraint(SingleChildQueryExpr): """Enforces a constraint on the data.""" - child: QueryExpr - """The QueryExpr to which the constraint will be applied.""" constraint: Constraint """A constraint to be enforced.""" @@ -1660,11 +1639,9 @@ def _schema_for_groupby( @dataclass(frozen=True) -class GroupByCount(QueryExpr): +class GroupByCount(SingleChildQueryExpr): """Returns the count of each combination of the groupby domains.""" - child: QueryExpr - """The QueryExpr to measure.""" groupby_keys: Union[KeySet, Tuple[str, ...]] """The keys, or columns list to collect keys from, to be grouped on.""" output_column: str = "count" @@ -1675,6 +1652,8 @@ class GroupByCount(QueryExpr): By DEFAULT, the framework automatically selects an appropriate mechanism. """ + core_mechanism: Optional[NoiseMechanism] = None + """The Core mechanism used for this aggregation. Specified during compilation.""" def __post_init__(self): """Checks arguments to constructor.""" @@ -1697,11 +1676,9 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class GroupByCountDistinct(QueryExpr): +class GroupByCountDistinct(SingleChildQueryExpr): """Returns the count of distinct rows in each groupby domain value.""" - child: QueryExpr - """The QueryExpr to measure.""" groupby_keys: Union[KeySet, Tuple[str, ...]] """The keys, or columns list to collect keys from, to be grouped on.""" columns_to_count: Optional[Tuple[str, ...]] = None @@ -1716,6 +1693,8 @@ class GroupByCountDistinct(QueryExpr): By DEFAULT, the framework automatically selects an appropriate mechanism. """ + core_mechanism: Optional[NoiseMechanism] = None + """The Core mechanism used for this aggregation. Specified during compilation.""" def __post_init__(self): """Checks arguments to constructor.""" @@ -1739,7 +1718,7 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class GroupByQuantile(QueryExpr): +class GroupByQuantile(SingleChildQueryExpr): """Returns the quantile of a column for each combination of the groupby domains. If the column to be measured contains null, NaN, or positive or negative infinity, @@ -1748,8 +1727,6 @@ class GroupByQuantile(QueryExpr): calculated. """ - child: QueryExpr - """The QueryExpr to measure.""" groupby_keys: Union[KeySet, Tuple[str, ...]] """The keys, or columns list to collect keys from, to be grouped on.""" measure_column: str @@ -1806,7 +1783,7 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class GroupByBoundedSum(QueryExpr): +class GroupByBoundedSum(SingleChildQueryExpr): """Returns the bounded sum of a column for each combination of groupby domains. If the column to be measured contains null, NaN, or positive or negative infinity, @@ -1815,8 +1792,6 @@ class GroupByBoundedSum(QueryExpr): calculated. """ - child: QueryExpr - """The QueryExpr to measure.""" groupby_keys: Union[KeySet, Tuple[str, ...]] """The keys, or columns list to collect keys from, to be grouped on.""" measure_column: str @@ -1837,6 +1812,8 @@ class GroupByBoundedSum(QueryExpr): By DEFAULT, the framework automatically selects an appropriate mechanism. """ + core_mechanism: Optional[NoiseMechanism] = None + """The Core mechanism used for this aggregation. 
Specified during compilation.""" def __post_init__(self): """Checks arguments to constructor.""" @@ -1873,7 +1850,7 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class GroupByBoundedAverage(QueryExpr): +class GroupByBoundedAverage(SingleChildQueryExpr): """Returns bounded average of a column for each combination of groupby domains. If the column to be measured contains null, NaN, or positive or negative infinity, @@ -1882,8 +1859,6 @@ class GroupByBoundedAverage(QueryExpr): calculated. """ - child: QueryExpr - """The QueryExpr to measure.""" groupby_keys: Union[KeySet, Tuple[str, ...]] """The keys, or columns list to collect keys from, to be grouped on.""" measure_column: str @@ -1904,6 +1879,8 @@ class GroupByBoundedAverage(QueryExpr): By DEFAULT, the framework automatically selects an appropriate mechanism. """ + core_mechanism: Optional[NoiseMechanism] = None + """The Core mechanism used for this aggregation. Specified during compilation.""" def __post_init__(self): """Checks arguments to constructor.""" @@ -1940,7 +1917,7 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class GroupByBoundedVariance(QueryExpr): +class GroupByBoundedVariance(SingleChildQueryExpr): """Returns bounded variance of a column for each combination of groupby domains. If the column to be measured contains null, NaN, or positive or negative infinity, @@ -1949,8 +1926,6 @@ class GroupByBoundedVariance(QueryExpr): calculated. """ - child: QueryExpr - """The QueryExpr to measure.""" groupby_keys: Union[KeySet, Tuple[str, ...]] """The keys, or columns list to collect keys from, to be grouped on.""" measure_column: str @@ -1971,6 +1946,8 @@ class GroupByBoundedVariance(QueryExpr): By DEFAULT, the framework automatically selects an appropriate mechanism. """ + core_mechanism: Optional[NoiseMechanism] = None + """The Core mechanism used for this aggregation. Specified during compilation.""" def __post_init__(self): """Checks arguments to constructor.""" @@ -2007,7 +1984,7 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class GroupByBoundedSTDEV(QueryExpr): +class GroupByBoundedSTDEV(SingleChildQueryExpr): """Returns bounded stdev of a column for each combination of groupby domains. If the column to be measured contains null, NaN, or positive or negative infinity, @@ -2016,8 +1993,6 @@ class GroupByBoundedSTDEV(QueryExpr): standard deviation is calculated. """ - child: QueryExpr - """The QueryExpr to measure.""" groupby_keys: Union[KeySet, Tuple[str, ...]] """The keys, or columns list to collect keys from, to be grouped on.""" measure_column: str @@ -2038,6 +2013,8 @@ class GroupByBoundedSTDEV(QueryExpr): By DEFAULT, the framework automatically selects an appropriate mechanism. """ + core_mechanism: Optional[NoiseMechanism] = None + """The Core mechanism used for this aggregation. 
Specified during compilation.""" def __post_init__(self): """Checks arguments to constructor.""" @@ -2075,7 +2052,7 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class SuppressAggregates(QueryExpr): +class SuppressAggregates(SingleChildQueryExpr): """Remove all counts that are less than the threshold.""" child: GroupByCount diff --git a/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py b/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py index 4849e4bc..26427d02 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py +++ b/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py @@ -120,6 +120,24 @@ ) +def _get_core_mechanism( + query: Union[ + GroupByBoundedAverage, + GroupByBoundedSTDEV, + GroupByBoundedSum, + GroupByBoundedVariance, + GroupByCount, + GroupByCountDistinct, + ] +) -> NoiseMechanism: + if query.core_mechanism is None: + raise AnalyticsInternalError( + f"QueryExpr {query} should have had core_mechanism set during rewriting, " + "but it was None instead." + ) + return query.core_mechanism + + def _get_query_bounds( query: Union[ GroupByBoundedAverage, @@ -254,6 +272,7 @@ def _generate_constrained_count_distinct( groupby_keys=query.groupby_keys, output_column=query.output_column, mechanism=mechanism, + core_mechanism=query.core_mechanism, ) elif len(groupby_columns) == 1: # A groupby on exactly one column is performed; if that column has a @@ -277,6 +296,7 @@ def _generate_constrained_count_distinct( groupby_keys=query.groupby_keys, output_column=query.output_column, mechanism=mechanism, + core_mechanism=query.core_mechanism, ) # If none of the above cases are true, no optimization is possible. @@ -655,129 +675,6 @@ def _validate_approxDP_and_adjust_budget( else: raise AnalyticsInternalError(f"Unknown mechanism {mechanism}.") - def _pick_noise_for_count( - self, query: Union[GroupByCount, GroupByCountDistinct] - ) -> NoiseMechanism: - """Pick the noise mechanism to use for a count or count-distinct query.""" - requested_mechanism: NoiseMechanism - if query.mechanism in (CountMechanism.DEFAULT, CountDistinctMechanism.DEFAULT): - if isinstance(self.output_measure, (PureDP, ApproxDP)): - requested_mechanism = NoiseMechanism.LAPLACE - else: # output measure is RhoZCDP - requested_mechanism = NoiseMechanism.DISCRETE_GAUSSIAN - elif query.mechanism in ( - CountMechanism.LAPLACE, - CountDistinctMechanism.LAPLACE, - ): - requested_mechanism = NoiseMechanism.LAPLACE - elif query.mechanism in ( - CountMechanism.GAUSSIAN, - CountDistinctMechanism.GAUSSIAN, - ): - requested_mechanism = NoiseMechanism.DISCRETE_GAUSSIAN - else: - raise ValueError( - f"Did not recognize the mechanism name {query.mechanism}." - " Supported mechanisms are DEFAULT, LAPLACE, and GAUSSIAN." - ) - - if requested_mechanism == NoiseMechanism.LAPLACE: - return NoiseMechanism.GEOMETRIC - elif requested_mechanism == NoiseMechanism.DISCRETE_GAUSSIAN: - return NoiseMechanism.DISCRETE_GAUSSIAN - else: - # This should never happen - raise AnalyticsInternalError( - f"Did not recognize the requested mechanism {requested_mechanism}." - ) - - def _pick_noise_for_non_count( - self, - query: Union[ - GroupByBoundedAverage, - GroupByBoundedSTDEV, - GroupByBoundedSum, - GroupByBoundedVariance, - ], - ) -> NoiseMechanism: - """Pick the noise mechanism for non-count queries. - - GroupByQuantile and GetBounds only supports one noise mechanism, so it is not - included here. 
- """ - measure_column_type = query.child.schema(self.catalog)[ - query.measure_column - ].column_type - requested_mechanism: NoiseMechanism - if query.mechanism in ( - SumMechanism.DEFAULT, - AverageMechanism.DEFAULT, - VarianceMechanism.DEFAULT, - StdevMechanism.DEFAULT, - ): - requested_mechanism = ( - NoiseMechanism.LAPLACE - if isinstance(self.output_measure, (PureDP, ApproxDP)) - else NoiseMechanism.GAUSSIAN - ) - elif query.mechanism in ( - SumMechanism.LAPLACE, - AverageMechanism.LAPLACE, - VarianceMechanism.LAPLACE, - StdevMechanism.LAPLACE, - ): - requested_mechanism = NoiseMechanism.LAPLACE - elif query.mechanism in ( - SumMechanism.GAUSSIAN, - AverageMechanism.GAUSSIAN, - VarianceMechanism.GAUSSIAN, - StdevMechanism.GAUSSIAN, - ): - requested_mechanism = NoiseMechanism.GAUSSIAN - else: - raise ValueError( - f"Did not recognize requested mechanism {query.mechanism}." - " Supported mechanisms are DEFAULT, LAPLACE, and GAUSSIAN." - ) - - # If the query requested a Laplace measure ... - if requested_mechanism == NoiseMechanism.LAPLACE: - if measure_column_type == ColumnType.INTEGER: - return NoiseMechanism.GEOMETRIC - elif measure_column_type == ColumnType.DECIMAL: - return NoiseMechanism.LAPLACE - else: - raise AssertionError( - "Query's measure column should be numeric. This should" - " not happen and is probably a bug; please let us know" - " so we can fix it!" - ) - - # If the query requested a Gaussian measure... - elif requested_mechanism == NoiseMechanism.GAUSSIAN: - if isinstance(self.output_measure, PureDP): - raise ValueError( - "Gaussian noise is not supported under PureDP. " - "Please use RhoZCDP or another measure." - ) - if measure_column_type == ColumnType.DECIMAL: - return NoiseMechanism.GAUSSIAN - elif measure_column_type == ColumnType.INTEGER: - return NoiseMechanism.DISCRETE_GAUSSIAN - else: - raise AssertionError( - "Query's measure column should be numeric. This should" - " not happen and is probably a bug; please let us know" - " so we can fix it!" - ) - - # The requested_mechanism should be either LAPLACE or - # GAUSSIAN, so something has gone awry - else: - raise AnalyticsInternalError( - f"Did not recognize requested mechanism {requested_mechanism}." 
- ) - def _add_special_value_handling_to_query( self, query: Union[ @@ -1055,7 +952,7 @@ def visit_groupby_count(self, expr: GroupByCount) -> Tuple[Measurement, NoiseInf self.adjusted_budget ) - mechanism = self._pick_noise_for_count(expr) + mechanism = _get_core_mechanism(expr) child_transformation, child_ref = self._truncate_table( *self._visit_child_transformation(expr.child, mechanism), grouping_columns=groupby_cols, @@ -1139,7 +1036,7 @@ def visit_groupby_count_distinct( self.adjusted_budget ) - mechanism = self._pick_noise_for_count(expr) + mechanism = _get_core_mechanism(expr) ( child_transformation, child_ref, @@ -1356,8 +1253,8 @@ def visit_groupby_bounded_sum( self.adjusted_budget ) - mechanism = self._pick_noise_for_non_count(expr) lower, upper = _get_query_bounds(expr) + mechanism = _get_core_mechanism(expr) child_transformation, child_ref = self._truncate_table( *self._visit_child_transformation(expr.child, mechanism), @@ -1453,10 +1350,10 @@ def visit_groupby_bounded_average( ) lower, upper = _get_query_bounds(expr) - mechanism = self._pick_noise_for_non_count(expr) + mechanism = _get_core_mechanism(expr) child_transformation, child_ref = self._truncate_table( - *self._visit_child_transformation(expr.child, self.default_mechanism), + *self._visit_child_transformation(expr.child, mechanism), grouping_columns=groupby_cols, ) transformation = get_table_from_ref(child_transformation, child_ref) @@ -1549,7 +1446,7 @@ def visit_groupby_bounded_variance( ) lower, upper = _get_query_bounds(expr) - mechanism = self._pick_noise_for_non_count(expr) + mechanism = _get_core_mechanism(expr) child_transformation, child_ref = self._truncate_table( *self._visit_child_transformation(expr.child, mechanism), @@ -1645,7 +1542,7 @@ def visit_groupby_bounded_stdev( ) lower, upper = _get_query_bounds(expr) - mechanism = self._pick_noise_for_non_count(expr) + mechanism = _get_core_mechanism(expr) child_transformation, child_ref = self._truncate_table( *self._visit_child_transformation(expr.child, mechanism), diff --git a/src/tmlt/analytics/_query_expr_compiler/_compiler.py b/src/tmlt/analytics/_query_expr_compiler/_compiler.py index 37475d60..fd90d946 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_compiler.py +++ b/src/tmlt/analytics/_query_expr_compiler/_compiler.py @@ -22,6 +22,7 @@ from tmlt.analytics._noise_info import NoiseInfo from tmlt.analytics._query_expr import QueryExpr from tmlt.analytics._query_expr_compiler._measurement_visitor import MeasurementVisitor +from tmlt.analytics._query_expr_compiler._rewrite_rules import CompilationInfo, rewrite from tmlt.analytics._query_expr_compiler._transformation_visitor import ( TransformationVisitor, ) @@ -139,6 +140,14 @@ def __call__( # Computing the schema validates that the query is well-formed. query.schema(catalog) + # Compilation happens in two stages: first, we apply rewrite rules... + compilation_info = CompilationInfo( + output_measure=self._output_measure, + catalog=catalog, + ) + query = rewrite(compilation_info, query) + + # ... then we visit the query. visitor = MeasurementVisitor( privacy_budget=privacy_budget, stability=stability, diff --git a/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py b/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py new file mode 100644 index 00000000..6a8e9006 --- /dev/null +++ b/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py @@ -0,0 +1,203 @@ +"""Rules for rewriting QueryExprs. 
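+
+As a sketch of the idea: each rule maps a QueryExpr to a QueryExpr. For
+instance, ``select_noise_mechanism(info)`` turns a ``GroupByCount`` with
+``mechanism=CountMechanism.DEFAULT`` into the same node with
+``core_mechanism=NoiseMechanism.GEOMETRIC`` under a ``PureDP`` budget.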
+ +These are executed at the beginning of the query compilation process, and each rewrite +rule corresponds to one compilation step. The rewritten QueryExpr is then visited by the +MeasurementVisitor to be converted to a Core measurement. +""" + +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass, replace +from functools import wraps +from typing import Callable, Union + +from tmlt.core.measurements.aggregations import NoiseMechanism +from tmlt.core.measures import ApproxDP, PureDP, RhoZCDP + +from tmlt.analytics import AnalyticsInternalError +from tmlt.analytics._catalog import Catalog +from tmlt.analytics._query_expr import ( + AverageMechanism, + CountDistinctMechanism, + CountMechanism, + GroupByBoundedAverage, + GroupByBoundedSTDEV, + GroupByBoundedSum, + GroupByBoundedVariance, + GroupByCount, + GroupByCountDistinct, + JoinPrivate, + PrivateSource, + QueryExpr, + SingleChildQueryExpr, + StdevMechanism, + SumMechanism, + SuppressAggregates, + VarianceMechanism, +) +from tmlt.analytics._schema import ColumnType + + +@dataclass(frozen=True) +class CompilationInfo: + """Contextual information used by rewrite rules during compilation.""" + + output_measure: Union[PureDP, ApproxDP, RhoZCDP] + """The output measure used by this query.""" + + catalog: Catalog + """The Catalog of the Session this query is executed on.""" + + +def depth_first( + func: Callable[[QueryExpr], QueryExpr] +) -> Callable[[QueryExpr], QueryExpr]: + """Recursively applies the given method to a QueryExpr, depth-first.""" + + @wraps(func) + def wrapped(expr: QueryExpr) -> QueryExpr: + if isinstance(expr, PrivateSource): + return func(expr) + if isinstance(expr, SuppressAggregates): + child = wrapped(expr.child) + if not isinstance(child, GroupByCount): + raise AnalyticsInternalError( + "Rewriting rule should have produced a QueryExpr of type " + "GroupByCount as a child for SuppressAggregates, got type " + f"{type(child).__qualname__} instead." + ) + return func(replace(expr, child=child)) + if isinstance(expr, SingleChildQueryExpr): + child = wrapped(expr.child) + return func(replace(expr, child=child)) + if isinstance(expr, JoinPrivate): + left = wrapped(expr.child) + right = wrapped(expr.right_operand_expr) + return func(replace(expr, child=left, right_operand_expr=right)) + else: + raise AnalyticsInternalError( + f"Unrecognized QueryExpr subtype {type(expr).__qualname__}." + ) + + return wrapped + + +def select_noise_mechanism(info: CompilationInfo) -> Callable[[QueryExpr], QueryExpr]: + """Changes the default noise type into a concrete noise type for aggregations.""" + + def select_noise_for_count( + info: CompilationInfo, expr: Union[GroupByCount, GroupByCountDistinct] + ) -> QueryExpr: + mechanism = expr.mechanism + if mechanism in (CountMechanism.DEFAULT, CountDistinctMechanism.DEFAULT): + core_mechanism = ( + NoiseMechanism.GEOMETRIC + if isinstance(info.output_measure, (PureDP, ApproxDP)) + else NoiseMechanism.DISCRETE_GAUSSIAN + ) + elif mechanism in (CountMechanism.LAPLACE, CountDistinctMechanism.LAPLACE): + core_mechanism = NoiseMechanism.GEOMETRIC + elif mechanism in ( + CountMechanism.GAUSSIAN, + CountDistinctMechanism.GAUSSIAN, + ): + if not isinstance(info.output_measure, RhoZCDP): + raise ValueError( + "Gaussian noise is only supported when using a RhoZCDP budget. " + "Use Laplace noise instead, or switch to RhoZCDP." + ) + core_mechanism = NoiseMechanism.DISCRETE_GAUSSIAN + else: + raise ValueError( + f"Did not recognize the mechanism name {mechanism}." 
+ " Supported mechanisms are DEFAULT, LAPLACE, and GAUSSIAN." + ) + return replace(expr, core_mechanism=core_mechanism) + + def select_noise_for_non_count( + info: CompilationInfo, + expr: Union[ + GroupByBoundedAverage, + GroupByBoundedSTDEV, + GroupByBoundedSum, + GroupByBoundedVariance, + ], + ) -> QueryExpr: + mechanism = expr.mechanism + # Distinguish between Laplace/Geometric or (Discrete) Gaussian. + # Assume floating-point output column type at first + if mechanism in ( + SumMechanism.DEFAULT, + AverageMechanism.DEFAULT, + VarianceMechanism.DEFAULT, + StdevMechanism.DEFAULT, + ): + core_mechanism = ( + NoiseMechanism.LAPLACE + if isinstance(info.output_measure, (PureDP, ApproxDP)) + else NoiseMechanism.GAUSSIAN + ) + elif mechanism in ( + SumMechanism.LAPLACE, + AverageMechanism.LAPLACE, + VarianceMechanism.LAPLACE, + StdevMechanism.LAPLACE, + ): + core_mechanism = NoiseMechanism.LAPLACE + elif mechanism in ( + SumMechanism.GAUSSIAN, + AverageMechanism.GAUSSIAN, + VarianceMechanism.GAUSSIAN, + StdevMechanism.GAUSSIAN, + ): + if not isinstance(info.output_measure, RhoZCDP): + raise ValueError( + "Gaussian noise is only supported when using a RhoZCDP budget. " + "Use Laplace noise instead, or switch to RhoZCDP." + ) + core_mechanism = NoiseMechanism.GAUSSIAN + else: + raise ValueError( + f"Did not recognize requested mechanism {mechanism}." + " Supported mechanisms are DEFAULT, LAPLACE, and GAUSSIAN." + ) + + # If the measure column type is integer, use integer noise distributions + schema = expr.child.schema(info.catalog) + measure_column_type = schema[expr.measure_column].column_type + if measure_column_type == ColumnType.INTEGER: + core_mechanism = ( + NoiseMechanism.GEOMETRIC + if core_mechanism == NoiseMechanism.LAPLACE + else NoiseMechanism.DISCRETE_GAUSSIAN + ) + + return replace(expr, core_mechanism=core_mechanism) + + @depth_first + def select_noise(expr: QueryExpr) -> QueryExpr: + if isinstance(expr, (GroupByCount, GroupByCountDistinct)): + return select_noise_for_count(info, expr) + if isinstance( + expr, + ( + GroupByBoundedAverage, + GroupByBoundedSTDEV, + GroupByBoundedSum, + GroupByBoundedVariance, + ), + ): + return select_noise_for_non_count(info, expr) + return expr + + return select_noise + + +def rewrite(info: CompilationInfo, expr: QueryExpr) -> QueryExpr: + """Rewrites the given QueryExpr into a QueryExpr that can be compiled.""" + rewrite_rules = [ + select_noise_mechanism(info), + ] + for rule in rewrite_rules: + expr = rule(expr) + return expr diff --git a/test/unit/query_expr_compiler/test_measurement_visitor.py b/test/unit/query_expr_compiler/test_measurement_visitor.py index 8b51779a..4d23dde5 100644 --- a/test/unit/query_expr_compiler/test_measurement_visitor.py +++ b/test/unit/query_expr_compiler/test_measurement_visitor.py @@ -1,6 +1,6 @@ """Tests for MeasurementVisitor.""" from test.conftest import assert_frame_equal_with_sort, create_empty_input -from typing import List, Optional, Union +from typing import List, Union from unittest.mock import patch import pandas as pd @@ -9,7 +9,6 @@ from pyspark.sql.types import LongType, StringType, StructField, StructType from tmlt.core.domains.collections import DictDomain from tmlt.core.domains.spark_domains import ( - SparkColumnDescriptor, SparkDataFrameDomain, SparkFloatColumnDescriptor, SparkIntegerColumnDescriptor, @@ -28,7 +27,6 @@ from tmlt.core.transformations.base import Transformation from tmlt.core.transformations.chaining import ChainTT from tmlt.core.utils.exact_number import ExactNumber -from 
tmlt.core.utils.type_utils import assert_never from tmlt.analytics import ( KeySet, @@ -347,575 +345,6 @@ def check_noise_info( _, noise_info = query.accept(self.pick_noise_visitor) assert noise_info == expected_noise_info - @pytest.mark.parametrize( - "query_mechanism,output_measure,expected_mechanism", - [ - (CountMechanism.DEFAULT, PureDP(), NoiseMechanism.GEOMETRIC), - (CountMechanism.DEFAULT, RhoZCDP(), NoiseMechanism.DISCRETE_GAUSSIAN), - (CountMechanism.LAPLACE, PureDP(), NoiseMechanism.GEOMETRIC), - (CountMechanism.LAPLACE, RhoZCDP(), NoiseMechanism.GEOMETRIC), - (CountMechanism.GAUSSIAN, PureDP(), NoiseMechanism.DISCRETE_GAUSSIAN), - (CountMechanism.GAUSSIAN, RhoZCDP(), NoiseMechanism.DISCRETE_GAUSSIAN), - ], - ) - def test_pick_noise_for_count( - self, - query_mechanism: CountMechanism, - output_measure: Union[PureDP, RhoZCDP], - expected_mechanism: NoiseMechanism, - ) -> None: - """Test _pick_noise_for_count for GroupByCount query expressions.""" - query = GroupByCount( - child=self.base_query, - groupby_keys=KeySet.from_dict({}), - mechanism=query_mechanism, - ) - self.pick_noise_visitor.output_measure = output_measure - # pylint: disable=protected-access - got_mechanism = self.pick_noise_visitor._pick_noise_for_count(query) - # pylint: enable=protected-access - assert got_mechanism == expected_mechanism - - @pytest.mark.parametrize( - "query_mechanism,output_measure,expected_mechanism", - [ - (CountDistinctMechanism.DEFAULT, PureDP(), NoiseMechanism.GEOMETRIC), - ( - CountDistinctMechanism.DEFAULT, - RhoZCDP(), - NoiseMechanism.DISCRETE_GAUSSIAN, - ), - (CountDistinctMechanism.LAPLACE, PureDP(), NoiseMechanism.GEOMETRIC), - (CountDistinctMechanism.LAPLACE, RhoZCDP(), NoiseMechanism.GEOMETRIC), - ( - CountDistinctMechanism.GAUSSIAN, - PureDP(), - NoiseMechanism.DISCRETE_GAUSSIAN, - ), - ( - CountDistinctMechanism.GAUSSIAN, - RhoZCDP(), - NoiseMechanism.DISCRETE_GAUSSIAN, - ), - ], - ) - def test_pick_noise_for_count_distinct( - self, - query_mechanism: CountDistinctMechanism, - output_measure: Union[PureDP, RhoZCDP], - expected_mechanism: NoiseMechanism, - ) -> None: - """Test _pick_noise_for_count for GroupByCountDistinct query expressions.""" - query = GroupByCountDistinct( - child=self.base_query, - groupby_keys=KeySet.from_dict({}), - mechanism=query_mechanism, - ) - self.pick_noise_visitor.output_measure = output_measure - # pylint: disable=protected-access - got_mechanism = self.pick_noise_visitor._pick_noise_for_count(query) - # pylint: enable=protected-access - assert got_mechanism == expected_mechanism - - @pytest.mark.parametrize( - "query_mechanism,output_measure,measure_column_type,expected_mechanism", - [ - ( - AverageMechanism.DEFAULT, - PureDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.GEOMETRIC, - ), - ( - AverageMechanism.DEFAULT, - PureDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.LAPLACE, - ), - ( - AverageMechanism.DEFAULT, - RhoZCDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.DISCRETE_GAUSSIAN, - ), - ( - AverageMechanism.DEFAULT, - RhoZCDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.GAUSSIAN, - ), - ( - AverageMechanism.LAPLACE, - PureDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.GEOMETRIC, - ), - ( - AverageMechanism.LAPLACE, - PureDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.LAPLACE, - ), - ( - AverageMechanism.LAPLACE, - RhoZCDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.GEOMETRIC, - ), - ( - AverageMechanism.LAPLACE, - RhoZCDP(), - SparkFloatColumnDescriptor(), - 
NoiseMechanism.LAPLACE, - ), - (AverageMechanism.GAUSSIAN, PureDP(), SparkIntegerColumnDescriptor(), None), - (AverageMechanism.GAUSSIAN, PureDP(), SparkFloatColumnDescriptor(), None), - ( - AverageMechanism.GAUSSIAN, - RhoZCDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.DISCRETE_GAUSSIAN, - ), - ( - AverageMechanism.GAUSSIAN, - RhoZCDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.GAUSSIAN, - ), - ], - ) - def test_pick_noise_for_average( - self, - query_mechanism: AverageMechanism, - output_measure: Union[PureDP, RhoZCDP], - measure_column_type: SparkColumnDescriptor, - # if expected_mechanism is None, this combination is not supported - expected_mechanism: Optional[NoiseMechanism], - ) -> None: - """Test _pick_noise_for_non_count for GroupByBoundedAverage query exprs.""" - if isinstance(measure_column_type, SparkIntegerColumnDescriptor): - measure_column = "B" - elif isinstance(measure_column_type, SparkFloatColumnDescriptor): - measure_column = "X" - else: - raise AssertionError("Unknown measure column type") - query = GroupByBoundedAverage( - child=self.base_query, - measure_column=measure_column, - low=0, - high=1, - mechanism=query_mechanism, - groupby_keys=KeySet.from_dict({}), - ) - self.pick_noise_visitor.output_measure = output_measure - # pylint: disable=protected-access - if expected_mechanism is not None: - got_mechanism = self.pick_noise_visitor._pick_noise_for_non_count(query) - assert got_mechanism == expected_mechanism - else: - with pytest.raises( - ValueError, - match=( - "Gaussian noise is not supported under PureDP. " - "Please use RhoZCDP or another measure." - ), - ): - self.pick_noise_visitor._pick_noise_for_non_count(query) - # pylint: enable=protected-access - - @pytest.mark.parametrize( - "query_mechanism,output_measure,measure_column_type,expected_mechanism", - [ - ( - SumMechanism.DEFAULT, - PureDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.GEOMETRIC, - ), - ( - SumMechanism.DEFAULT, - PureDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.LAPLACE, - ), - ( - SumMechanism.DEFAULT, - RhoZCDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.DISCRETE_GAUSSIAN, - ), - ( - SumMechanism.DEFAULT, - RhoZCDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.GAUSSIAN, - ), - ( - SumMechanism.LAPLACE, - PureDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.GEOMETRIC, - ), - ( - SumMechanism.LAPLACE, - PureDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.LAPLACE, - ), - ( - SumMechanism.LAPLACE, - RhoZCDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.GEOMETRIC, - ), - ( - SumMechanism.LAPLACE, - RhoZCDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.LAPLACE, - ), - (SumMechanism.GAUSSIAN, PureDP(), SparkIntegerColumnDescriptor(), None), - (SumMechanism.GAUSSIAN, PureDP(), SparkFloatColumnDescriptor(), None), - ( - SumMechanism.GAUSSIAN, - RhoZCDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.DISCRETE_GAUSSIAN, - ), - ( - SumMechanism.GAUSSIAN, - RhoZCDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.GAUSSIAN, - ), - ], - ) - def test_pick_noise_for_sum( - self, - query_mechanism: SumMechanism, - output_measure: Union[PureDP, RhoZCDP], - measure_column_type: SparkColumnDescriptor, - # if expected_mechanism is None, this combination is not supported - expected_mechanism: Optional[NoiseMechanism], - ) -> None: - """Test _pick_noise_for_non_count for GroupByBoundedSum query exprs.""" - if isinstance(measure_column_type, SparkFloatColumnDescriptor): - measure_column = "X" - elif 
isinstance(measure_column_type, SparkIntegerColumnDescriptor): - measure_column = "B" - else: - raise AssertionError("Unknown measure column type") - query = GroupByBoundedSum( - child=self.base_query, - measure_column=measure_column, - low=0, - high=1, - mechanism=query_mechanism, - groupby_keys=KeySet.from_dict({}), - ) - self.pick_noise_visitor.output_measure = output_measure - # pylint: disable=protected-access - if expected_mechanism is not None: - got_mechanism = self.pick_noise_visitor._pick_noise_for_non_count(query) - assert got_mechanism == expected_mechanism - else: - with pytest.raises( - ValueError, - match=( - "Gaussian noise is not supported under PureDP. " - "Please use RhoZCDP or another measure." - ), - ): - self.pick_noise_visitor._pick_noise_for_non_count(query) - # pylint: enable=protected-access - - @pytest.mark.parametrize( - "query_mechanism,output_measure,measure_column_type,expected_mechanism", - [ - ( - VarianceMechanism.DEFAULT, - PureDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.GEOMETRIC, - ), - ( - VarianceMechanism.DEFAULT, - PureDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.LAPLACE, - ), - ( - VarianceMechanism.DEFAULT, - RhoZCDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.DISCRETE_GAUSSIAN, - ), - ( - VarianceMechanism.DEFAULT, - RhoZCDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.GAUSSIAN, - ), - ( - VarianceMechanism.LAPLACE, - PureDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.GEOMETRIC, - ), - ( - VarianceMechanism.LAPLACE, - PureDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.LAPLACE, - ), - ( - VarianceMechanism.LAPLACE, - RhoZCDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.GEOMETRIC, - ), - ( - VarianceMechanism.LAPLACE, - RhoZCDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.LAPLACE, - ), - ( - VarianceMechanism.GAUSSIAN, - PureDP(), - SparkIntegerColumnDescriptor(), - None, - ), - (VarianceMechanism.GAUSSIAN, PureDP(), SparkFloatColumnDescriptor(), None), - ( - VarianceMechanism.GAUSSIAN, - RhoZCDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.DISCRETE_GAUSSIAN, - ), - ( - VarianceMechanism.GAUSSIAN, - RhoZCDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.GAUSSIAN, - ), - ], - ) - def test_pick_noise_for_variance( - self, - query_mechanism: VarianceMechanism, - output_measure: Union[PureDP, RhoZCDP], - measure_column_type: SparkColumnDescriptor, - # if expected_mechanism is None, this combination is not supported - expected_mechanism: Optional[NoiseMechanism], - ) -> None: - """Test _pick_noise_for_non_count for GroupByBoundedVariance query exprs.""" - if isinstance(measure_column_type, SparkFloatColumnDescriptor): - measure_column = "X" - elif isinstance(measure_column_type, SparkIntegerColumnDescriptor): - measure_column = "B" - else: - raise AssertionError("Unknown measure column type") - query = GroupByBoundedVariance( - child=self.base_query, - measure_column=measure_column, - low=0, - high=1, - mechanism=query_mechanism, - groupby_keys=KeySet.from_dict({}), - ) - self.pick_noise_visitor.output_measure = output_measure - # pylint: disable=protected-access - if expected_mechanism is not None: - got_mechanism = self.pick_noise_visitor._pick_noise_for_non_count(query) - assert got_mechanism == expected_mechanism - else: - with pytest.raises( - ValueError, - match=( - "Gaussian noise is not supported under PureDP. " - "Please use RhoZCDP or another measure." 
- ), - ): - self.pick_noise_visitor._pick_noise_for_non_count(query) - # pylint: enable=protected-access - - @pytest.mark.parametrize( - "query_mechanism,output_measure,measure_column_type,expected_mechanism", - [ - ( - StdevMechanism.DEFAULT, - PureDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.GEOMETRIC, - ), - ( - StdevMechanism.DEFAULT, - PureDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.LAPLACE, - ), - ( - StdevMechanism.DEFAULT, - RhoZCDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.DISCRETE_GAUSSIAN, - ), - ( - StdevMechanism.DEFAULT, - RhoZCDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.GAUSSIAN, - ), - ( - StdevMechanism.LAPLACE, - PureDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.GEOMETRIC, - ), - ( - StdevMechanism.LAPLACE, - PureDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.LAPLACE, - ), - ( - StdevMechanism.LAPLACE, - RhoZCDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.GEOMETRIC, - ), - ( - StdevMechanism.LAPLACE, - RhoZCDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.LAPLACE, - ), - (StdevMechanism.GAUSSIAN, PureDP(), SparkIntegerColumnDescriptor(), None), - (StdevMechanism.GAUSSIAN, PureDP(), SparkFloatColumnDescriptor(), None), - ( - StdevMechanism.GAUSSIAN, - RhoZCDP(), - SparkIntegerColumnDescriptor(), - NoiseMechanism.DISCRETE_GAUSSIAN, - ), - ( - StdevMechanism.GAUSSIAN, - RhoZCDP(), - SparkFloatColumnDescriptor(), - NoiseMechanism.GAUSSIAN, - ), - ], - ) - def test_pick_noise_for_stdev( - self, - query_mechanism: StdevMechanism, - output_measure: Union[PureDP, RhoZCDP], - measure_column_type: SparkColumnDescriptor, - # if expected_mechanism is None, this combination is not supported - expected_mechanism: Optional[NoiseMechanism], - ) -> None: - """Test _pick_noise_for_non_count for GroupByBoundedSTDEV query exprs.""" - if isinstance(measure_column_type, SparkFloatColumnDescriptor): - measure_column = "X" - elif isinstance(measure_column_type, SparkIntegerColumnDescriptor): - measure_column = "B" - else: - raise AssertionError("Unknown measure column type") - query = GroupByBoundedSTDEV( - child=self.base_query, - measure_column=measure_column, - low=0, - high=1, - mechanism=query_mechanism, - groupby_keys=KeySet.from_dict({}), - ) - self.pick_noise_visitor.output_measure = output_measure - # pylint: disable=protected-access - if expected_mechanism is not None: - got_mechanism = self.pick_noise_visitor._pick_noise_for_non_count(query) - assert got_mechanism == expected_mechanism - else: - with pytest.raises( - ValueError, - match=( - "Gaussian noise is not supported under PureDP. " - "Please use RhoZCDP or another measure." - ), - ): - self.pick_noise_visitor._pick_noise_for_non_count(query) - # pylint: enable=protected-access - - @pytest.mark.parametrize( - "mechanism", - [ - (AverageMechanism.LAPLACE), - (StdevMechanism.LAPLACE), - (SumMechanism.LAPLACE), - (VarianceMechanism.LAPLACE), - ], - ) - def test_pick_noise_invalid_column( - self, - mechanism: Union[ - AverageMechanism, StdevMechanism, SumMechanism, VarianceMechanism - ], - ) -> None: - """Test _pick_noise_for_non_count with a non-numeric column. - - This only tests Laplace noise. 
- """ - query: Union[ - GroupByBoundedAverage, - GroupByBoundedSTDEV, - GroupByBoundedSum, - GroupByBoundedVariance, - ] - if isinstance(mechanism, AverageMechanism): - query = GroupByBoundedAverage( - child=self.base_query, - measure_column="A", - low=0, - high=1, - mechanism=mechanism, - groupby_keys=KeySet.from_dict({}), - ) - elif isinstance(mechanism, StdevMechanism): - query = GroupByBoundedSTDEV( - child=self.base_query, - measure_column="A", - low=0, - high=1, - mechanism=mechanism, - groupby_keys=KeySet.from_dict({}), - ) - elif isinstance(mechanism, SumMechanism): - query = GroupByBoundedSum( - child=self.base_query, - measure_column="A", - low=0, - high=1, - mechanism=mechanism, - groupby_keys=KeySet.from_dict({}), - ) - elif isinstance(mechanism, VarianceMechanism): - query = GroupByBoundedVariance( - child=self.base_query, - measure_column="A", - low=0, - high=1, - mechanism=mechanism, - groupby_keys=KeySet.from_dict({}), - ) - else: - assert_never(mechanism) - with pytest.raises( - AssertionError, match="Query's measure column should be numeric." - ): - # pylint: disable=protected-access - self.visitor._pick_noise_for_non_count(query) - # pylint: enable=protected-access - def test_validate_measurement(self): """Test _validate_measurement.""" with patch( @@ -972,6 +401,7 @@ def _check_measurement(self, measurement: Measurement): child=PrivateSource("private"), groupby_keys=KeySet.from_dict({}), mechanism=CountMechanism.DEFAULT, + core_mechanism=NoiseMechanism.GEOMETRIC, ), PureDP(), NoiseInfo( @@ -989,6 +419,7 @@ def _check_measurement(self, measurement: Measurement): groupby_keys=KeySet.from_dict({"B": [0, 1]}), mechanism=CountMechanism.LAPLACE, output_column="count", + core_mechanism=NoiseMechanism.GEOMETRIC, ), PureDP(), NoiseInfo( @@ -1006,6 +437,7 @@ def _check_measurement(self, measurement: Measurement): groupby_keys=KeySet.from_dict({"A": ["zero"]}), mechanism=CountMechanism.GAUSSIAN, output_column="custom_count_column", + core_mechanism=NoiseMechanism.DISCRETE_GAUSSIAN, ), RhoZCDP(), NoiseInfo( @@ -1022,6 +454,7 @@ def _check_measurement(self, measurement: Measurement): child=PrivateSource("private"), groupby_keys=KeySet.from_dict({}), mechanism=CountMechanism.DEFAULT, + core_mechanism=NoiseMechanism.DISCRETE_GAUSSIAN, ), RhoZCDP(), NoiseInfo( @@ -1038,6 +471,7 @@ def _check_measurement(self, measurement: Measurement): child=PrivateSource("private"), groupby_keys=KeySet.from_dict({}), mechanism=CountMechanism.LAPLACE, + core_mechanism=NoiseMechanism.GEOMETRIC, ), RhoZCDP(), NoiseInfo( @@ -1069,6 +503,7 @@ def test_visit_groupby_count( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({}), mechanism=CountDistinctMechanism.DEFAULT, + core_mechanism=NoiseMechanism.GEOMETRIC, ), PureDP(), NoiseInfo( @@ -1086,6 +521,7 @@ def test_visit_groupby_count( groupby_keys=KeySet.from_dict({"B": [0, 1]}), mechanism=CountDistinctMechanism.LAPLACE, output_column="count", + core_mechanism=NoiseMechanism.GEOMETRIC, ), PureDP(), NoiseInfo( @@ -1102,6 +538,7 @@ def test_visit_groupby_count( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({}), columns_to_count=tuple(["A"]), + core_mechanism=NoiseMechanism.GEOMETRIC, ), PureDP(), NoiseInfo( @@ -1119,6 +556,7 @@ def test_visit_groupby_count( groupby_keys=KeySet.from_dict({"A": ["zero"]}), mechanism=CountDistinctMechanism.GAUSSIAN, output_column="custom_count_column", + core_mechanism=NoiseMechanism.DISCRETE_GAUSSIAN, ), RhoZCDP(), NoiseInfo( @@ -1135,6 +573,7 @@ def test_visit_groupby_count( 
child=PrivateSource("private"), groupby_keys=KeySet.from_dict({}), mechanism=CountDistinctMechanism.DEFAULT, + core_mechanism=NoiseMechanism.DISCRETE_GAUSSIAN, ), RhoZCDP(), NoiseInfo( @@ -1151,6 +590,7 @@ def test_visit_groupby_count( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({}), mechanism=CountDistinctMechanism.LAPLACE, + core_mechanism=NoiseMechanism.GEOMETRIC, ), RhoZCDP(), NoiseInfo( @@ -1321,6 +761,7 @@ def test_visit_groupby_quantile( mechanism=AverageMechanism.DEFAULT, output_column="custom_output_column", measure_column="B", + core_mechanism=NoiseMechanism.GEOMETRIC, ), PureDP(), NoiseInfo( @@ -1345,6 +786,7 @@ def test_visit_groupby_quantile( output_column="sum", low=123.345, high=987.65, + core_mechanism=NoiseMechanism.LAPLACE, ), PureDP(), NoiseInfo( @@ -1369,6 +811,7 @@ def test_visit_groupby_quantile( output_column="different_sum", low=123.345, high=987.65, + core_mechanism=NoiseMechanism.LAPLACE, ), PureDP(), NoiseInfo( @@ -1392,6 +835,7 @@ def test_visit_groupby_quantile( measure_column="B", low=0, high=1, + core_mechanism=NoiseMechanism.DISCRETE_GAUSSIAN, ), RhoZCDP(), NoiseInfo( @@ -1431,6 +875,7 @@ def test_visit_groupby_bounded_average( mechanism=SumMechanism.DEFAULT, output_column="custom_output_column", measure_column="B", + core_mechanism=NoiseMechanism.GEOMETRIC, ), PureDP(), NoiseInfo( @@ -1451,6 +896,7 @@ def test_visit_groupby_bounded_average( output_column="sum", low=123.345, high=987.65, + core_mechanism=NoiseMechanism.LAPLACE, ), PureDP(), NoiseInfo( @@ -1471,6 +917,7 @@ def test_visit_groupby_bounded_average( output_column="different_sum", low=123.345, high=987.65, + core_mechanism=NoiseMechanism.LAPLACE, ), PureDP(), NoiseInfo( @@ -1490,6 +937,7 @@ def test_visit_groupby_bounded_average( measure_column="B", low=0, high=1, + core_mechanism=NoiseMechanism.DISCRETE_GAUSSIAN, ), RhoZCDP(), NoiseInfo( @@ -1525,6 +973,7 @@ def test_visit_groupby_bounded_sum( mechanism=VarianceMechanism.DEFAULT, output_column="custom_output_column", measure_column="B", + core_mechanism=NoiseMechanism.GEOMETRIC, ), PureDP(), NoiseInfo( @@ -1553,6 +1002,7 @@ def test_visit_groupby_bounded_sum( output_column="sum", low=123.345, high=987.65, + core_mechanism=NoiseMechanism.LAPLACE, ), PureDP(), NoiseInfo( @@ -1581,6 +1031,7 @@ def test_visit_groupby_bounded_sum( output_column="different_sum", low=123.345, high=987.65, + core_mechanism=NoiseMechanism.LAPLACE, ), PureDP(), NoiseInfo( @@ -1608,6 +1059,7 @@ def test_visit_groupby_bounded_sum( measure_column="B", low=0, high=1, + core_mechanism=NoiseMechanism.DISCRETE_GAUSSIAN, ), RhoZCDP(), NoiseInfo( @@ -1651,6 +1103,7 @@ def test_visit_groupby_bounded_variance( mechanism=StdevMechanism.DEFAULT, output_column="custom_output_column", measure_column="B", + core_mechanism=NoiseMechanism.GEOMETRIC, ), PureDP(), NoiseInfo( @@ -1679,6 +1132,7 @@ def test_visit_groupby_bounded_variance( output_column="sum", low=123.345, high=987.65, + core_mechanism=NoiseMechanism.LAPLACE, ), PureDP(), NoiseInfo( @@ -1707,6 +1161,7 @@ def test_visit_groupby_bounded_variance( output_column="different_sum", low=123.345, high=987.65, + core_mechanism=NoiseMechanism.LAPLACE, ), PureDP(), NoiseInfo( @@ -1734,6 +1189,7 @@ def test_visit_groupby_bounded_variance( measure_column="B", low=0, high=1, + core_mechanism=NoiseMechanism.DISCRETE_GAUSSIAN, ), RhoZCDP(), NoiseInfo( @@ -1822,6 +1278,7 @@ def test_visit_transformations(self, query: QueryExpr): groupby_keys=KeySet.from_dict({}), child=PrivateSource("private"), output_column="count", + 
core_mechanism=NoiseMechanism.GEOMETRIC, ), column="count", threshold=5.5, @@ -1831,6 +1288,7 @@ def test_visit_transformations(self, query: QueryExpr): groupby_keys=KeySet.from_dict({"B": [0, 1]}), child=PrivateSource("private"), output_column="count", + core_mechanism=NoiseMechanism.GEOMETRIC, ), column="count", threshold=-10, @@ -1856,6 +1314,7 @@ def test_visit_suppress_aggregates(self, query: SuppressAggregates) -> None: {"A": ["a0", "a1", "a2", "a3"]} ), output_column="count", + core_mechanism=NoiseMechanism.GEOMETRIC, ), column="count", threshold=0, @@ -1878,6 +1337,7 @@ def test_visit_suppress_aggregates(self, query: SuppressAggregates) -> None: {"A": ["a0", "a1", "a2", "a3"]}, ), output_column="custom_count_name", + core_mechanism=NoiseMechanism.GEOMETRIC, ), column="custom_count_name", threshold=3, diff --git a/test/unit/query_expr_compiler/test_rewrite_rules.py b/test/unit/query_expr_compiler/test_rewrite_rules.py new file mode 100644 index 00000000..5fdef85d --- /dev/null +++ b/test/unit/query_expr_compiler/test_rewrite_rules.py @@ -0,0 +1,393 @@ +"""Tests for rewrite rules.""" +from dataclasses import dataclass, replace +from typing import Any, Union + +import pytest +from tmlt.core.measurements.aggregations import NoiseMechanism +from tmlt.core.measures import ApproxDP, PureDP, RhoZCDP +from tmlt.core.utils.testing import Case, parametrize + +from tmlt.analytics import KeySet +from tmlt.analytics._catalog import Catalog +from tmlt.analytics._query_expr import ( + AverageMechanism, + CountDistinctMechanism, + CountMechanism, + GroupByBoundedAverage, + GroupByBoundedSTDEV, + GroupByBoundedSum, + GroupByBoundedVariance, + GroupByCount, + GroupByCountDistinct, + PrivateSource, + QueryExpr, + QueryExprVisitor, + SingleChildQueryExpr, + StdevMechanism, + SumMechanism, + SuppressAggregates, + VarianceMechanism, +) +from tmlt.analytics._query_expr_compiler._rewrite_rules import ( + CompilationInfo, + select_noise_mechanism, +) +from tmlt.analytics._schema import ColumnDescriptor, ColumnType, Schema + +# SPDX-License-Identifier: Apache-2.0 +# Copyright Tumult Labs 2025 + + +@pytest.fixture(scope="module", name="catalog") +def fixture_catalog(): + """Setup tests.""" + c = Catalog() + c.add_private_table( + "private", + { + "string_col": ColumnDescriptor(ColumnType.VARCHAR), + "int_col": ColumnDescriptor(ColumnType.INTEGER), + "float_col": ColumnDescriptor(ColumnType.DECIMAL), + }, + ) + return c + + +BASE_EXPR = PrivateSource("private") + +AGG_CLASSES = { + "count": (GroupByCount, CountMechanism), + "count_distinct": (GroupByCountDistinct, CountDistinctMechanism), + "average": (GroupByBoundedAverage, AverageMechanism), + "sum": (GroupByBoundedSum, SumMechanism), + "stdev": (GroupByBoundedSTDEV, StdevMechanism), + "variance": (GroupByBoundedVariance, VarianceMechanism), +} + + +@parametrize( + [ + Case()( + query_mechanism=mech, + output_measure=meas, + expected_mechanism="GEOMETRIC", + ) + for mech in ["DEFAULT", "LAPLACE"] + for meas in [PureDP(), ApproxDP()] + ] + + [ + Case()( + query_mechanism=mech, + output_measure=RhoZCDP(), + expected_mechanism="DISCRETE_GAUSSIAN", + ) + for mech in ["DEFAULT", "GAUSSIAN"] + ] + + [ + Case()( + query_mechanism="LAPLACE", + output_measure=RhoZCDP(), + expected_mechanism="GEOMETRIC", + ) + ], +) +@parametrize( + [ + Case()(agg="count"), + Case()(agg="count_distinct"), + ] +) +def test_noise_selection_counts( + catalog: Catalog, + agg: str, + query_mechanism: str, + output_measure: Union[PureDP, ApproxDP, RhoZCDP], + expected_mechanism: str, +) -> 
None:
+    """Test noise selection for GroupByCount and GroupByCountDistinct exprs."""
+    (AggExpr, AggMech) = AGG_CLASSES[agg]
+    expr = AggExpr(
+        child=BASE_EXPR,
+        groupby_keys=KeySet.from_dict({}),
+        mechanism=AggMech[query_mechanism],
+    )
+    info = CompilationInfo(output_measure=output_measure, catalog=catalog)
+    got_expr = select_noise_mechanism(info)(expr)
+    assert got_expr == replace(expr, core_mechanism=NoiseMechanism[expected_mechanism])
+
+
+@parametrize(
+    [
+        Case()(
+            query_mechanism=mech,
+            output_measure=meas,
+            measure_column="int_col",
+            expected_mechanism="GEOMETRIC",
+        )
+        for mech in ["DEFAULT", "LAPLACE"]
+        for meas in [PureDP(), ApproxDP()]
+    ]
+    + [
+        Case()(
+            query_mechanism=mech,
+            output_measure=meas,
+            measure_column="float_col",
+            expected_mechanism="LAPLACE",
+        )
+        for mech in ["DEFAULT", "LAPLACE"]
+        for meas in [PureDP(), ApproxDP()]
+    ]
+    + [
+        Case()(
+            query_mechanism=mech,
+            output_measure=RhoZCDP(),
+            measure_column="int_col",
+            expected_mechanism="DISCRETE_GAUSSIAN",
+        )
+        for mech in ["DEFAULT", "GAUSSIAN"]
+    ]
+    + [
+        Case()(
+            query_mechanism=mech,
+            output_measure=RhoZCDP(),
+            measure_column="float_col",
+            expected_mechanism="GAUSSIAN",
+        )
+        for mech in ["DEFAULT", "GAUSSIAN"]
+    ]
+    + [
+        Case()(
+            query_mechanism="LAPLACE",
+            output_measure=RhoZCDP(),
+            measure_column="int_col",
+            expected_mechanism="GEOMETRIC",
+        )
+    ]
+    + [
+        Case()(
+            query_mechanism="LAPLACE",
+            output_measure=RhoZCDP(),
+            measure_column="float_col",
+            expected_mechanism="LAPLACE",
+        )
+    ],
+)
+@parametrize(
+    [
+        Case()(agg="sum"),
+        Case()(agg="average"),
+        Case()(agg="stdev"),
+        Case()(agg="variance"),
+    ]
+)
+def test_noise_selection_numeric_aggregations(
+    catalog: Catalog,
+    agg: str,
+    query_mechanism: str,
+    measure_column: str,
+    output_measure: Union[PureDP, ApproxDP, RhoZCDP],
+    expected_mechanism: str,
+) -> None:
+    """Test noise selection for sum, average, stdev, and variance query exprs."""
+    (AggExpr, AggMech) = AGG_CLASSES[agg]
+    expr = AggExpr(
+        child=BASE_EXPR,
+        measure_column=measure_column,
+        low=0,
+        high=1,
+        mechanism=AggMech[query_mechanism],
+        groupby_keys=KeySet.from_dict({}),
+    )
+    info = CompilationInfo(output_measure=output_measure, catalog=catalog)
+    got_expr = select_noise_mechanism(info)(expr)
+    assert got_expr == replace(expr, core_mechanism=NoiseMechanism[expected_mechanism])
+
+
+@parametrize(
+    [
+        Case()(
+            query_mechanism=mech, output_measure=meas, expected_mechanism="GEOMETRIC"
+        )
+        for mech in ["DEFAULT", "LAPLACE"]
+        for meas in [PureDP(), ApproxDP()]
+    ]
+    + [
+        Case()(
+            query_mechanism=mech,
+            output_measure=RhoZCDP(),
+            expected_mechanism="DISCRETE_GAUSSIAN",
+        )
+        for mech in ["DEFAULT", "GAUSSIAN"]
+    ]
+    + [
+        Case()(
+            query_mechanism="LAPLACE",
+            output_measure=meas,
+            expected_mechanism="GEOMETRIC",
+        )
+        for meas in [PureDP(), ApproxDP(), RhoZCDP()]
+    ],
+)
+def test_noise_selection_suppress_aggregates(
+    catalog: Catalog,
+    query_mechanism: str,
+    output_measure: Union[PureDP, ApproxDP, RhoZCDP],
+    expected_mechanism: str,
+) -> None:
+    """Test noise selection for SuppressAggregates query expressions."""
+    expr = SuppressAggregates(
+        child=GroupByCount(
+            child=BASE_EXPR,
+            groupby_keys=KeySet.from_dict({}),
+            mechanism=CountMechanism[query_mechanism],
+        ),
+        column="count",
+        threshold=42,
+    )
+    info = CompilationInfo(output_measure=output_measure, catalog=catalog)
+    got_expr = select_noise_mechanism(info)(expr)
+    assert got_expr == replace(
+        expr,
+        child=replace(expr.child, core_mechanism=NoiseMechanism[expected_mechanism]),
+    )
+
+
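All of the cases in this file instantiate one compact selection rule. As a reading aid, here is a minimal sketch of that rule; ``expected_core_mechanism`` is a hypothetical helper written for this summary, not part of the patch, and the real logic lives in ``select_noise_mechanism``:

from tmlt.core.measurements.aggregations import NoiseMechanism
from tmlt.core.measures import ApproxDP, PureDP, RhoZCDP


def expected_core_mechanism(requested: str, output_measure, integer_valued: bool):
    """Hypothetical condensed form of the rule the cases above pin down."""
    if isinstance(output_measure, (PureDP, ApproxDP)):
        if requested == "GAUSSIAN":
            # Matches test_noise_selection_invalid_noise below.
            raise ValueError("Gaussian noise requires a RhoZCDP budget.")
        # DEFAULT and LAPLACE both mean Laplace-style noise; integer-valued
        # results get the geometric (discrete Laplace) variant.
        return NoiseMechanism.GEOMETRIC if integer_valued else NoiseMechanism.LAPLACE
    # Under RhoZCDP, an explicit LAPLACE request is honored, while DEFAULT
    # and GAUSSIAN both resolve to Gaussian-style noise.
    if requested == "LAPLACE":
        return NoiseMechanism.GEOMETRIC if integer_valued else NoiseMechanism.LAPLACE
    return NoiseMechanism.DISCRETE_GAUSSIAN if integer_valued else NoiseMechanism.GAUSSIAN


assert (
    expected_core_mechanism("DEFAULT", RhoZCDP(), integer_valued=True)
    == NoiseMechanism.DISCRETE_GAUSSIAN
)

Counts are always integer-valued, which is why the count and SuppressAggregates cases above only ever expect GEOMETRIC or DISCRETE_GAUSSIAN.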
+@parametrize( + [ + Case()( + expr=GroupByCount( + child=BASE_EXPR, + groupby_keys=KeySet.from_dict({}), + mechanism=CountMechanism.GAUSSIAN, + ) + ), + Case()( + expr=GroupByCountDistinct( + child=BASE_EXPR, + groupby_keys=KeySet.from_dict({}), + mechanism=CountDistinctMechanism.GAUSSIAN, + ) + ), + Case()( + expr=GroupByBoundedAverage( + child=BASE_EXPR, + groupby_keys=KeySet.from_dict({}), + measure_column="int_col", + low=0, + high=1, + mechanism=AverageMechanism.GAUSSIAN, + ) + ), + Case()( + expr=GroupByBoundedSum( + child=BASE_EXPR, + groupby_keys=KeySet.from_dict({}), + measure_column="int_col", + low=0, + high=1, + mechanism=SumMechanism.GAUSSIAN, + ) + ), + Case()( + expr=GroupByBoundedSTDEV( + child=BASE_EXPR, + groupby_keys=KeySet.from_dict({}), + measure_column="int_col", + low=0, + high=1, + mechanism=StdevMechanism.GAUSSIAN, + ) + ), + Case()( + expr=GroupByBoundedVariance( + child=BASE_EXPR, + groupby_keys=KeySet.from_dict({}), + measure_column="int_col", + low=0, + high=1, + mechanism=VarianceMechanism.GAUSSIAN, + ) + ), + Case()( + expr=SuppressAggregates( + child=GroupByCount( + child=PrivateSource("blah"), + groupby_keys=KeySet.from_dict({}), + mechanism=CountMechanism.GAUSSIAN, + ), + column="count", + threshold=42, + ) + ), + ], +) +@parametrize( + [ + Case()(output_measure=PureDP()), + Case()(output_measure=ApproxDP()), + ] +) +def test_noise_selection_invalid_noise( + catalog: Catalog, expr: QueryExpr, output_measure: Union[PureDP, ApproxDP] +) -> None: + info = CompilationInfo(output_measure=output_measure, catalog=catalog) + with pytest.raises( + ValueError, + match=( + "Gaussian noise is only supported when using a RhoZCDP budget. " + "Use Laplace noise instead, or switch to RhoZCDP." + ), + ): + select_noise_mechanism(info)(expr) + + +@dataclass(frozen=True) +class SomeKindOfPostProcessing(SingleChildQueryExpr): + """A fake post-processing QueryExpr.""" + + field: int + """A field, because why not.""" + + def schema(self, catalog: Catalog) -> Schema: + """Just propagate the schema from the child.""" + return self.child.schema(catalog) + + def accept(self, visitor: "QueryExprVisitor") -> Any: + """This should not be called.""" + raise NotImplementedError() + + +def test_recursive_noise_selection(catalog: Catalog) -> None: + """Checks that noise selection works for new post-processing QueryExprs.""" + expr = SomeKindOfPostProcessing( + child=SomeKindOfPostProcessing( + child=GroupByBoundedAverage( + child=BASE_EXPR, + groupby_keys=KeySet.from_dict({}), + measure_column="int_col", + low=0, + high=1, + mechanism=AverageMechanism.DEFAULT, + ), + field=42, + ), + field=17, + ) + expected_expr = SomeKindOfPostProcessing( + child=SomeKindOfPostProcessing( + child=GroupByBoundedAverage( + child=BASE_EXPR, + groupby_keys=KeySet.from_dict({}), + measure_column="int_col", + low=0, + high=1, + mechanism=AverageMechanism.DEFAULT, + core_mechanism=NoiseMechanism.GEOMETRIC, + ), + field=42, + ), + field=17, + ) + info = CompilationInfo(output_measure=ApproxDP(), catalog=catalog) + got_expr = select_noise_mechanism(info)(expr) + assert got_expr == expected_expr From 7cb678c3f55c01f7bfef489acc031960279679b3 Mon Sep 17 00:00:00 2001 From: Ted Date: Sat, 25 Oct 2025 11:05:35 +0200 Subject: [PATCH 16/25] Allow literal strings as mechanism arguments to aggregations (#100) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add literals, remove cols * STDEV → Stdev * change some tests * convert literals to enums, add tests * Revert "STDEV → 
Stdev" This reverts commit a9500e5ce36445e659af4dee7ede8403957e752a. * add changelog * make case insensitivity explicit * lint --------- Co-authored-by: Damien Desfontaines --- CHANGELOG.rst | 4 + src/tmlt/analytics/query_builder.py | 163 +++++++++++++------- test/system/session/rows/conftest.py | 5 +- test/unit/test_query_builder.py | 213 +++++++++++++++++++++++---- 4 files changed, 298 insertions(+), 87 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8be73d2b..e5c59355 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,10 @@ Changed ~~~~~~~ - Dropped support for Python 3.9, as it has reached end-of-life. - Dropped support for pyspark <3.5.0 on Macs after discovering that these configurations frequently crash. Older versions of the library may also be affected. +- Aggregation mechanisms can now be specified as strings instead of enums, e.g. ``"laplace"`` instead of ``CountMechanism.LAPLACE`` or ``SumMechanism.LAPLACE``. +- Removed previously deprecated argument ``max_num_rows`` to ``flat_map``. Use ``max_rows`` instead. +- Removed previously deprecated argument ``cols`` to ``count_distinct``. Use ``columns`` instead. + .. _v0.20.2: diff --git a/src/tmlt/analytics/query_builder.py b/src/tmlt/analytics/query_builder.py index b3c1976a..58fc3806 100644 --- a/src/tmlt/analytics/query_builder.py +++ b/src/tmlt/analytics/query_builder.py @@ -6,13 +6,13 @@ from __future__ import annotations import datetime -import warnings from dataclasses import dataclass from typing import ( Any, Callable, Dict, List, + Literal, Mapping, Optional, Protocol, @@ -2005,7 +2005,9 @@ def groupby(self, by: Union[KeySet, List[str], str]) -> "GroupedQueryBuilder": def count( self, name: Optional[str] = None, - mechanism: CountMechanism = CountMechanism.DEFAULT, + mechanism: Union[ + CountMechanism, Literal["default", "laplace", "gaussian"] + ] = "default", ) -> Query: """Returns a count query ready to be evaluated. @@ -2069,8 +2071,8 @@ def count( Args: name: Name for the resulting aggregation column. Defaults to "count". - mechanism: Choice of noise mechanism. By default, the framework - automatically selects an appropriate mechanism. + mechanism: Choice of noise mechanism (case-insensitive). By default, the + framework automatically selects an appropriate mechanism. """ return self.groupby(KeySet.from_dict({})).count(name=name, mechanism=mechanism) @@ -2078,8 +2080,9 @@ def count_distinct( self, columns: Optional[List[str]] = None, name: Optional[str] = None, - mechanism: CountDistinctMechanism = CountDistinctMechanism.DEFAULT, - cols: Optional[List[str]] = None, + mechanism: Union[ + CountDistinctMechanism, Literal["default", "laplace", "gaussian"] + ] = "default", ) -> Query: """Returns a count_distinct query ready to be evaluated. @@ -2151,12 +2154,11 @@ def count_distinct( name: Name for the resulting aggregation column. Defaults to "count_distinct" if no columns are provided, or "count_distinct(A, B, C)" if the provided columns are A, B, and C. - mechanism: Choice of noise mechanism. By default, the framework - automatically selects an appropriate mechanism. - cols: Deprecated; use ``columns`` instead. + mechanism: Choice of noise mechanism (case-insensitive). By default, the + framework automatically selects an appropriate mechanism. 
""" return self.groupby(KeySet.from_dict({})).count_distinct( - columns=columns, name=name, mechanism=mechanism, cols=cols + columns=columns, name=name, mechanism=mechanism ) def quantile( @@ -2445,7 +2447,9 @@ def sum( low: float, high: float, name: Optional[str] = None, - mechanism: SumMechanism = SumMechanism.DEFAULT, + mechanism: Union[ + SumMechanism, Literal["default", "laplace", "gaussian"] + ] = "default", ) -> Query: """Returns a sum query ready to be evaluated. @@ -2518,8 +2522,8 @@ def sum( is less than ``high``. name: The name to give the resulting aggregation column. Defaults to ``f"{column}_sum"``. - mechanism: Choice of noise mechanism. By default, the framework - automatically selects an appropriate mechanism. + mechanism: Choice of noise mechanism (case-insensitive). By default, the + framework automatically selects an appropriate mechanism. """ return self.groupby(KeySet.from_dict({})).sum( column=column, low=low, high=high, name=name, mechanism=mechanism @@ -2531,7 +2535,9 @@ def average( low: float, high: float, name: Optional[str] = None, - mechanism: AverageMechanism = AverageMechanism.DEFAULT, + mechanism: Union[ + AverageMechanism, Literal["default", "laplace", "gaussian"] + ] = "default", ) -> Query: """Returns an average query ready to be evaluated. @@ -2604,8 +2610,8 @@ def average( is less than ``high``. name: The name to give the resulting aggregation column. Defaults to ``f"{column}_average"``. - mechanism: Choice of noise mechanism. By default, the framework - automatically selects an appropriate mechanism. + mechanism: Choice of noise mechanism (case-insensitive). By default, the + framework automatically selects an appropriate mechanism. """ return self.groupby(KeySet.from_dict({})).average( column=column, low=low, high=high, name=name, mechanism=mechanism @@ -2617,7 +2623,9 @@ def variance( low: float, high: float, name: Optional[str] = None, - mechanism: VarianceMechanism = VarianceMechanism.DEFAULT, + mechanism: Union[ + VarianceMechanism, Literal["default", "laplace", "gaussian"] + ] = "default", ) -> Query: """Returns a variance query ready to be evaluated. @@ -2690,8 +2698,8 @@ def variance( is less than ``high``. name: The name to give the resulting aggregation column. Defaults to ``f"{column}_variance"``. - mechanism: Choice of noise mechanism. By default, the framework - automatically selects an appropriate mechanism. + mechanism: Choice of noise mechanism (case-insensitive). By default, the + framework automatically selects an appropriate mechanism. """ return self.groupby(KeySet.from_dict({})).variance( column=column, low=low, high=high, name=name, mechanism=mechanism @@ -2703,7 +2711,9 @@ def stdev( low: float, high: float, name: Optional[str] = None, - mechanism: StdevMechanism = StdevMechanism.DEFAULT, + mechanism: Union[ + StdevMechanism, Literal["default", "laplace", "gaussian"] + ] = "default", ) -> Query: """Returns a standard deviation query ready to be evaluated. @@ -2776,8 +2786,8 @@ def stdev( is less than ``high``. name: The name to give the resulting aggregation column. Defaults to ``f"{column}_stdev"``. - mechanism: Choice of noise mechanism. By default, the framework - automatically selects an appropriate mechanism. + mechanism: Choice of noise mechanism (case-insensitive). By default, the + framework automatically selects an appropriate mechanism. 
""" return self.groupby(KeySet.from_dict({})).stdev( column=column, low=low, high=high, name=name, mechanism=mechanism @@ -2808,7 +2818,9 @@ def __init__(self, source_id, query_expr, groupby_keys) -> None: def count( self, name: Optional[str] = None, - mechanism: CountMechanism = CountMechanism.DEFAULT, + mechanism: Union[ + CountMechanism, Literal["default", "laplace", "gaussian"] + ] = "default", ) -> GroupbyCountQuery: """Returns a GroupedCountQuery with a count query. @@ -2859,11 +2871,19 @@ def count( Args: name: Name for the resulting aggregation column. Defaults to "count". - mechanism: Choice of noise mechanism. By default, the framework - automatically selects an appropriate mechanism. + mechanism: Choice of noise mechanism (case-insensitive). By default, the + framework automatically selects an appropriate mechanism. """ if name is None: name = "count" + if isinstance(mechanism, str): + try: + mechanism = CountMechanism[mechanism.upper()] + except KeyError as e: + raise ValueError( + f'Unknown mechanism "{mechanism}". Available options are ' + '"laplace", "gaussian", or "default".' + ) from e query_expr = GroupByCount( child=self._query_expr, groupby_keys=self._groupby_keys, @@ -2876,8 +2896,9 @@ def count_distinct( self, columns: Optional[List[str]] = None, name: Optional[str] = None, - mechanism: CountDistinctMechanism = CountDistinctMechanism.DEFAULT, - cols: Optional[List[str]] = None, + mechanism: Union[ + CountDistinctMechanism, Literal["default", "laplace", "gaussian"] + ] = "default", ) -> Query: """Returns a Query with a count_distinct query. @@ -2934,21 +2955,9 @@ def count_distinct( name: Name for the resulting aggregation column. Defaults to "count_distinct" if no columns are provided, or "count_distinct(A, B, C)" if the provided columns are A, B, and C. - mechanism: Choice of noise mechanism. By default, the framework - automatically selects an appropriate mechanism. - cols: Deprecated; use ``columns`` instead. + mechanism: Choice of noise mechanism (case-insensitive). By default, the + framework automatically selects an appropriate mechanism. """ - if cols is not None: - warnings.warn( - "The `cols` argument is deprecated; use `columns` instead", - DeprecationWarning, - ) - if columns is not None: - raise ValueError( - "cannot provide both `cols` and `columns` arguments to" - " count_distinct" - ) - columns = cols columns_to_count: Optional[List[str]] = None if columns is not None and len(columns) > 0: columns_to_count = list(columns) @@ -2957,6 +2966,14 @@ def count_distinct( name = f"count_distinct({', '.join(columns_to_count)})" else: name = "count_distinct" + if isinstance(mechanism, str): + try: + mechanism = CountDistinctMechanism[mechanism.upper()] + except KeyError as e: + raise ValueError( + f'Unknown mechanism "{mechanism}". Available options are ' + '"laplace", "gaussian", or "default".' + ) from e query_expr = GroupByCountDistinct( child=self._query_expr, columns_to_count=tuple(columns_to_count) if columns_to_count else None, @@ -3279,7 +3296,9 @@ def sum( low: float, high: float, name: Optional[str] = None, - mechanism: SumMechanism = SumMechanism.DEFAULT, + mechanism: Union[ + SumMechanism, Literal["default", "laplace", "gaussian"] + ] = "default", ) -> Query: """Returns a Query with a sum query. @@ -3354,11 +3373,19 @@ def sum( is less than ``high``. name: The name to give the resulting aggregation column. Defaults to ``f"{column}_sum"``. - mechanism: Choice of noise mechanism. 
By default, the framework - automatically selects an appropriate mechanism. + mechanism: Choice of noise mechanism (case-insensitive). By default, the + framework automatically selects an appropriate mechanism. """ if name is None: name = f"{column}_sum" + if isinstance(mechanism, str): + try: + mechanism = SumMechanism[mechanism.upper()] + except KeyError as e: + raise ValueError( + f'Unknown mechanism "{mechanism}". Available options are ' + '"laplace", "gaussian", or "default".' + ) from e query_expr = GroupByBoundedSum( child=self._query_expr, groupby_keys=self._groupby_keys, @@ -3376,7 +3403,9 @@ def average( low: float, high: float, name: Optional[str] = None, - mechanism: AverageMechanism = AverageMechanism.DEFAULT, + mechanism: Union[ + AverageMechanism, Literal["default", "laplace", "gaussian"] + ] = "default", ) -> Query: """Returns a Query with an average query. @@ -3451,11 +3480,19 @@ def average( is less than ``high``. name: The name to give the resulting aggregation column. Defaults to ``f"{column}_average"``. - mechanism: Choice of noise mechanism. By default, the framework - automatically selects an appropriate mechanism. + mechanism: Choice of noise mechanism (case-insensitive). By default, the + framework automatically selects an appropriate mechanism. """ if name is None: name = f"{column}_average" + if isinstance(mechanism, str): + try: + mechanism = AverageMechanism[mechanism.upper()] + except KeyError as e: + raise ValueError( + f'Unknown mechanism "{mechanism}". Available options are ' + '"laplace", "gaussian", or "default".' + ) from e query_expr = GroupByBoundedAverage( child=self._query_expr, groupby_keys=self._groupby_keys, @@ -3473,7 +3510,9 @@ def variance( low: float, high: float, name: Optional[str] = None, - mechanism: VarianceMechanism = VarianceMechanism.DEFAULT, + mechanism: Union[ + VarianceMechanism, Literal["default", "laplace", "gaussian"] + ] = "default", ) -> Query: """Returns a Query with a variance query. @@ -3549,11 +3588,19 @@ def variance( is less than ``high``. name: The name to give the resulting aggregation column. Defaults to ``f"{column}_variance"``. - mechanism: Choice of noise mechanism. By default, the framework - automatically selects an appropriate mechanism. + mechanism: Choice of noise mechanism (case-insensitive). By default, the + framework automatically selects an appropriate mechanism. """ if name is None: name = f"{column}_variance" + if isinstance(mechanism, str): + try: + mechanism = VarianceMechanism[mechanism.upper()] + except KeyError as e: + raise ValueError( + f'Unknown mechanism "{mechanism}". Available options are ' + '"laplace", "gaussian", or "default".' + ) from e query_expr = GroupByBoundedVariance( child=self._query_expr, groupby_keys=self._groupby_keys, @@ -3571,7 +3618,9 @@ def stdev( low: float, high: float, name: Optional[str] = None, - mechanism: StdevMechanism = StdevMechanism.DEFAULT, + mechanism: Union[ + StdevMechanism, Literal["default", "laplace", "gaussian"] + ] = "default", ) -> Query: """Returns a Query with a standard deviation query. @@ -3646,11 +3695,19 @@ def stdev( is less than ``high``. name: The name to give the resulting aggregation column. Defaults to ``f"{column}_stdev"``. - mechanism: Choice of noise mechanism. By default, the framework - automatically selects an appropriate mechanism. + mechanism: Choice of noise mechanism (case-insensitive). By default, the + framework automatically selects an appropriate mechanism. 
""" if name is None: name = f"{column}_stdev" + if isinstance(mechanism, str): + try: + mechanism = StdevMechanism[mechanism.upper()] + except KeyError as e: + raise ValueError( + f'Unknown mechanism "{mechanism}". Available options are ' + '"laplace", "gaussian", or "default".' + ) from e query_expr = GroupByBoundedSTDEV( child=self._query_expr, groupby_keys=self._groupby_keys, diff --git a/test/system/session/rows/conftest.py b/test/system/session/rows/conftest.py index 92ffa308..265f6e7c 100644 --- a/test/system/session/rows/conftest.py +++ b/test/system/session/rows/conftest.py @@ -82,7 +82,7 @@ pd.DataFrame({"total": [4]}), ), ( # Total with LAPLACE (Geometric noise gets applied) - QueryBuilder("private").count(name="total", mechanism=CountMechanism.LAPLACE), + QueryBuilder("private").count(name="total", mechanism="laplace"), GroupByCount( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({}), @@ -93,7 +93,8 @@ ), ( # Total with LAPLACE (Geometric noise gets applied) QueryBuilder("private").count_distinct( - name="total", mechanism=CountDistinctMechanism.LAPLACE + name="total", + mechanism="laplace", ), GroupByCountDistinct( child=PrivateSource("private"), diff --git a/test/unit/test_query_builder.py b/test/unit/test_query_builder.py index 5a416946..33df166e 100644 --- a/test/unit/test_query_builder.py +++ b/test/unit/test_query_builder.py @@ -26,6 +26,9 @@ TruncationStrategy, ) from tmlt.analytics._query_expr import ( + AverageMechanism, + CountDistinctMechanism, + CountMechanism, DropInfinity, DropNullAndNan, Filter, @@ -46,7 +49,10 @@ ReplaceInfinity, ReplaceNullAndNan, Select, + StdevMechanism, + SumMechanism, SuppressAggregates, + VarianceMechanism, ) from tmlt.analytics._schema import FrozenDict, Schema @@ -60,8 +66,7 @@ ###DEFINE ROOT BUILDER### def root_builder(): """Set up QueryBuilder.""" - root_built = QueryBuilder(PRIVATE_ID) - return root_built + return QueryBuilder(PRIVATE_ID) @pytest.mark.parametrize("join_columns", [(None), (["B"])]) @@ -899,36 +904,6 @@ def test_count_distinct_ungrouped( expected_name, ) - @pytest.mark.parametrize("columns", [(["A"]), (["col1", "col2"])]) - def test_count_distinct_raises_warnings(self, columns: List[str]): - """Test that count_distinct raises warning when ``cols`` is provided.""" - with pytest.warns( - DeprecationWarning, match=re.escape("`cols` argument is deprecated") - ): - root_builder().count_distinct(cols=columns) - - keys = KeySet.from_dict({e: ["a"] for e in columns}) - with pytest.warns( - DeprecationWarning, match=re.escape("`cols` argument is deprecated") - ): - root_builder().groupby(keys).count_distinct(cols=columns) - - @pytest.mark.parametrize("columns", [(["A"]), (["col1", "col2"])]) - def test_count_distinct_raises_error(self, columns: List[str]): - """Test that count_distinct raises error with both ``cols`` and ``columns``.""" - with pytest.raises( - ValueError, - match=re.escape("cannot provide both `cols` and `columns` arguments"), - ): - root_builder().count_distinct(columns=columns, cols=columns) - - keys = KeySet.from_dict({e: ["a"] for e in columns}) - with pytest.raises( - ValueError, - match=re.escape("cannot provide both `cols` and `columns` arguments"), - ): - root_builder().groupby(keys).count_distinct(columns=columns, cols=columns) - @pytest.mark.parametrize( "keys_df,name,expected_name,columns", ( @@ -1405,3 +1380,177 @@ def test_query_fast_equality_check(query1: Query, query2: Query, equal: bool): # pylint: disable=protected-access assert query1._is_equivalent(query2) == equal # pylint: 
enable=protected-access + + +def root_grouped_builder(): + """Set up GroupedQueryBuilder.""" + return QueryBuilder(PRIVATE_ID).groupby(KeySet.from_dict({"A": ["0", "1"]})) + + +@pytest.mark.parametrize( + "query1,query2", + [ + ( + root_grouped_builder().count(), + root_grouped_builder().count(mechanism="default"), + ), + ( + root_grouped_builder().count(), + root_grouped_builder().count(mechanism=CountMechanism.DEFAULT), + ), + ( + root_grouped_builder().count(mechanism="laplace"), + root_grouped_builder().count(mechanism=CountMechanism.LAPLACE), + ), + ( + root_grouped_builder().count(mechanism="gaussian"), + root_grouped_builder().count(mechanism=CountMechanism.GAUSSIAN), + ), + ( + root_grouped_builder().count_distinct(), + root_grouped_builder().count_distinct(mechanism="default"), + ), + ( + root_grouped_builder().count_distinct(), + root_grouped_builder().count_distinct( + mechanism=CountDistinctMechanism.DEFAULT + ), + ), + ( + root_grouped_builder().count_distinct(mechanism="laplace"), + root_grouped_builder().count_distinct( + mechanism=CountDistinctMechanism.LAPLACE + ), + ), + ( + root_grouped_builder().count_distinct(mechanism="gaussian"), + root_grouped_builder().count_distinct( + mechanism=CountDistinctMechanism.GAUSSIAN + ), + ), + ( + root_grouped_builder().sum(column="B", low=0, high=1), + root_grouped_builder().sum(column="B", low=0, high=1, mechanism="default"), + ), + ( + root_grouped_builder().sum(column="B", low=0, high=1), + root_grouped_builder().sum( + column="B", low=0, high=1, mechanism=SumMechanism.DEFAULT + ), + ), + ( + root_grouped_builder().sum(column="B", low=0, high=1, mechanism="laplace"), + root_grouped_builder().sum( + column="B", low=0, high=1, mechanism=SumMechanism.LAPLACE + ), + ), + ( + root_grouped_builder().sum(column="B", low=0, high=1, mechanism="gaussian"), + root_grouped_builder().sum( + column="B", low=0, high=1, mechanism=SumMechanism.GAUSSIAN + ), + ), + ( + root_grouped_builder().average(column="B", low=0, high=1), + root_grouped_builder().average( + column="B", low=0, high=1, mechanism="default" + ), + ), + ( + root_grouped_builder().average(column="B", low=0, high=1), + root_grouped_builder().average( + column="B", low=0, high=1, mechanism=AverageMechanism.DEFAULT + ), + ), + ( + root_grouped_builder().average( + column="B", low=0, high=1, mechanism="laplace" + ), + root_grouped_builder().average( + column="B", low=0, high=1, mechanism=AverageMechanism.LAPLACE + ), + ), + ( + root_grouped_builder().average( + column="B", low=0, high=1, mechanism="gaussian" + ), + root_grouped_builder().average( + column="B", low=0, high=1, mechanism=AverageMechanism.GAUSSIAN + ), + ), + ( + root_grouped_builder().stdev(column="B", low=0, high=1), + root_grouped_builder().stdev( + column="B", low=0, high=1, mechanism="default" + ), + ), + ( + root_grouped_builder().stdev(column="B", low=0, high=1), + root_grouped_builder().stdev( + column="B", low=0, high=1, mechanism=StdevMechanism.DEFAULT + ), + ), + ( + root_grouped_builder().stdev( + column="B", low=0, high=1, mechanism="laplace" + ), + root_grouped_builder().stdev( + column="B", low=0, high=1, mechanism=StdevMechanism.LAPLACE + ), + ), + ( + root_grouped_builder().stdev( + column="B", low=0, high=1, mechanism="gaussian" + ), + root_grouped_builder().stdev( + column="B", low=0, high=1, mechanism=StdevMechanism.GAUSSIAN + ), + ), + ( + root_grouped_builder().variance(column="B", low=0, high=1), + root_grouped_builder().variance( + column="B", low=0, high=1, mechanism="default" + ), + ), + ( + 
root_grouped_builder().variance(column="B", low=0, high=1), + root_grouped_builder().variance( + column="B", low=0, high=1, mechanism=VarianceMechanism.DEFAULT + ), + ), + ( + root_grouped_builder().variance( + column="B", low=0, high=1, mechanism="laplace" + ), + root_grouped_builder().variance( + column="B", low=0, high=1, mechanism=VarianceMechanism.LAPLACE + ), + ), + ( + root_grouped_builder().variance( + column="B", low=0, high=1, mechanism="gaussian" + ), + root_grouped_builder().variance( + column="B", low=0, high=1, mechanism=VarianceMechanism.GAUSSIAN + ), + ), + ], +) +def test_string_or_enum_mechanisms_are_equal(query1: Query, query2: Query): + assert query1 == query2 + + +def test_unknown_mechanisms(): + match = re.escape('Unknown mechanism "blah". Available options are') + with pytest.raises(ValueError, match=match): + root_grouped_builder().count(mechanism="blah") + with pytest.raises(ValueError, match=match): + root_grouped_builder().count_distinct(mechanism="blah") + with pytest.raises(ValueError, match=match): + root_grouped_builder().sum(column="B", low=0, high=1, mechanism="blah") + with pytest.raises(ValueError, match=match): + root_grouped_builder().average(column="B", low=0, high=1, mechanism="blah") + with pytest.raises(ValueError, match=match): + root_grouped_builder().stdev(column="B", low=0, high=1, mechanism="blah") + with pytest.raises(ValueError, match=match): + root_grouped_builder().variance(column="B", low=0, high=1, mechanism="blah") From 7c31a3f70aaff28b5e04151bcfae7e5ae7409729 Mon Sep 17 00:00:00 2001 From: Ted Date: Sun, 26 Oct 2025 09:05:04 +0100 Subject: [PATCH 17/25] =?UTF-8?q?GroupBySTDEV=20=E2=86=92=20GroupByStdev?= =?UTF-8?q?=20(#103)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Done using `git grep -rl GroupByBoundedSTDEV . 
| xargs sed -i 's/GroupByBoundedSTDEV/GroupByBoundedStdev/g'` Co-authored-by: Damien Desfontaines --- src/tmlt/analytics/_query_expr.py | 14 +++++++------- .../_base_measurement_visitor.py | 12 ++++++------ .../_base_transformation_visitor.py | 6 +++--- .../analytics/_query_expr_compiler/_compiler.py | 2 +- .../_query_expr_compiler/_rewrite_rules.py | 6 +++--- src/tmlt/analytics/query_builder.py | 4 ++-- test/system/session/rows/test_add_max_rows.py | 4 ++-- .../test_measurement_visitor.py | 14 +++++++------- .../unit/query_expr_compiler/test_rewrite_rules.py | 6 +++--- .../transformation_visitor/test_add_rows.py | 4 ++-- test/unit/test_query_builder.py | 8 ++++---- test/unit/test_query_expr_compiler.py | 12 ++++++------ test/unit/test_query_expression.py | 8 ++++---- test/unit/test_query_expression_schema.py | 8 ++++---- test/unit/test_query_expression_visitor.py | 8 ++++---- 15 files changed, 58 insertions(+), 58 deletions(-) diff --git a/src/tmlt/analytics/_query_expr.py b/src/tmlt/analytics/_query_expr.py index d5c58323..c816c227 100644 --- a/src/tmlt/analytics/_query_expr.py +++ b/src/tmlt/analytics/_query_expr.py @@ -1484,7 +1484,7 @@ def _validate_groupby( query: Union[ "GetBounds", "GroupByBoundedAverage", - "GroupByBoundedSTDEV", + "GroupByBoundedStdev", "GroupByBoundedSum", "GroupByBoundedVariance", "GroupByCount", @@ -1548,7 +1548,7 @@ def _validate_groupby( GetBounds, GroupByQuantile, GroupByBoundedSum, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedAverage, GroupByBoundedVariance, ), @@ -1580,7 +1580,7 @@ def _schema_for_groupby( query: Union[ "GetBounds", "GroupByBoundedAverage", - "GroupByBoundedSTDEV", + "GroupByBoundedStdev", "GroupByBoundedSum", "GroupByBoundedVariance", "GroupByCount", @@ -1606,7 +1606,7 @@ def _schema_for_groupby( ( GroupByQuantile, GroupByBoundedSum, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedAverage, GroupByBoundedVariance, ), @@ -1984,7 +1984,7 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) -class GroupByBoundedSTDEV(SingleChildQueryExpr): +class GroupByBoundedStdev(SingleChildQueryExpr): """Returns bounded stdev of a column for each combination of groupby domains. 
If the column to be measured contains null, NaN, or positive or negative infinity, @@ -2201,8 +2201,8 @@ def visit_groupby_bounded_variance(self, expr: GroupByBoundedVariance) -> Any: raise NotImplementedError @abstractmethod - def visit_groupby_bounded_stdev(self, expr: GroupByBoundedSTDEV) -> Any: - """Visit a :class:`GroupByBoundedSTDEV`.""" + def visit_groupby_bounded_stdev(self, expr: GroupByBoundedStdev) -> Any: + """Visit a :class:`GroupByBoundedStdev`.""" raise NotImplementedError @abstractmethod diff --git a/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py b/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py index 26427d02..19b7c52c 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py +++ b/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py @@ -80,7 +80,7 @@ GetBounds, GetGroups, GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, @@ -123,7 +123,7 @@ def _get_core_mechanism( query: Union[ GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, @@ -141,7 +141,7 @@ def _get_core_mechanism( def _get_query_bounds( query: Union[ GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByQuantile, @@ -599,7 +599,7 @@ def _validate_approxDP_and_adjust_budget( self, expr: Union[ GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, @@ -679,7 +679,7 @@ def _add_special_value_handling_to_query( self, query: Union[ GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByQuantile, @@ -1522,7 +1522,7 @@ def build_groupby_bounded_stdev( ) def visit_groupby_bounded_stdev( - self, expr: GroupByBoundedSTDEV + self, expr: GroupByBoundedStdev ) -> Tuple[Measurement, NoiseInfo]: """Create a measurement from a GroupByBoundedStdev query expression.""" self._validate_approxDP_and_adjust_budget(expr) diff --git a/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py b/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py index 9da9c97f..7c622688 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py +++ b/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py @@ -132,7 +132,7 @@ GetBounds, GetGroups, GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, @@ -1597,8 +1597,8 @@ def visit_groupby_bounded_variance(self, expr: GroupByBoundedVariance) -> Any: """Visit a GroupByBoundedVariance query expression (raises an error).""" raise NotImplementedError - def visit_groupby_bounded_stdev(self, expr: GroupByBoundedSTDEV) -> Any: - """Visit a GroupByBoundedSTDEV query expression (raises an error).""" + def visit_groupby_bounded_stdev(self, expr: GroupByBoundedStdev) -> Any: + """Visit a GroupByBoundedStdev query expression (raises an error).""" raise NotImplementedError def visit_suppress_aggregates(self, expr: SuppressAggregates) -> Any: diff --git a/src/tmlt/analytics/_query_expr_compiler/_compiler.py b/src/tmlt/analytics/_query_expr_compiler/_compiler.py index fd90d946..6dcaf24e 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_compiler.py +++ b/src/tmlt/analytics/_query_expr_compiler/_compiler.py @@ -69,7 +69,7 @@ class 
QueryExprCompiler: * :class:`~tmlt.analytics._query_expr.GroupByCountDistinct` * :class:`~tmlt.analytics._query_expr.GroupByBoundedSum` * :class:`~tmlt.analytics._query_expr.GroupByBoundedAverage` - * :class:`~tmlt.analytics._query_expr.GroupByBoundedSTDEV` + * :class:`~tmlt.analytics._query_expr.GroupByBoundedStdev` * :class:`~tmlt.analytics._query_expr.GroupByBoundedVariance` * :class:`~tmlt.analytics._query_expr.GroupByQuantile` """ diff --git a/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py b/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py index 6a8e9006..dd099728 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py +++ b/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py @@ -21,7 +21,7 @@ CountDistinctMechanism, CountMechanism, GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, @@ -118,7 +118,7 @@ def select_noise_for_non_count( info: CompilationInfo, expr: Union[ GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, ], @@ -182,7 +182,7 @@ def select_noise(expr: QueryExpr) -> QueryExpr: expr, ( GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, ), diff --git a/src/tmlt/analytics/query_builder.py b/src/tmlt/analytics/query_builder.py index 58fc3806..a427220f 100644 --- a/src/tmlt/analytics/query_builder.py +++ b/src/tmlt/analytics/query_builder.py @@ -39,7 +39,7 @@ GetBounds, GetGroups, GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, @@ -3708,7 +3708,7 @@ def stdev( f'Unknown mechanism "{mechanism}". Available options are ' '"laplace", "gaussian", or "default".' 
) from e - query_expr = GroupByBoundedSTDEV( + query_expr = GroupByBoundedStdev( child=self._query_expr, groupby_keys=self._groupby_keys, measure_column=column, diff --git a/test/system/session/rows/test_add_max_rows.py b/test/system/session/rows/test_add_max_rows.py index cfee3af1..a85b36fc 100644 --- a/test/system/session/rows/test_add_max_rows.py +++ b/test/system/session/rows/test_add_max_rows.py @@ -37,7 +37,7 @@ ) from tmlt.analytics._noise_info import _NoiseMechanism from tmlt.analytics._query_expr import ( - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByCount, PrivateSource, QueryExpr, @@ -132,7 +132,7 @@ def test_queries_privacy_budget_infinity_puredp( QueryBuilder("private") .groupby(KeySet.from_dict({"A": ["0", "1"]})) .stdev(column="B", low=0, high=1, mechanism=StdevMechanism.GAUSSIAN), - GroupByBoundedSTDEV( + GroupByBoundedStdev( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({"A": ["0", "1"]}), measure_column="B", diff --git a/test/unit/query_expr_compiler/test_measurement_visitor.py b/test/unit/query_expr_compiler/test_measurement_visitor.py index 4d23dde5..45956d9d 100644 --- a/test/unit/query_expr_compiler/test_measurement_visitor.py +++ b/test/unit/query_expr_compiler/test_measurement_visitor.py @@ -49,7 +49,7 @@ Filter, FlatMap, GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, @@ -124,7 +124,7 @@ def test_average(lower: float, upper: float) -> None: @pytest.mark.parametrize("lower,upper", [(0, 1), (-123456, 0), (7899000, 9999999)]) def test_stdev(lower: float, upper: float) -> None: """Test _get_query_bounds on STDEV query expr, with lower!=upper.""" - stdev = GroupByBoundedSTDEV( + stdev = GroupByBoundedStdev( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({}), measure_column="", @@ -1095,7 +1095,7 @@ def test_visit_groupby_bounded_variance( "query,output_measure,noise_info", [ ( - GroupByBoundedSTDEV( + GroupByBoundedStdev( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({}), low=-100, @@ -1124,7 +1124,7 @@ def test_visit_groupby_bounded_variance( ), ), ( - GroupByBoundedSTDEV( + GroupByBoundedStdev( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({"B": [0, 1]}), measure_column="X", @@ -1153,7 +1153,7 @@ def test_visit_groupby_bounded_variance( ), ), ( - GroupByBoundedSTDEV( + GroupByBoundedStdev( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({"B": [0, 1]}), measure_column="X", @@ -1182,7 +1182,7 @@ def test_visit_groupby_bounded_variance( ), ), ( - GroupByBoundedSTDEV( + GroupByBoundedStdev( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({"A": ["zero"]}), mechanism=StdevMechanism.DEFAULT, @@ -1213,7 +1213,7 @@ def test_visit_groupby_bounded_variance( ) def test_visit_groupby_bounded_stdev( self, - query: GroupByBoundedSTDEV, + query: GroupByBoundedStdev, output_measure: Union[PureDP, RhoZCDP], noise_info: NoiseInfo, ) -> None: diff --git a/test/unit/query_expr_compiler/test_rewrite_rules.py b/test/unit/query_expr_compiler/test_rewrite_rules.py index 5fdef85d..8e253e77 100644 --- a/test/unit/query_expr_compiler/test_rewrite_rules.py +++ b/test/unit/query_expr_compiler/test_rewrite_rules.py @@ -14,7 +14,7 @@ CountDistinctMechanism, CountMechanism, GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, @@ -60,7 +60,7 @@ def fixture_catalog(): "count_distinct": (GroupByCountDistinct, CountDistinctMechanism), "average": 
(GroupByBoundedAverage, AverageMechanism), "sum": (GroupByBoundedSum, SumMechanism), - "stdev": (GroupByBoundedSTDEV, StdevMechanism), + "stdev": (GroupByBoundedStdev, StdevMechanism), "variance": (GroupByBoundedVariance, VarianceMechanism), } @@ -289,7 +289,7 @@ def test_noise_selection_suppress_aggregates( ) ), Case()( - expr=GroupByBoundedSTDEV( + expr=GroupByBoundedStdev( child=BASE_EXPR, groupby_keys=KeySet.from_dict({}), measure_column="int_col", diff --git a/test/unit/query_expr_compiler/transformation_visitor/test_add_rows.py b/test/unit/query_expr_compiler/transformation_visitor/test_add_rows.py index 3d4fcc92..42cc32d4 100644 --- a/test/unit/query_expr_compiler/transformation_visitor/test_add_rows.py +++ b/test/unit/query_expr_compiler/transformation_visitor/test_add_rows.py @@ -28,7 +28,7 @@ Filter, FlatMap, GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, @@ -821,7 +821,7 @@ def test_measurement_visits(self): with pytest.raises(NotImplementedError): self.visitor.visit_groupby_bounded_stdev( - GroupByBoundedSTDEV( + GroupByBoundedStdev( child=PrivateSource(source_id="rows1"), groupby_keys=KeySet.from_dict({}), measure_column="A", diff --git a/test/unit/test_query_builder.py b/test/unit/test_query_builder.py index 33df166e..9626cca7 100644 --- a/test/unit/test_query_builder.py +++ b/test/unit/test_query_builder.py @@ -34,7 +34,7 @@ Filter, FlatMap, GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, @@ -940,7 +940,7 @@ def assert_common_query_fields_correct( GroupByQuantile, GroupByBoundedAverage, GroupByBoundedVariance, - GroupByBoundedSTDEV, + GroupByBoundedStdev, ], expected_groupby_keys: KeySet, expected_measure_column: str, @@ -1257,7 +1257,7 @@ def test_stdev_ungrouped(self, spark, name: Optional[str], expected_name: str): query = root_builder().stdev(column="B", low=0.0, high=1.0, name=name) assert isinstance(query, Query) query_expr = query._query_expr - assert isinstance(query_expr, GroupByBoundedSTDEV) + assert isinstance(query_expr, GroupByBoundedStdev) self.assert_common_query_fields_correct( query_expr, keys, "B", 0.0, 1.0, expected_name ) @@ -1280,7 +1280,7 @@ def test_stdev_keyset( ) assert isinstance(query, Query) query_expr = query._query_expr - assert isinstance(query_expr, GroupByBoundedSTDEV) + assert isinstance(query_expr, GroupByBoundedStdev) self.assert_common_query_fields_correct( query_expr, keys, "B", 0.0, 1.0, expected_name ) diff --git a/test/unit/test_query_expr_compiler.py b/test/unit/test_query_expr_compiler.py index 9d2d5d8b..83964701 100644 --- a/test/unit/test_query_expr_compiler.py +++ b/test/unit/test_query_expr_compiler.py @@ -41,7 +41,7 @@ Filter, FlatMap, GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, @@ -133,7 +133,7 @@ pd.DataFrame({"A": ["0", "1"], "average": [0.666667, 1.0]}), ), ( # BoundedSTDEV - GroupByBoundedSTDEV( + GroupByBoundedStdev( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({"A": ["0", "1"]}), measure_column="X", @@ -739,7 +739,7 @@ def test_queries(self, query_expr: QueryExpr, expected: pd.DataFrame): pd.DataFrame({"A": ["0", "1"], "average": [0.33333, 0.0]}), ), ( # BoundedSTDEV on floating-point valued measure column with LAPLACE - GroupByBoundedSTDEV( + GroupByBoundedStdev( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({"A": ["0", "1"]}), measure_column="X", @@ 
-751,7 +751,7 @@ def test_queries(self, query_expr: QueryExpr, expected: pd.DataFrame): pd.DataFrame({"A": ["0", "1"], "stdev": [0.5, np.NaN]}), ), ( # BoundedSTDEV on integer valued measure column with LAPLACE - GroupByBoundedSTDEV( + GroupByBoundedStdev( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({"A": ["0", "1"]}), measure_column="B", @@ -763,7 +763,7 @@ def test_queries(self, query_expr: QueryExpr, expected: pd.DataFrame): pd.DataFrame({"A": ["0", "1"], "stdev": [0.5, np.NaN]}), ), ( # BoundedSTDEV on integer valued measure column with GAUSSIAN - GroupByBoundedSTDEV( + GroupByBoundedStdev( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({"A": ["0", "1"]}), measure_column="B", @@ -899,7 +899,7 @@ def test_queries(self, query_expr: QueryExpr, expected: pd.DataFrame): ), ( # BoundedSTDEV on floating-point valued measure column with GAUSSIAN [ - GroupByBoundedSTDEV( + GroupByBoundedStdev( child=PrivateSource("private"), groupby_keys=KeySet.from_dict({"A": ["0", "1"]}), measure_column="X", diff --git a/test/unit/test_query_expression.py b/test/unit/test_query_expression.py index 09fcca4b..5c68ab39 100644 --- a/test/unit/test_query_expression.py +++ b/test/unit/test_query_expression.py @@ -28,7 +28,7 @@ FlatMap, FlatMapByID, GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, @@ -350,7 +350,7 @@ def test_invalid_groupbyagg( GroupByBoundedSum, GroupByBoundedAverage, GroupByBoundedVariance, - GroupByBoundedSTDEV, + GroupByBoundedStdev, ]: with pytest.raises((TypeCheckError, ValueError), match=expected_error_msg): DataClass(PrivateSource("private"), keys, measure_column, low, high) @@ -440,7 +440,7 @@ def test_clamping_bounds_casting(low: float, high: float): GroupByBoundedSum, GroupByBoundedAverage, GroupByBoundedVariance, - GroupByBoundedSTDEV, + GroupByBoundedStdev, ]: query = DataClass( PrivateSource("private"), @@ -455,7 +455,7 @@ def test_clamping_bounds_casting(low: float, high: float): GroupByBoundedSum, GroupByBoundedAverage, GroupByBoundedVariance, - GroupByBoundedSTDEV, + GroupByBoundedStdev, ), ) assert type(query.low) == type(query.high) diff --git a/test/unit/test_query_expression_schema.py b/test/unit/test_query_expression_schema.py index 2d13f3f1..e7f033c1 100644 --- a/test/unit/test_query_expression_schema.py +++ b/test/unit/test_query_expression_schema.py @@ -20,7 +20,7 @@ FlatMap, GetBounds, GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, @@ -476,7 +476,7 @@ def test_invalid_group_by_aggregations( """Test invalid measurement QueryExpr.""" for DataClass in [ GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, ]: @@ -649,7 +649,7 @@ def test_invalid_group_by_aggregations_null( """Test invalid measurement QueryExpr.""" for DataClass in [ GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, ]: @@ -1565,7 +1565,7 @@ def test_schema_drop_infinity( ), ), ( - GroupByBoundedSTDEV( + GroupByBoundedStdev( child=PrivateSource("private"), groupby_keys=KeySet.from_dict( {"A": ["a1", "a2"], "D": [datetime.date(1999, 12, 31)]} diff --git a/test/unit/test_query_expression_visitor.py b/test/unit/test_query_expression_visitor.py index 466dc4e3..aedbfb93 100644 --- a/test/unit/test_query_expression_visitor.py +++ b/test/unit/test_query_expression_visitor.py @@ -16,7 +16,7 @@ GetBounds, GetGroups, 
GroupByBoundedAverage, - GroupByBoundedSTDEV, + GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, @@ -107,7 +107,7 @@ def visit_groupby_bounded_variance(self, expr): return "GroupByBoundedVariance" def visit_groupby_bounded_stdev(self, expr): - return "GroupByBoundedSTDEV" + return "GroupByBoundedStdev" def visit_suppress_aggregates(self, expr): return "SuppressAggregates" @@ -183,8 +183,8 @@ def visit_suppress_aggregates(self, expr): "GroupByBoundedVariance", ), ( - GroupByBoundedSTDEV(PrivateSource("P"), KeySet.from_dict({}), "A", 0, 1), - "GroupByBoundedSTDEV", + GroupByBoundedStdev(PrivateSource("P"), KeySet.from_dict({}), "A", 0, 1), + "GroupByBoundedStdev", ), ( SuppressAggregates( From f409f93b2ef249f99f3bd4f3516f8aada1c82848 Mon Sep 17 00:00:00 2001 From: Ted Date: Thu, 30 Oct 2025 12:20:19 +0100 Subject: [PATCH 18/25] better names for JoinPrivate children (#104) Co-authored-by: Damien Desfontaines --- src/tmlt/analytics/_query_expr.py | 12 ++++---- .../_base_transformation_visitor.py | 4 +-- .../_query_expr_compiler/_rewrite_rules.py | 6 ++-- .../test_measurement_visitor.py | 4 +-- .../transformation_visitor/test_add_rows.py | 20 ++++++------- test/unit/test_query_builder.py | 12 ++++---- test/unit/test_query_expr_compiler.py | 28 +++++++++---------- test/unit/test_query_expression_schema.py | 8 +++--- 8 files changed, 47 insertions(+), 47 deletions(-) diff --git a/src/tmlt/analytics/_query_expr.py b/src/tmlt/analytics/_query_expr.py index c816c227..05147d7c 100644 --- a/src/tmlt/analytics/_query_expr.py +++ b/src/tmlt/analytics/_query_expr.py @@ -872,9 +872,9 @@ class JoinPrivate(QueryExpr): :meth:`~tmlt.analytics.QueryBuilder.join_private`. """ - child: QueryExpr + left_child: QueryExpr """The QueryExpr to join with right operand.""" - right_operand_expr: QueryExpr + right_child: QueryExpr """The QueryExpr for private source to join with.""" truncation_strategy_left: Optional[TruncationStrategy.Type] = None """Truncation strategy to be used for the left table.""" @@ -885,8 +885,8 @@ class JoinPrivate(QueryExpr): def __post_init__(self): """Checks arguments to constructor.""" - check_type(self.child, QueryExpr) - check_type(self.right_operand_expr, QueryExpr) + check_type(self.left_child, QueryExpr) + check_type(self.right_child, QueryExpr) check_type( self.truncation_strategy_left, Optional[TruncationStrategy.Type], @@ -933,8 +933,8 @@ def schema(self, catalog: Catalog) -> Schema: 4. Columns that are in both tables, but not included in the join columns. These columns are included with _left and _right suffixes. 
""" - left_schema = self.child.schema(catalog) - right_schema = self.right_operand_expr.schema(catalog) + left_schema = self.left_child.schema(catalog) + right_schema = self.right_child.schema(catalog) self._validate(left_schema, right_schema) return _schema_for_join( left_schema=left_schema, diff --git a/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py b/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py index 7c622688..84b41bb8 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py +++ b/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py @@ -838,13 +838,13 @@ def build_private_join_transformation( def visit_join_private(self, expr: JoinPrivateExpr) -> Output: """Create a transformation from a JoinPrivate query expression.""" - left_transformation, left_ref, left_constraints = expr.child.accept(self) + left_transformation, left_ref, left_constraints = expr.left_child.accept(self) right_visitor = self._new_visitor_after_transformation(left_transformation) ( right_transformation, right_ref, right_constraints, - ) = expr.right_operand_expr.accept(right_visitor) + ) = expr.right_child.accept(right_visitor) if left_ref.parent != right_ref.parent: raise ValueError( diff --git a/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py b/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py index dd099728..7707a2af 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py +++ b/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py @@ -71,9 +71,9 @@ def wrapped(expr: QueryExpr) -> QueryExpr: child = wrapped(expr.child) return func(replace(expr, child=child)) if isinstance(expr, JoinPrivate): - left = wrapped(expr.child) - right = wrapped(expr.right_operand_expr) - return func(replace(expr, child=left, right_operand_expr=right)) + left = wrapped(expr.left_child) + right = wrapped(expr.right_child) + return func(replace(expr, left_child=left, right_child=right)) else: raise AnalyticsInternalError( f"Unrecognized QueryExpr subtype {type(expr).__qualname__}." 
diff --git a/test/unit/query_expr_compiler/test_measurement_visitor.py b/test/unit/query_expr_compiler/test_measurement_visitor.py index 45956d9d..e8791f98 100644 --- a/test/unit/query_expr_compiler/test_measurement_visitor.py +++ b/test/unit/query_expr_compiler/test_measurement_visitor.py @@ -1252,8 +1252,8 @@ def test_visit_groupby_bounded_stdev( ), ( JoinPrivate( - child=PrivateSource("private"), - right_operand_expr=PrivateSource("private_2"), + left_child=PrivateSource("private"), + right_child=PrivateSource("private_2"), truncation_strategy_left=TruncationStrategy.DropExcess(3), truncation_strategy_right=TruncationStrategy.DropExcess(3), ) diff --git a/test/unit/query_expr_compiler/transformation_visitor/test_add_rows.py b/test/unit/query_expr_compiler/transformation_visitor/test_add_rows.py index 42cc32d4..622244f2 100644 --- a/test/unit/query_expr_compiler/transformation_visitor/test_add_rows.py +++ b/test/unit/query_expr_compiler/transformation_visitor/test_add_rows.py @@ -360,8 +360,8 @@ def test_visit_flat_map_invalid(self) -> None: [ ( JoinPrivate( - child=PrivateSource("rows1"), - right_operand_expr=PrivateSource("rows2"), + left_child=PrivateSource("rows1"), + right_child=PrivateSource("rows2"), truncation_strategy_left=TruncationStrategy.DropExcess(3), truncation_strategy_right=TruncationStrategy.DropExcess(10), ), @@ -372,8 +372,8 @@ def test_visit_flat_map_invalid(self) -> None: ), ( JoinPrivate( - child=PrivateSource("rows2"), - right_operand_expr=PrivateSource("rows1"), + left_child=PrivateSource("rows2"), + right_child=PrivateSource("rows1"), truncation_strategy_left=TruncationStrategy.DropExcess(3), truncation_strategy_right=TruncationStrategy.DropNonUnique(), join_columns=tuple(["I"]), @@ -425,8 +425,8 @@ class InvalidStrategy(TruncationStrategy.Type): """An invalid truncation strategy.""" query1 = JoinPrivate( - child=PrivateSource("rows1"), - right_operand_expr=PrivateSource("rows2"), + left_child=PrivateSource("rows1"), + right_child=PrivateSource("rows2"), truncation_strategy_left=InvalidStrategy(), truncation_strategy_right=TruncationStrategy.DropExcess(3), ) @@ -437,8 +437,8 @@ class InvalidStrategy(TruncationStrategy.Type): query1.accept(self.visitor) query2 = JoinPrivate( - child=PrivateSource("rows1"), - right_operand_expr=PrivateSource("rows2"), + left_child=PrivateSource("rows1"), + right_child=PrivateSource("rows2"), truncation_strategy_left=TruncationStrategy.DropExcess(2), truncation_strategy_right=InvalidStrategy(), ) @@ -446,8 +446,8 @@ class InvalidStrategy(TruncationStrategy.Type): query2.accept(self.visitor) query3 = JoinPrivate( - child=PrivateSource("rows1"), - right_operand_expr=PrivateSource("rows2"), + left_child=PrivateSource("rows1"), + right_child=PrivateSource("rows2"), truncation_strategy_left=None, truncation_strategy_right=None, ) diff --git a/test/unit/test_query_builder.py b/test/unit/test_query_builder.py index 9626cca7..66f6bc06 100644 --- a/test/unit/test_query_builder.py +++ b/test/unit/test_query_builder.py @@ -166,9 +166,9 @@ def test_join_private(join_columns: Optional[Sequence[str]]): assert private_join_expr.truncation_strategy_right == TruncationStrategy.DropExcess( 2 ) - right_operand_expr = private_join_expr.right_operand_expr - assert isinstance(right_operand_expr, PrivateSource) - assert right_operand_expr.source_id == "private_2" + right_child = private_join_expr.right_child + assert isinstance(right_child, PrivateSource) + assert right_child.source_id == "private_2" assert isinstance(query_expr, GroupByCount) @@ -200,9 
+200,9 @@ def test_join_private_str() -> None: assert private_join_expr.truncation_strategy_right == TruncationStrategy.DropExcess( 2 ) - right_operand_expr = private_join_expr.right_operand_expr - assert isinstance(right_operand_expr, PrivateSource) - assert right_operand_expr.source_id == "private_2" + right_child = private_join_expr.right_child + assert isinstance(right_child, PrivateSource) + assert right_child.source_id == "private_2" assert isinstance(query_expr, GroupByCount) diff --git a/test/unit/test_query_expr_compiler.py b/test/unit/test_query_expr_compiler.py index 83964701..839bee34 100644 --- a/test/unit/test_query_expr_compiler.py +++ b/test/unit/test_query_expr_compiler.py @@ -1048,8 +1048,8 @@ def test_join_private(self, spark): ) transformation, reference, _constraints = self.compiler.build_transformation( JoinPrivate( - child=PrivateSource("private"), - right_operand_expr=PrivateSource("private_2"), + left_child=PrivateSource("private"), + right_child=PrivateSource("private_2"), truncation_strategy_left=TruncationStrategy.DropExcess(3), truncation_strategy_right=TruncationStrategy.DropExcess(3), ), @@ -1087,8 +1087,8 @@ def test_join_private(self, spark): [ ( JoinPrivate( - child=PrivateSource("private"), - right_operand_expr=PrivateSource("private_2"), + left_child=PrivateSource("private"), + right_child=PrivateSource("private_2"), truncation_strategy_left=TruncationStrategy.DropExcess(3), truncation_strategy_right=TruncationStrategy.DropExcess(3), ), @@ -1096,8 +1096,8 @@ def test_join_private(self, spark): ), ( JoinPrivate( - child=PrivateSource("private"), - right_operand_expr=PrivateSource("private_2"), + left_child=PrivateSource("private"), + right_child=PrivateSource("private_2"), truncation_strategy_left=TruncationStrategy.DropExcess(3), truncation_strategy_right=TruncationStrategy.DropExcess(1), ), @@ -1105,8 +1105,8 @@ def test_join_private(self, spark): ), ( JoinPrivate( - child=PrivateSource("private"), - right_operand_expr=PrivateSource("private_2"), + left_child=PrivateSource("private"), + right_child=PrivateSource("private_2"), truncation_strategy_left=TruncationStrategy.DropExcess(1), truncation_strategy_right=TruncationStrategy.DropExcess(1), ), @@ -1114,8 +1114,8 @@ def test_join_private(self, spark): ), ( JoinPrivate( - child=PrivateSource("private"), - right_operand_expr=PrivateSource("private_2"), + left_child=PrivateSource("private"), + right_child=PrivateSource("private_2"), truncation_strategy_left=TruncationStrategy.DropExcess(3), truncation_strategy_right=TruncationStrategy.DropNonUnique(), ), @@ -1123,8 +1123,8 @@ def test_join_private(self, spark): ), ( JoinPrivate( - child=PrivateSource("private"), - right_operand_expr=PrivateSource("private_2"), + left_child=PrivateSource("private"), + right_child=PrivateSource("private_2"), truncation_strategy_left=TruncationStrategy.DropNonUnique(), truncation_strategy_right=TruncationStrategy.DropNonUnique(), ), @@ -1156,8 +1156,8 @@ class Strategy(TruncationStrategy.Type): """An invalid truncation strategy.""" query = JoinPrivate( - child=PrivateSource("private"), - right_operand_expr=PrivateSource("private_2"), + left_child=PrivateSource("private"), + right_child=PrivateSource("private_2"), truncation_strategy_left=Strategy(), truncation_strategy_right=Strategy(), ) diff --git a/test/unit/test_query_expression_schema.py b/test/unit/test_query_expression_schema.py index e7f033c1..9357f154 100644 --- a/test/unit/test_query_expression_schema.py +++ b/test/unit/test_query_expression_schema.py @@ -982,8 
+982,8 @@ def test_schema_flat_map(self, query: FlatMap, expected_schema: Schema) -> None: [ ( JoinPrivate( - child=PrivateSource("private"), - right_operand_expr=PrivateSource("groupby_one_column_private"), + left_child=PrivateSource("private"), + right_child=PrivateSource("groupby_one_column_private"), truncation_strategy_left=TruncationStrategy.DropExcess(10), truncation_strategy_right=TruncationStrategy.DropExcess(10), ), @@ -1127,8 +1127,8 @@ def test_schema_join_private_nulls( catalog.add_private_table("left", left_schema) catalog.add_private_table("right", right_schema) query = JoinPrivate( - child=PrivateSource("left"), - right_operand_expr=PrivateSource("right"), + left_child=PrivateSource("left"), + right_child=PrivateSource("right"), truncation_strategy_left=TruncationStrategy.DropExcess(1), truncation_strategy_right=TruncationStrategy.DropExcess(1), ) From c6f31d5fff2592ef8ae37c3179d6634fad63cbd7 Mon Sep 17 00:00:00 2001 From: Ted Date: Thu, 30 Oct 2025 17:55:58 +0100 Subject: [PATCH 19/25] update links to my blog (#106) Co-authored-by: Damien Desfontaines --- doc/index.rst | 2 +- doc/topic-guides/privacy-budgets.rst | 10 +++++----- doc/topic-guides/understanding-sensitivity.rst | 2 +- doc/tutorials/first-steps.rst | 2 +- doc/tutorials/privacy-budget-basics.rst | 4 ++-- src/tmlt/analytics/privacy_budget.py | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/index.rst b/doc/index.rst index 88c24fa2..901e9c9e 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -97,7 +97,7 @@ If you have any questions, feedback, or feature requests, please `let us know on The Tumult Analytics documentation introduces all of the concepts necessary to get started producing differentially private results. Users who wish to learn more about the fundamentals of differential privacy can consult -`this blog post series `__ +`this blog post series `__ or `this longer introduction `__. .. diff --git a/doc/topic-guides/privacy-budgets.rst b/doc/topic-guides/privacy-budgets.rst index 93646b83..110ed428 100644 --- a/doc/topic-guides/privacy-budgets.rst +++ b/doc/topic-guides/privacy-budgets.rst @@ -22,10 +22,10 @@ Tumult Analytics currently supports three distinct privacy definitions: * *Pure differential privacy ("pure DP", or simply "DP")*, with its associated privacy parameter ``epsilon``. For data publication use cases, the value of ``epsilon`` is often chosen to be - `lower than 5 `_. + `lower than 5 `_. Values below 1 are typically considered conservative. Pure DP is the original definition of differential privacy. To learn more, consult this - `blog post `__. + `blog post `__. * *Approximate differential privacy ("Approx DP")*, with its associated privacy parameters ``epsilon`` and ``delta``. Approximate DP is a relaxation of PureDP where mechanisms are allowed to fail to provide Pure DP guarantees with some (hopefully small) probability,``delta``. Delta can @@ -33,11 +33,11 @@ Tumult Analytics currently supports three distinct privacy definitions: smaller than ``1/n``, where ``n`` is the number of people in your dataset. The relaxation of Pure DP enables new types of queries and can therefore be a powerful tool that still offers reasonable privacy protections when used correctly. To learn more, consult this - `blog post `__. + `blog post `__. * *Zero-concentrated differential privacy ("zCDP")*, with its associated privacy parameter ``rho``. zCDP is a variant of differential privacy, which adjusts the way privacy loss is measured. To learn more, consult this: - `blog post `__. 
+ `blog post `__. Queries with higher values of epsilon, delta, and rho will produce results with less randomization that are therefore more accurate, whereas @@ -147,7 +147,7 @@ their suspicion, whereas larger epsilons allow the attacker to determine with in certainty whether or not their target is in the database. For an even more in-depth explanation of this topic, you can check out the following -`blog post `__. +`blog post `__. The impact of data size ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/topic-guides/understanding-sensitivity.rst b/doc/topic-guides/understanding-sensitivity.rst index 96f5a886..f9dc2845 100644 --- a/doc/topic-guides/understanding-sensitivity.rst +++ b/doc/topic-guides/understanding-sensitivity.rst @@ -228,7 +228,7 @@ Analytics, it is certainly not exhaustive. If you have additional questions, fee to reach out to us on `our Slack server `_! -.. _blog post: https://desfontain.es/privacy/gaussian-noise.html +.. _blog post: https://desfontain.es/blog/gaussian-noise.html diff --git a/doc/tutorials/first-steps.rst b/doc/tutorials/first-steps.rst index 2e1611c7..ead82e90 100644 --- a/doc/tutorials/first-steps.rst +++ b/doc/tutorials/first-steps.rst @@ -28,7 +28,7 @@ accomplish common tasks. To learn more about the trade-offs involved in parameter setting and mechanism design, you can consult our :ref:`topic guides `. -.. _differential privacy: https://desfontain.es/privacy/friendly-intro-to-differential-privacy.html +.. _differential privacy: https://desfontain.es/blog/friendly-intro-to-differential-privacy.html Setup ----- diff --git a/doc/tutorials/privacy-budget-basics.rst b/doc/tutorials/privacy-budget-basics.rst index f2c5fbd9..0c3d08db 100644 --- a/doc/tutorials/privacy-budget-basics.rst +++ b/doc/tutorials/privacy-budget-basics.rst @@ -29,9 +29,9 @@ fundamentals, you can consult the following resources. - If you would like to know what privacy parameters are commonly used for data publication, you can consult this `list of real-world use cases`_. -.. _explainer: https://desfontain.es/privacy/differential-privacy-in-more-detail.html +.. _explainer: https://desfontain.es/blog/differential-privacy-in-more-detail.html -.. _list of real-world use cases: https://desfontain.es/privacy/real-world-differential-privacy.html +.. _list of real-world use cases: https://desfontain.es/blog/real-world-differential-privacy.html These are only optional reading! The one-sentence summary above (smaller budget = better privacy) is enough to follow the rest of this tutorial. Let's get diff --git a/src/tmlt/analytics/privacy_budget.py b/src/tmlt/analytics/privacy_budget.py index 18f71705..a8d32d4d 100644 --- a/src/tmlt/analytics/privacy_budget.py +++ b/src/tmlt/analytics/privacy_budget.py @@ -230,7 +230,7 @@ class ApproxDPBudget(PrivacyBudget): This privacy definition is also known as (ε, δ)-differential privacy, and the associated privacy parameters are epsilon and delta. The formal definition can - be found `here `__. + be found `here `__. 
""" # pylint: disable=line-too-long _epsilon: ExactNumber From 88f4853872a63fe9abfab52c21c4840cf99a3198 Mon Sep 17 00:00:00 2001 From: Ted Date: Sat, 1 Nov 2025 16:01:52 +0100 Subject: [PATCH 20/25] Simplify joined schema generation by relying more on Core logic (#105) * simplify join schema generation * review comments --------- Co-authored-by: Damien Desfontaines --- src/tmlt/analytics/_query_expr.py | 111 ++++++++++++++---------------- 1 file changed, 51 insertions(+), 60 deletions(-) diff --git a/src/tmlt/analytics/_query_expr.py b/src/tmlt/analytics/_query_expr.py index 05147d7c..67bff179 100644 --- a/src/tmlt/analytics/_query_expr.py +++ b/src/tmlt/analytics/_query_expr.py @@ -36,6 +36,7 @@ analytics_to_py_types, analytics_to_spark_columns_descriptor, analytics_to_spark_schema, + spark_dataframe_domain_to_analytics_columns, spark_schema_to_analytics_columns, ) from tmlt.analytics.config import config @@ -761,52 +762,35 @@ def __eq__(self, other: object) -> bool: ) -def _schema_for_join( +def _validate_join( left_schema: Schema, right_schema: Schema, join_columns: Optional[Tuple[str, ...]], - join_id_space: Optional[str] = None, - how: str = "inner", -) -> Schema: - """Return the schema resulting from joining two tables. - - It is assumed that if either schema has an ID column, the one from left_schema - should be used, because this is true for both public and private joins. With private - joins, the ID columns must be compatible; this check must happen outside this - function. +): + """Validates that both tables can be joined by comparing their schemas. - Args: - left_schema: Schema for the left table. - right_schema: Schema for the right table. - join_columns: The set of columns to join on. - join_id_space: The ID space of the resulting join. - how: The type of join to perform. Default is "inner". + This is used for both public and private joins; therefore, this does not check + any properties related to ID columns & ID spaces. """ - if left_schema.grouping_column is None: - grouping_column = right_schema.grouping_column - elif right_schema.grouping_column is None: - grouping_column = left_schema.grouping_column - elif left_schema.grouping_column == right_schema.grouping_column: - grouping_column = left_schema.grouping_column - else: + if ( + left_schema.grouping_column is not None + and right_schema.grouping_column is not None + and left_schema.grouping_column != right_schema.grouping_column + ): raise ValueError( "Joining tables which both have grouping columns is only supported " "if they have the same grouping column" ) - common_columns = set(left_schema) & set(right_schema) - if join_columns is None and not common_columns: - raise ValueError("Tables have no common columns to join on") + if join_columns is not None and not join_columns: # This error case should be caught when constructing the query # expression, so it should never get here. 
raise AnalyticsInternalError("Empty list of join columns provided.") - join_columns = ( - join_columns - if join_columns - else tuple(sorted(common_columns, key=list(left_schema).index)) - ) - + common_columns = set(left_schema) & set(right_schema) + if join_columns is None and not common_columns: + raise ValueError("Tables have no common columns to join on") + join_columns = join_columns or tuple(common_columns) if not set(join_columns) <= common_columns: raise ValueError("Join columns must be common to both tables") @@ -818,23 +802,35 @@ def _schema_for_join( f"{right_schema[column]} are incompatible" ) - join_column_schemas = {column: left_schema[column] for column in join_columns} - output_schema = { - **join_column_schemas, - **{ - column + ("_left" if column in common_columns else ""): left_schema[column] - for column in left_schema - if column not in join_columns - }, - **{ - column - + ("_right" if column in common_columns else ""): right_schema[column] - for column in right_schema - if column not in join_columns - }, - } - # Use Core's join utilities for determining whether a column can be null - # TODO: This could potentially be used more in this function + +def _schema_for_join( + left_schema: Schema, + right_schema: Schema, + join_columns: Optional[Tuple[str, ...]], + join_id_space: Optional[str], + how: str, +) -> Schema: + """Return the schema resulting from joining two tables. + + It is assumed that if either schema has an ID column, the one from left_schema + should be used, because this is true for both public and private joins. With private + joins, the ID columns must be compatible; this check must happen outside this + function. + + Args: + left_schema: Schema for the left table. + right_schema: Schema for the right table. + join_columns: The set of columns to join on. + join_id_space: The ID space of the resulting join. + how: The type of join to perform. + """ + grouping_column = left_schema.grouping_column or right_schema.grouping_column + common_columns = set(left_schema) & set(right_schema) + join_columns = join_columns or tuple( + sorted(common_columns, key=list(left_schema).index) + ) + + # Get the join schema from the Core convenience method output_domain = domain_after_join( left_domain=SparkDataFrameDomain( analytics_to_spark_columns_descriptor(left_schema) @@ -846,16 +842,8 @@ def _schema_for_join( how=how, nulls_are_equal=True, ) - for column in output_schema: - col_schema = output_schema[column] - output_schema[column] = ColumnDescriptor( - column_type=col_schema.column_type, - allow_null=output_domain.schema[column].allow_null, - allow_nan=col_schema.allow_nan, - allow_inf=col_schema.allow_inf, - ) return Schema( - output_schema, + column_descs=spark_dataframe_domain_to_analytics_columns(output_domain), grouping_column=grouping_column, id_column=left_schema.id_column, id_space=join_id_space, @@ -921,6 +909,7 @@ def _validate(self, left_schema: Schema, right_schema: Schema): "Private joins between tables with the AddRowsWithID protected change" " are only possible when both tables are in the same ID space" ) + _validate_join(left_schema, right_schema, self.join_columns) def schema(self, catalog: Catalog) -> Schema: """Returns the schema resulting from evaluating this QueryExpr. 
@@ -941,6 +930,7 @@ def schema(self, catalog: Catalog) -> Schema:
             right_schema=right_schema,
             join_columns=self.join_columns,
             join_id_space=left_schema.id_space,
+            how="inner",
         )
 
     def accept(self, visitor: "QueryExprVisitor") -> Any:
@@ -982,7 +972,7 @@ def __post_init__(self):
                 f"Invalid join type '{self.how}': must be 'inner' or 'left'"
             )
 
-    def _validate(self, catalog: Catalog):
+    def _validate(self, catalog: Catalog, left_schema: Schema, right_schema: Schema):
         """Validation checks for this QueryExpr."""
         if isinstance(self.public_table, str):
             if not isinstance(catalog.tables[self.public_table], PublicTable):
@@ -990,6 +980,7 @@ def _validate(self, catalog: Catalog):
                     f"Attempted public join on table '{self.public_table}', "
                     "which is not a public table"
                 )
+        _validate_join(left_schema, right_schema, self.join_columns)
 
     def schema(self, catalog: Catalog) -> Schema:
         """Returns the schema resulting from evaluating this QueryExpr.
@@ -998,13 +989,13 @@ def schema(self, catalog: Catalog) -> Schema:
         table is the left table.
         """
         input_schema = self.child.schema(catalog)
-        self._validate(catalog)
         if isinstance(self.public_table, str):
             right_schema = catalog.tables[self.public_table].schema
         else:
             right_schema = Schema(
                 spark_schema_to_analytics_columns(self.public_table.schema)
             )
+        self._validate(catalog, input_schema, right_schema)
         return _schema_for_join(
             left_schema=input_schema,
             right_schema=right_schema,
From 018da2ed4dba27bce465a2ad35a5a008506651c3 Mon Sep 17 00:00:00 2001
From: Ted
Date: Mon, 3 Nov 2025 23:17:10 +0100
Subject: [PATCH 21/25] Move the handling of special values into a rewrite rule (#102)

* create rewriting logic, fix docstrings
* rewrite session tests for special values
* add a test for get_bounds
* unit tests for the new rewrite rule
* lint
* add todo for changelog
* add tests with privacy IDs
* changelog, tests tests tests
* more tests. send help
* yay the xfail is a pass now
* more tests to please the review gods
* better test comments
* lint
* whoops where did that come from
* why would this start failing now?!
* remove visitor tests checking for special values; they are now handled in the rewriting rules instead

---------

Co-authored-by: Damien Desfontaines
---
 CHANGELOG.rst                                 |   2 +
 src/tmlt/analytics/_query_expr.py             |  55 +-
 .../_base_measurement_visitor.py              |  68 +-
 .../_query_expr_compiler/_rewrite_rules.py    |  65 ++
 src/tmlt/analytics/query_builder.py           |  59 +-
 test/system/session/rows/conftest.py          |  54 -
 .../rows/test_add_max_rows_infs_nulls.py      | 512 ----------
 test/system/session/test_special_values.py    | 953 ++++++++++++++++++
 .../test_measurement_visitor.py               |  52 -
 .../query_expr_compiler/test_rewrite_rules.py | 245 ++++-
 10 files changed, 1307 insertions(+), 758 deletions(-)
 delete mode 100644 test/system/session/rows/test_add_max_rows_infs_nulls.py
 create mode 100644 test/system/session/test_special_values.py

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index e5c59355..0d5ccec5 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -17,6 +17,8 @@ Changed
 - Aggregation mechanisms can now be specified as strings instead of enums, e.g. ``"laplace"`` instead of ``CountMechanism.LAPLACE`` or ``SumMechanism.LAPLACE``.
 - Removed previously deprecated argument ``max_num_rows`` to ``flat_map``. Use ``max_rows`` instead.
 - Removed previously deprecated argument ``cols`` to ``count_distinct``. Use ``columns`` instead.
+- Infinity values are now automatically dropped before a floating-point column is passed to ``get_bounds``. (The documentation previously claimed that this was done, but this was not the case.)
+- Fixed the documentation of the behavior of some numeric aggregations (``sum``, ``average``, ``stdev``, ``variance``, ``quantile``) to match the actual behavior: infinity values are clamped using the specified bounds before being passed to the aggregation function, not dropped.
 
 .. _v0.20.2:
 
diff --git a/src/tmlt/analytics/_query_expr.py b/src/tmlt/analytics/_query_expr.py
index 67bff179..54a8874d 100644
--- a/src/tmlt/analytics/_query_expr.py
+++ b/src/tmlt/analytics/_query_expr.py
@@ -1,11 +1,10 @@
 """Building blocks of the Tumult Analytics query language. Not for direct use.
 
-Defines the :class:`QueryExpr` class, which represents expressions in the
-Tumult Analytics query language. QueryExpr and its subclasses should not be
-directly constructed or deconstructed by most users; interfaces such as
-:class:`tmlt.analytics.QueryBuilder` to create them and
-:class:`tmlt.analytics.Session` to consume them provide more
-user-friendly features.
+Defines the :class:`QueryExpr` class, which represents expressions in the Tumult
+Analytics query language. QueryExpr and its subclasses should not be directly
+constructed, but instead built using a :class:`tmlt.analytics.QueryBuilder`. The
+documentation of the :class:`tmlt.analytics.QueryBuilder` provides more information
+about the intended semantics of :class:`QueryExpr` objects.
 """
 
 # SPDX-License-Identifier: Apache-2.0
@@ -175,14 +174,10 @@ class StdevMechanism(Enum):
 class QueryExpr(ABC):
     """A query expression, base class for relational operators.
 
-    In most cases, QueryExpr should not be manipulated directly, but rather
-    created using :class:`tmlt.analytics.QueryBuilder` and then
-    consumed by :class:`tmlt.analytics.Session`. While they can be
-    created and modified directly, this is an advanced usage and is not
-    recommended for typical users.
-
-    QueryExpr are organized in a tree, where each node is an operator which
-    returns a relation.
+    QueryExpr objects are organized in a tree, where each node is an operator that
+    returns a table. They are built using the :class:`tmlt.analytics.QueryBuilder`,
+    then rewritten during the compilation process. They should not be created
+    directly, except in tests.
     """
 
     @abstractmethod
@@ -1775,13 +1770,7 @@ def accept(self, visitor: "QueryExprVisitor") -> Any:
 
 @dataclass(frozen=True)
 class GroupByBoundedSum(SingleChildQueryExpr):
-    """Returns the bounded sum of a column for each combination of groupby domains.
-
-    If the column to be measured contains null, NaN, or positive or negative infinity,
-    those values will be dropped (as if dropped explicitly via
-    :class:`DropNullAndNan` and :class:`DropInfinity`) before the sum is
-    calculated.
-    """
+    """Returns the bounded sum of a column for each combination of groupby domains."""
 
     groupby_keys: Union[KeySet, Tuple[str, ...]]
     """The keys, or columns list to collect keys from, to be grouped on."""
@@ -1842,13 +1831,7 @@ def accept(self, visitor: "QueryExprVisitor") -> Any:
 
 @dataclass(frozen=True)
 class GroupByBoundedAverage(SingleChildQueryExpr):
-    """Returns bounded average of a column for each combination of groupby domains.
-
-    If the column to be measured contains null, NaN, or positive or negative infinity,
-    those values will be dropped (as if dropped explicitly via
-    :class:`DropNullAndNan` and :class:`DropInfinity`) before the average is
-    calculated.
- """ + """Returns bounded average of a column for each combination of groupby domains.""" groupby_keys: Union[KeySet, Tuple[str, ...]] """The keys, or columns list to collect keys from, to be grouped on.""" @@ -1909,13 +1892,7 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) class GroupByBoundedVariance(SingleChildQueryExpr): - """Returns bounded variance of a column for each combination of groupby domains. - - If the column to be measured contains null, NaN, or positive or negative infinity, - those values will be dropped (as if dropped explicitly via - :class:`DropNullAndNan` and :class:`DropInfinity`) before the variance is - calculated. - """ + """Returns bounded variance of a column for each combination of groupby domains.""" groupby_keys: Union[KeySet, Tuple[str, ...]] """The keys, or columns list to collect keys from, to be grouped on.""" @@ -1976,13 +1953,7 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: @dataclass(frozen=True) class GroupByBoundedStdev(SingleChildQueryExpr): - """Returns bounded stdev of a column for each combination of groupby domains. - - If the column to be measured contains null, NaN, or positive or negative infinity, - those values will be dropped (as if dropped explicitly via - :class:`DropNullAndNan` and :class:`DropInfinity`) before the - standard deviation is calculated. - """ + """Returns bounded stdev of a column for each combination of groupby domains.""" groupby_keys: Union[KeySet, Tuple[str, ...]] """The keys, or columns list to collect keys from, to be grouped on.""" diff --git a/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py b/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py index 19b7c52c..554c1e26 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py +++ b/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py @@ -1,5 +1,4 @@ """Defines a base class for building measurement visitors.""" -import dataclasses import math import warnings from abc import abstractmethod @@ -101,7 +100,7 @@ SuppressAggregates, VarianceMechanism, ) -from tmlt.analytics._schema import ColumnType, FrozenDict, Schema +from tmlt.analytics._schema import Schema from tmlt.analytics._table_identifier import Identifier from tmlt.analytics._table_reference import TableReference from tmlt.analytics._transformation_utils import get_table_from_ref @@ -675,65 +674,6 @@ def _validate_approxDP_and_adjust_budget( else: raise AnalyticsInternalError(f"Unknown mechanism {mechanism}.") - def _add_special_value_handling_to_query( - self, - query: Union[ - GroupByBoundedAverage, - GroupByBoundedStdev, - GroupByBoundedSum, - GroupByBoundedVariance, - GroupByQuantile, - GetBounds, - ], - ): - """Returns a new query that handles nulls, NaNs and infinite values. - - If the measure column allows nulls or NaNs, the new query - will drop those values. - - If the measure column allows infinite values, the new query will replace those - values with the low and high values specified in the query. - - These changes are added immediately before the groupby aggregation in the query. - """ - expected_schema = query.child.schema(self.catalog) - - # You can't perform these queries on nulls, NaNs, or infinite values - # so check for those - try: - measure_desc = expected_schema[query.measure_column] - except KeyError as e: - raise KeyError( - f"Measure column {query.measure_column} is not in the input schema." 
- ) from e - - new_child: QueryExpr - # If null or NaN values are allowed ... - if measure_desc.allow_null or ( - measure_desc.column_type == ColumnType.DECIMAL and measure_desc.allow_nan - ): - # then drop those values - # (but don't mutate the original query) - new_child = DropNullAndNan( - child=query.child, columns=tuple([query.measure_column]) - ) - query = dataclasses.replace(query, child=new_child) - if not isinstance(query, GetBounds): - # If infinite values are allowed... - if ( - measure_desc.column_type == ColumnType.DECIMAL - and measure_desc.allow_inf - ): - # then clamp them (to low/high values) - new_child = ReplaceInfinity( - child=query.child, - replace_with=FrozenDict.from_dict( - {query.measure_column: (query.low, query.high)} - ), - ) - query = dataclasses.replace(query, child=new_child) - return query - def _validate_measurement(self, measurement: Measurement, mid_stability: sp.Expr): """Validate a measurement.""" if isinstance(self.adjusted_budget.value, tuple): @@ -1146,7 +1086,6 @@ def visit_groupby_quantile( # Peek at the schema, to see if there are errors there expr.schema(self.catalog) - expr = self._add_special_value_handling_to_query(expr) if isinstance(expr.groupby_keys, KeySet): groupby_cols = tuple(expr.groupby_keys.dataframe().columns) @@ -1241,7 +1180,6 @@ def visit_groupby_bounded_sum( # Peek at the schema, to see if there are errors there expr.schema(self.catalog) - expr = self._add_special_value_handling_to_query(expr) if isinstance(expr.groupby_keys, KeySet): groupby_cols = tuple(expr.groupby_keys.dataframe().columns) @@ -1337,7 +1275,6 @@ def visit_groupby_bounded_average( # Peek at the schema, to see if there are errors there expr.schema(self.catalog) - expr = self._add_special_value_handling_to_query(expr) if isinstance(expr.groupby_keys, KeySet): groupby_cols = tuple(expr.groupby_keys.dataframe().columns) @@ -1433,7 +1370,6 @@ def visit_groupby_bounded_variance( # Peek at the schema, to see if there are errors there expr.schema(self.catalog) - expr = self._add_special_value_handling_to_query(expr) if isinstance(expr.groupby_keys, KeySet): groupby_cols = tuple(expr.groupby_keys.dataframe().columns) @@ -1529,7 +1465,6 @@ def visit_groupby_bounded_stdev( # Peek at the schema, to see if there are errors there expr.schema(self.catalog) - expr = self._add_special_value_handling_to_query(expr) if isinstance(expr.groupby_keys, KeySet): groupby_cols = tuple(expr.groupby_keys.dataframe().columns) @@ -1622,7 +1557,6 @@ def visit_get_bounds(self, expr: GetBounds) -> Tuple[Measurement, NoiseInfo]: # Peek at the schema, to see if there are errors there expr.schema(self.catalog) - expr = self._add_special_value_handling_to_query(expr) if isinstance(expr.groupby_keys, KeySet): groupby_cols = tuple(expr.groupby_keys.dataframe().columns) keyset_budget = self._get_zero_budget() diff --git a/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py b/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py index 7707a2af..7d14173b 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py +++ b/src/tmlt/analytics/_query_expr_compiler/_rewrite_rules.py @@ -20,15 +20,21 @@ AverageMechanism, CountDistinctMechanism, CountMechanism, + DropInfinity, + DropNullAndNan, + FrozenDict, + GetBounds, GroupByBoundedAverage, GroupByBoundedStdev, GroupByBoundedSum, GroupByBoundedVariance, GroupByCount, GroupByCountDistinct, + GroupByQuantile, JoinPrivate, PrivateSource, QueryExpr, + ReplaceInfinity, SingleChildQueryExpr, StdevMechanism, SumMechanism, @@ -193,9 +199,68 @@ 
def select_noise(expr: QueryExpr) -> QueryExpr: return select_noise +def add_special_value_handling( + info: CompilationInfo, +) -> Callable[[QueryExpr], QueryExpr]: + """Rewrites the query to handle nulls, NaNs and infinite values. + + If the measure column allows nulls or NaNs, the rewritten query will drop those + values. If the measure column allows infinite values, the new query will replace + those values with the clamping bounds specified in the query, or drop these values + for :meth:`~tmlt.analytics.QueryBuilder.get_bounds`. + """ + + @depth_first + def handle_special_values(expr: QueryExpr) -> QueryExpr: + if not isinstance( + expr, + ( + GroupByBoundedAverage, + GroupByBoundedStdev, + GroupByBoundedSum, + GroupByBoundedVariance, + GroupByQuantile, + GetBounds, + ), + ): + return expr + schema = expr.child.schema(info.catalog) + measure_desc = schema[expr.measure_column] + # Remove nulls/NaN if necessary + if measure_desc.allow_null or ( + measure_desc.column_type == ColumnType.DECIMAL and measure_desc.allow_nan + ): + expr = replace( + expr, + child=DropNullAndNan(child=expr.child, columns=(expr.measure_column,)), + ) + # Remove infinities if necessary + if measure_desc.column_type == ColumnType.DECIMAL and measure_desc.allow_inf: + if isinstance(expr, GetBounds): + return replace( + expr, + child=DropInfinity( + child=expr.child, columns=(expr.measure_column,) + ), + ) + return replace( + expr, + child=ReplaceInfinity( + child=expr.child, + replace_with=FrozenDict.from_dict( + {expr.measure_column: (expr.low, expr.high)} + ), + ), + ) + return expr + + return handle_special_values + + def rewrite(info: CompilationInfo, expr: QueryExpr) -> QueryExpr: """Rewrites the given QueryExpr into a QueryExpr that can be compiled.""" rewrite_rules = [ + add_special_value_handling(info), select_noise_mechanism(info), ] for rule in rewrite_rules: diff --git a/src/tmlt/analytics/query_builder.py b/src/tmlt/analytics/query_builder.py index a427220f..478e9f33 100644 --- a/src/tmlt/analytics/query_builder.py +++ b/src/tmlt/analytics/query_builder.py @@ -764,7 +764,7 @@ def replace_infinity( ) return self - def drop_null_and_nan(self, columns: Optional[List[str]]) -> "QueryBuilder": + def drop_null_and_nan(self, columns: Optional[List[str]] = None) -> "QueryBuilder": """Removes rows containing null or NaN values. .. note:: @@ -869,7 +869,7 @@ def drop_null_and_nan(self, columns: Optional[List[str]]) -> "QueryBuilder": ) return self - def drop_infinity(self, columns: Optional[List[str]]) -> "QueryBuilder": + def drop_infinity(self, columns: Optional[List[str]] = None) -> "QueryBuilder": """Remove rows containing infinite values. .. @@ -2453,12 +2453,6 @@ def sum( ) -> Query: """Returns a sum query ready to be evaluated. - .. note:: - If the column being measured contains NaN or null values, a - :meth:`~drop_null_and_nan` query will be performed first. If the - column being measured contains infinite values, a - :meth:`~drop_infinity` query will be performed first. - .. note:: Regarding the clamping bounds: @@ -2472,6 +2466,12 @@ def sum( Consult the :ref:`Numerical aggregations ` tutorial for more information. + .. note:: + If the column being measured contains NaN or null values, a + :meth:`~drop_null_and_nan` query will be performed first. If the column + being measured contains infinite values, these values will be clamped + between ``low`` and ``high``. + .. >>> from tmlt.analytics import ( ... 
AddOneRow, @@ -2541,12 +2541,6 @@ def average( ) -> Query: """Returns an average query ready to be evaluated. - .. note:: - If the column being measured contains NaN or null values, a - :meth:`~drop_null_and_nan` query will be performed first. If the - column being measured contains infinite values, a - :meth:`~drop_infinity` query will be performed first. - .. note:: Regarding the clamping bounds: @@ -2560,6 +2554,12 @@ def average( Consult the :ref:`Numerical aggregations ` tutorial for more information. + .. note:: + If the column being measured contains NaN or null values, a + :meth:`~drop_null_and_nan` query will be performed first. If the column + being measured contains infinite values, these values will be clamped + between ``low`` and ``high``. + .. >>> from tmlt.analytics import ( ... AddOneRow, @@ -2629,12 +2629,6 @@ def variance( ) -> Query: """Returns a variance query ready to be evaluated. - .. note:: - If the column being measured contains NaN or null values, a - :meth:`~drop_null_and_nan` query will be performed first. If the - column being measured contains infinite values, a - :meth:`~drop_infinity` query will be performed first. - .. note:: Regarding the clamping bounds: @@ -2648,6 +2642,12 @@ def variance( Consult the :ref:`Numerical aggregations ` tutorial for more information. + .. note:: + If the column being measured contains NaN or null values, a + :meth:`~drop_null_and_nan` query will be performed first. If the column + being measured contains infinite values, these values will be clamped + between ``low`` and ``high``. + .. >>> from tmlt.analytics import ( ... AddOneRow, @@ -2717,12 +2717,6 @@ def stdev( ) -> Query: """Returns a standard deviation query ready to be evaluated. - .. note:: - If the column being measured contains NaN or null values, a - :meth:`~drop_null_and_nan` query will be performed first. If the - column being measured contains infinite values, a - :meth:`~drop_infinity` query will be performed first. - .. note:: Regarding the clamping bounds: @@ -2736,6 +2730,12 @@ def stdev( Consult the :ref:`Numerical aggregations ` tutorial for more information. + .. note:: + If the column being measured contains NaN or null values, a + :meth:`~drop_null_and_nan` query will be performed first. If the column + being measured contains infinite values, these values will be clamped + between ``low`` and ``high``. + .. >>> from tmlt.analytics import ( ... AddOneRow, @@ -2995,10 +2995,9 @@ def quantile( .. note:: If the column being measured contains NaN or null values, a - :meth:`~QueryBuilder.drop_null_and_nan` query will be performed - first. If the column being measured contains infinite values, a - :meth:`~QueryBuilder.drop_infinity` query will be performed first. - + :meth:`~QueryBuilder.drop_null_and_nan` query will be performed first. If + the column being measured contains infinite values, these values will be + clamped between ``low`` and ``high``. .. >>> from tmlt.analytics import ( ... 
AddOneRow, diff --git a/test/system/session/rows/conftest.py b/test/system/session/rows/conftest.py index 265f6e7c..10368575 100644 --- a/test/system/session/rows/conftest.py +++ b/test/system/session/rows/conftest.py @@ -38,7 +38,6 @@ FrozenDict, Schema, analytics_to_spark_columns_descriptor, - analytics_to_spark_schema, ) # Shorthands for some values used in tests @@ -709,56 +708,3 @@ def sess_data(spark, request): analytics_to_spark_columns_descriptor(Schema(sdf_col_types)) ) request.cls.sdf_input_domain = sdf_input_domain - - -###DATA FOR SESSIONS WITH NULLS### -@pytest.fixture(name="null_session_data", scope="class") -def null_setup(spark, request): - """Set up test data for sessions with nulls.""" - # Since Spark gives back timestamps with microsecond accuracy, this - # dataframe needs to make that the default precision for column T. - pdf = pd.DataFrame( - [ - ["a0", 0, 0.0, datetime.date(2000, 1, 1), datetime.datetime(2020, 1, 1)], - [None, 1, 1.0, datetime.date(2001, 1, 1), datetime.datetime(2021, 1, 1)], - ["a2", None, 2.0, datetime.date(2002, 1, 1), datetime.datetime(2022, 1, 1)], - ["a3", 3, None, datetime.date(2003, 1, 1), datetime.datetime(2023, 1, 1)], - ["a4", 4, 4.0, None, datetime.datetime(2024, 1, 1)], - ["a5", 5, 5.0, datetime.date(2005, 1, 1), None], - ], - columns=["A", "I", "X", "D", "T"], - ).astype({"T": "datetime64[us]"}) - - request.cls.pdf = pdf - - sdf_col_types = { - "A": ColumnDescriptor(ColumnType.VARCHAR, allow_null=True), - "I": ColumnDescriptor(ColumnType.INTEGER, allow_null=True), - "X": ColumnDescriptor(ColumnType.DECIMAL, allow_null=True), - "D": ColumnDescriptor(ColumnType.DATE, allow_null=True), - "T": ColumnDescriptor(ColumnType.TIMESTAMP, allow_null=True), - } - - sdf = spark.createDataFrame( - pdf, schema=analytics_to_spark_schema(Schema(sdf_col_types)) - ) - request.cls.sdf = sdf - - -###DATA FOR SESSIONS WITH INF VALUES### -@pytest.fixture(name="infs_test_data", scope="class") -def infs_setup(spark, request): - """Set up tests.""" - pdf = pd.DataFrame( - {"A": ["a0", "a0", "a1", "a1"], "B": [float("-inf"), 2.0, 5.0, float("inf")]} - ) - request.cls.pdf = pdf - - sdf_col_types = { - "A": ColumnDescriptor(ColumnType.VARCHAR), - "B": ColumnDescriptor(ColumnType.DECIMAL, allow_inf=True), - } - sdf = spark.createDataFrame( - pdf, schema=analytics_to_spark_schema(Schema(sdf_col_types)) - ) - request.cls.sdf = sdf diff --git a/test/system/session/rows/test_add_max_rows_infs_nulls.py b/test/system/session/rows/test_add_max_rows_infs_nulls.py deleted file mode 100644 index 20738d87..00000000 --- a/test/system/session/rows/test_add_max_rows_infs_nulls.py +++ /dev/null @@ -1,512 +0,0 @@ -"""System tests for Sessions with Nulls and Infs.""" - -# SPDX-License-Identifier: Apache-2.0 -# Copyright Tumult Labs 2025 - -import datetime -from typing import Any, Dict, List, Mapping, Tuple, Union - -import pandas as pd -import pytest -from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.types import StringType, StructField, StructType -from tmlt.core.measurements.interactive_measurements import SequentialQueryable -from tmlt.core.utils.testing import Case, parametrize - -from tmlt.analytics import ( - AddOneRow, - AnalyticsDefault, - ColumnDescriptor, - ColumnType, - KeySet, - PureDPBudget, - QueryBuilder, - Session, - TruncationStrategy, -) -from tmlt.analytics._table_identifier import NamedTable - -from ....conftest import assert_frame_equal_with_sort - - -@pytest.mark.usefixtures("null_session_data") -class TestSessionWithNulls: - """Tests for 
sessions with Nulls.""" - - pdf: pd.DataFrame - sdf: DataFrame - - def _expected_replace(self, d: Mapping[str, Any]) -> pd.DataFrame: - """The expected value if you replace None with default values in d.""" - new_cols: List[pd.DataFrame] = [] - for col in list(self.pdf.columns): - if col in dict(d): - # make sure I becomes an integer here - if col == "I": - new_cols.append(self.pdf[col].fillna(dict(d)[col]).astype(int)) - else: - new_cols.append(self.pdf[col].fillna(dict(d)[col])) - else: - new_cols.append(self.pdf[col]) - # `axis=1` means that you want to "concatenate" by columns - # i.e., you want your new table to look like this: - # df1 | df2 | df3 | ... - # df1 | df2 | df3 | ... - return pd.concat(new_cols, axis=1) - - def test_expected_replace(self) -> None: - """Test the test method _expected_replace.""" - d = { - "A": "a999", - "I": -999, - "X": 99.9, - "D": datetime.date(1999, 1, 1), - "T": datetime.datetime(2019, 1, 1), - } - expected = pd.DataFrame( - [ - [ - "a0", - 0, - 0.0, - datetime.date(2000, 1, 1), - datetime.datetime(2020, 1, 1), - ], - [ - "a999", - 1, - 1.0, - datetime.date(2001, 1, 1), - datetime.datetime(2021, 1, 1), - ], - [ - "a2", - -999, - 2.0, - datetime.date(2002, 1, 1), - datetime.datetime(2022, 1, 1), - ], - [ - "a3", - 3, - 99.9, - datetime.date(2003, 1, 1), - datetime.datetime(2023, 1, 1), - ], - [ - "a4", - 4, - 4.0, - datetime.date(1999, 1, 1), - datetime.datetime(2024, 1, 1), - ], - [ - "a5", - 5, - 5.0, - datetime.date(2005, 1, 1), - datetime.datetime(2019, 1, 1), - ], - ], - columns=["A", "I", "X", "D", "T"], - ) - assert_frame_equal_with_sort(self.pdf, self._expected_replace({})) - assert_frame_equal_with_sort( - expected, - self._expected_replace(d), - ) - - @pytest.mark.parametrize( - "cols_to_defaults", - [ - ({"A": "aaaaaaa"}), - ({"I": 999}), - ( - { - "A": "aaa", - "I": 999, - "X": -99.9, - "D": datetime.date.fromtimestamp(0), - "T": datetime.datetime.fromtimestamp(0), - } - ), - ], - ) - def test_replace_null_and_nan( - self, - cols_to_defaults: Mapping[ - str, Union[int, float, str, datetime.date, datetime.datetime] - ], - ) -> None: - """Test Session.replace_null_and_nan.""" - session = Session.from_dataframe( - PureDPBudget(float("inf")), - "private", - self.sdf, - protected_change=AddOneRow(), - ) - session.create_view( - QueryBuilder("private").replace_null_and_nan(cols_to_defaults), - "replaced", - cache=False, - ) - # pylint: disable=protected-access - queryable = session._accountant._queryable - assert isinstance(queryable, SequentialQueryable) - data = queryable._data - assert isinstance(data, dict) - assert isinstance(data[NamedTable("replaced")], DataFrame) - # pylint: enable=protected-access - assert_frame_equal_with_sort( - data[NamedTable("replaced")].toPandas(), - self._expected_replace(cols_to_defaults), - ) - - @pytest.mark.parametrize( - "public_df,keyset,expected", - [ - ( - pd.DataFrame( - [[None, 0], [None, 1], ["a2", 1], ["a2", 2]], - columns=["A", "new_column"], - ), - KeySet.from_dict({"new_column": [0, 1, 2]}), - pd.DataFrame([[0, 1], [1, 2], [2, 1]], columns=["new_column", "count"]), - ), - ( - pd.DataFrame( - [["a0", 0, 0], [None, 1, 17], ["a5", 5, 17], ["a5", 5, 400]], - columns=["A", "I", "new_column"], - ), - KeySet.from_dict({"new_column": [0, 17, 400]}), - pd.DataFrame( - [[0, 1], [17, 2], [400, 1]], columns=["new_column", "count"] - ), - ), - ( - pd.DataFrame( - [ - [datetime.date(2000, 1, 1), "2000"], - [datetime.date(2001, 1, 1), "2001"], - [None, "none"], - [None, "also none"], - ], - columns=["D", "year"], - 
), - KeySet.from_dict( - {"D": [datetime.date(2000, 1, 1), datetime.date(2001, 1, 1), None]} - ), - pd.DataFrame( - [ - [datetime.date(2000, 1, 1), 1], - [datetime.date(2001, 1, 1), 1], - [None, 2], - ], - columns=["D", "count"], - ), - ), - ], - ) - def test_join_public( - self, spark, public_df: pd.DataFrame, keyset: KeySet, expected: pd.DataFrame - ) -> None: - """Test that join_public creates the correct results. - - The query used to evaluate this is a GroupByCount on the new dataframe, - using the keyset provided. - """ - session = Session.from_dataframe( - PureDPBudget(float("inf")), - "private", - self.sdf, - protected_change=AddOneRow(), - ) - session.add_public_dataframe("public", spark.createDataFrame(public_df)) - result = session.evaluate( - QueryBuilder("private").join_public("public").groupby(keyset).count(), - privacy_budget=PureDPBudget(float("inf")), - ) - assert_frame_equal_with_sort(result.toPandas(), expected) - - @pytest.mark.parametrize( - "private_df,keyset,expected", - [ - ( - pd.DataFrame( - [[None, 0], [None, 1], ["a2", 1], ["a2", 2]], - columns=["A", "new_column"], - ), - KeySet.from_dict({"new_column": [0, 1, 2]}), - pd.DataFrame([[0, 1], [1, 2], [2, 1]], columns=["new_column", "count"]), - ), - ( - pd.DataFrame( - [["a0", 0, 0], [None, 1, 17], ["a5", 5, 17], ["a5", 5, 400]], - columns=["A", "I", "new_column"], - ), - KeySet.from_dict({"new_column": [0, 17, 400]}), - pd.DataFrame( - [[0, 1], [17, 2], [400, 1]], columns=["new_column", "count"] - ), - ), - ( - pd.DataFrame( - [ - [datetime.date(2000, 1, 1), "2000"], - [datetime.date(2001, 1, 1), "2001"], - [None, "none"], - [None, "also none"], - ], - columns=["D", "year"], - ), - KeySet.from_dict( - {"D": [datetime.date(2000, 1, 1), datetime.date(2001, 1, 1), None]} - ), - pd.DataFrame( - [ - [datetime.date(2000, 1, 1), 1], - [datetime.date(2001, 1, 1), 1], - [None, 2], - ], - columns=["D", "count"], - ), - ), - ], - ) - def test_join_private( - self, spark, private_df: pd.DataFrame, keyset: KeySet, expected: pd.DataFrame - ) -> None: - """Test that join_private creates the correct results. - - The query used to evaluate this is a GroupByCount on the joined dataframe, - using the keyset provided. 
- """ - session = ( - Session.Builder() - .with_privacy_budget(PureDPBudget(float("inf"))) - .with_private_dataframe("private", self.sdf, AddOneRow()) - .with_private_dataframe( - "private2", spark.createDataFrame(private_df), AddOneRow() - ) - .build() - ) - result = session.evaluate( - QueryBuilder("private") - .join_private( - QueryBuilder("private2"), - TruncationStrategy.DropExcess(100), - TruncationStrategy.DropExcess(100), - ) - .groupby(keyset) - .count(), - PureDPBudget(float("inf")), - ) - assert_frame_equal_with_sort(result.toPandas(), expected) - - @parametrize( - Case("both_allow_nulls")( - public_schema=StructType([StructField("foo", StringType(), True)]), - private_schema=StructType([StructField("foo", StringType(), True)]), - expected_schema={ - "foo": ColumnDescriptor(ColumnType.VARCHAR, allow_null=True) - }, - ), - Case("none_allow_nulls")( - public_schema=StructType([StructField("foo", StringType(), False)]), - private_schema=StructType([StructField("foo", StringType(), False)]), - expected_schema={ - "foo": ColumnDescriptor(ColumnType.VARCHAR, allow_null=False) - }, - ), - Case("public_only_nulls")( - public_schema=StructType([StructField("foo", StringType(), True)]), - private_schema=StructType([StructField("foo", StringType(), False)]), - expected_schema={ - "foo": ColumnDescriptor(ColumnType.VARCHAR, allow_null=False) - }, - ), - Case("private_only_nulls")( - public_schema=StructType([StructField("foo", StringType(), False)]), - private_schema=StructType([StructField("foo", StringType(), True)]), - expected_schema={ - "foo": ColumnDescriptor(ColumnType.VARCHAR, allow_null=False) - }, - ), - ) - def test_public_join_schema_null_propagation( - self, - public_schema: StructType, - private_schema: StructType, - expected_schema: StructType, - spark: SparkSession, - ): - """Tests that join_public correctly handles schemas that allow null values.""" - public_df = spark.createDataFrame([], public_schema) - private_df = spark.createDataFrame([], private_schema) - sess = ( - Session.Builder() - .with_privacy_budget(PureDPBudget(float("inf"))) - .with_private_dataframe("private", private_df, protected_change=AddOneRow()) - .with_public_dataframe("public", public_df) - .build() - ) - sess.create_view( - QueryBuilder("private").join_public("public"), source_id="join", cache=False - ) - assert sess.get_schema("join") == expected_schema - - @parametrize( - Case("both_allow_nulls")( - left_schema=StructType([StructField("foo", StringType(), True)]), - right_schema=StructType([StructField("foo", StringType(), True)]), - expected_schema={ - "foo": ColumnDescriptor(ColumnType.VARCHAR, allow_null=True) - }, - ), - Case("none_allow_nulls")( - left_schema=StructType([StructField("foo", StringType(), False)]), - right_schema=StructType([StructField("foo", StringType(), False)]), - expected_schema={ - "foo": ColumnDescriptor(ColumnType.VARCHAR, allow_null=False) - }, - ), - Case("public_only_nulls")( - left_schema=StructType([StructField("foo", StringType(), True)]), - right_schema=StructType([StructField("foo", StringType(), False)]), - expected_schema={ - "foo": ColumnDescriptor(ColumnType.VARCHAR, allow_null=False) - }, - ), - Case("private_only_nulls")( - left_schema=StructType([StructField("foo", StringType(), False)]), - right_schema=StructType([StructField("foo", StringType(), True)]), - expected_schema={ - "foo": ColumnDescriptor(ColumnType.VARCHAR, allow_null=False) - }, - ), - ) - def test_private_join_schema_null_propagation( - self, - left_schema: StructType, - right_schema: 
StructType, - expected_schema: StructType, - spark: SparkSession, - ): - """Tests that join_private correctly handles schemas that allow null values.""" - left_df = spark.createDataFrame([], left_schema) - right_df = spark.createDataFrame([], right_schema) - sess = ( - Session.Builder() - .with_privacy_budget(PureDPBudget(float("inf"))) - .with_private_dataframe("left", left_df, protected_change=AddOneRow()) - .with_private_dataframe("right", right_df, protected_change=AddOneRow()) - .build() - ) - sess.create_view( - QueryBuilder("left").join_private( - "right", - truncation_strategy_left=TruncationStrategy.DropExcess(1), - truncation_strategy_right=TruncationStrategy.DropExcess(1), - ), - source_id="join", - cache=False, - ) - assert sess.get_schema("join") == expected_schema - - -@pytest.mark.usefixtures("infs_test_data") -class TestSessionWithInfs: - """Tests for Sessions with Infs.""" - - pdf: pd.DataFrame - sdf: DataFrame - - @pytest.mark.parametrize( - "replace_with,", - [ - ({}), - ({"B": (-100.0, 100.0)}), - ({"B": (123.45, 678.90)}), - ({"B": (999.9, 111.1)}), - ], - ) - def test_replace_infinity( - self, replace_with: Dict[str, Tuple[float, float]] - ) -> None: - """Test replace_infinity query.""" - session = Session.from_dataframe( - PureDPBudget(float("inf")), - "private", - self.sdf, - protected_change=AddOneRow(), - ) - session.create_view( - QueryBuilder("private").replace_infinity(replace_with), - "replaced", - cache=False, - ) - # pylint: disable=protected-access - queryable = session._accountant._queryable - assert isinstance(queryable, SequentialQueryable) - data = queryable._data - assert isinstance(data, dict) - assert isinstance(data[NamedTable("replaced")], DataFrame) - # pylint: enable=protected-access - (replace_negative, replace_positive) = replace_with.get( - "B", (AnalyticsDefault.DECIMAL, AnalyticsDefault.DECIMAL) - ) - expected = self.pdf.replace(float("-inf"), replace_negative).replace( - float("inf"), replace_positive - ) - assert_frame_equal_with_sort(data[NamedTable("replaced")].toPandas(), expected) - - @pytest.mark.parametrize( - "replace_with,expected", - [ - ({}, pd.DataFrame([["a0", 2.0], ["a1", 5.0]], columns=["A", "sum"])), - ( - {"B": (-100.0, 100.0)}, - pd.DataFrame([["a0", -98.0], ["a1", 105.0]], columns=["A", "sum"]), - ), - ( - {"B": (500.0, 100.0)}, - pd.DataFrame([["a0", 502.0], ["a1", 105.0]], columns=["A", "sum"]), - ), - ], - ) - def test_sum( - self, replace_with: Dict[str, Tuple[float, float]], expected: pd.DataFrame - ) -> None: - """Test GroupByBoundedSum after replacing infinite values.""" - session = Session.from_dataframe( - PureDPBudget(float("inf")), - "private", - self.sdf, - protected_change=AddOneRow(), - ) - result = session.evaluate( - QueryBuilder("private") - .replace_infinity(replace_with) - .groupby(KeySet.from_dict({"A": ["a0", "a1"]})) - .sum("B", low=-1000, high=1000, name="sum"), - PureDPBudget(float("inf")), - ) - assert_frame_equal_with_sort(result.toPandas(), expected) - - def test_drop_infinity(self): - """Test GroupByBoundedSum after dropping infinite values.""" - session = Session.from_dataframe( - PureDPBudget(float("inf")), - "private", - self.sdf, - protected_change=AddOneRow(), - ) - result = session.evaluate( - QueryBuilder("private") - .drop_infinity(columns=["B"]) - .groupby(KeySet.from_dict({"A": ["a0", "a1"]})) - .sum("B", low=-1000, high=1000, name="sum"), - PureDPBudget(float("inf")), - ) - expected = pd.DataFrame([["a0", 2.0], ["a1", 5.0]], columns=["A", "sum"]) - 
assert_frame_equal_with_sort(result.toPandas(), expected) diff --git a/test/system/session/test_special_values.py b/test/system/session/test_special_values.py new file mode 100644 index 00000000..846af57f --- /dev/null +++ b/test/system/session/test_special_values.py @@ -0,0 +1,953 @@ +"""System tests for tables with special values (nulls, nans, infinities).""" + +# SPDX-License-Identifier: Apache-2.0 +# Copyright Tumult Labs 2025 + +import datetime +from typing import Dict, List, Optional, Tuple, Union + +import pandas as pd +import pytest +from numpy import sqrt +from pyspark.sql import DataFrame +from tmlt.core.utils.testing import Case, parametrize + +from tmlt.analytics import ( + AddOneRow, + AddRowsWithID, + ColumnDescriptor, + ColumnType, + KeySet, + MaxRowsPerID, + ProtectedChange, + PureDPBudget, + Query, + QueryBuilder, + Session, + TruncationStrategy, +) +from tmlt.analytics._schema import Schema, analytics_to_spark_schema + +from ...conftest import assert_frame_equal_with_sort + + +@pytest.fixture(name="sdf_special_values", scope="module") +def special_values_dataframe(spark): + """Set up test data for sessions with special values.""" + sdf_col_types = { + "string_nulls": ColumnDescriptor(ColumnType.VARCHAR, allow_null=True), + "int_no_null": ColumnDescriptor(ColumnType.INTEGER, allow_null=False), + "int_nulls": ColumnDescriptor(ColumnType.INTEGER, allow_null=True), + "float_no_special": ColumnDescriptor( + ColumnType.DECIMAL, + allow_null=False, + allow_nan=False, + allow_inf=False, + ), + "float_nulls": ColumnDescriptor( + ColumnType.DECIMAL, + allow_null=True, + allow_nan=False, + allow_inf=False, + ), + "float_nans": ColumnDescriptor( + ColumnType.DECIMAL, + allow_null=False, + allow_nan=True, + allow_inf=False, + ), + "float_infs": ColumnDescriptor( + ColumnType.DECIMAL, + allow_null=False, + allow_nan=False, + allow_inf=True, + ), + "float_all_special": ColumnDescriptor( + ColumnType.DECIMAL, + allow_null=True, + allow_nan=True, + allow_inf=True, + ), + "date_nulls": ColumnDescriptor(ColumnType.DATE, allow_null=True), + "time_nulls": ColumnDescriptor(ColumnType.TIMESTAMP, allow_null=True), + } + date = datetime.date(2000, 1, 1) + time = datetime.datetime(2020, 1, 1) + sdf = spark.createDataFrame( + [(f"normal_{i}", 1, 1, 1.0, 1.0, 1.0, 1.0, 1.0, date, time) for i in range(20)] + + [ + # Rows with nulls + (None, 1, 1, 1.0, 1.0, 1.0, 1.0, 1.0, date, time), + ("u2", 1, None, 1.0, 1.0, 1.0, 1.0, 1.0, date, time), + ("u3", 1, 1, 1.0, None, 1.0, 1.0, None, date, time), + ("u4", 1, 1, 1.0, 1.0, 1.0, 1.0, 1.0, None, time), + ("u5", 1, 1, 1.0, 1.0, 1.0, 1.0, 1.0, date, None), + # Rows with nans + ("a6", 1, 1, 1.0, 1.0, float("nan"), 1.0, float("nan"), date, time), + # Rows with infinities + ("i7", 1, 1, 1.0, 1.0, 1.0, float("inf"), float("inf"), date, time), + ("i8", 1, 1, 1.0, 1.0, 1.0, -float("inf"), -float("inf"), date, time), + ("i9", 1, 1, 1.0, 1.0, 1.0, float("inf"), 1.0, date, time), + ("i10", 1, 1, 1.0, 1.0, 1.0, -float("inf"), 1.0, date, time), + ], + schema=analytics_to_spark_schema(Schema(sdf_col_types)), + ) + return sdf + + +@parametrize( + [ + Case("int_sum")( + # There are 29 1s in the "int_nulls" column and one null, which should be + # dropped by default. + query=QueryBuilder("private").sum("int_nulls", 0, 1), + expected_df=pd.DataFrame( + [[29]], + columns=["int_nulls_sum"], + ), + ), + Case("count_distinct")( + # Nulls, nans, and infinities count as distinct values in a count_distinct + # query. All other values in the "float_all_special" are 1s. 
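+            # The distinct values are therefore 1.0, null, NaN, positive infinity,
+            # and negative infinity, i.e. 5 in total.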
+ query=QueryBuilder("private").count_distinct(["float_all_special"]), + expected_df=pd.DataFrame( + [[5]], + columns=["count_distinct(float_all_special)"], + ), + ), + Case("count_distinct_deduplicates")( + # The "float_infs" column contains 1s, two positive infinities, and two + # negative infinities. + query=QueryBuilder("private").count_distinct(["float_infs"]), + expected_df=pd.DataFrame( + [[3]], + columns=["count_distinct(float_infs)"], + ), + ), + Case("float_average")( + # In the "float_all_special" column, there are 26 1s, one null (dropped), + # one NaN (dropped), one negative infinity (clamped to 50), and one positive + # infinity (clamed to 100). + query=QueryBuilder("private").sum("float_all_special", -100, 300), + expected_df=pd.DataFrame( + [[226]], # 26-100+300 + columns=["float_all_special_sum"], + ), + ), + Case("group_by_null")( + # Nulls can be used as group-by + query=( + QueryBuilder("private") + .groupby( + KeySet.from_dict({"date_nulls": [datetime.date(2000, 1, 1), None]}) + ) + .count() + ), + expected_df=pd.DataFrame( + [[datetime.date(2000, 1, 1), 29], [None, 1]], + columns=["date_nulls", "count"], + ), + ), + ] +) +def test_default_behavior( + sdf_special_values: DataFrame, query: Query, expected_df: pd.DataFrame +): + inf_budget = PureDPBudget(float("inf")) + sess = Session.from_dataframe( + inf_budget, + "private", + sdf_special_values, + AddOneRow(), + ) + result = sess.evaluate(query, inf_budget) + print(expected_df) + assert_frame_equal_with_sort(result.toPandas(), expected_df) + + +@parametrize( + [ + Case("int_noop")( + # Column "int_no_null" has only non-null values, all equal to 1 + replace_with={"int_no_null": 42}, + column="int_no_null", + low=0, + high=1, + expected_average=1, + ), + Case("int_replace_null")( + # Column "int_nulls" has one null value and 29 1s. + replace_with={"int_nulls": 31}, + column="int_nulls", + low=0, + high=100, + expected_average=2.0, # (29+31)/30 + ), + Case("float_replace_null")( + # Column "float_nulls" has one null value and 29 1s. + replace_with={"float_nulls": 61}, + column="float_nulls", + low=0, + high=100, + expected_average=3.0, # (29+61)/30 + ), + Case("float_replace_nan")( + # Column "float_nulls" has one null value and 29 1s. + replace_with={"float_nans": 91}, + column="float_nans", + low=0, + high=100, + expected_average=4.0, # (29+91)/30 + ), + Case("float_replace_both")( + # Column "float_all_special" has 26 1s, one null value, one nan-value, one + # negative infinity (clamped to 0), one positive infinity (clamped to 34). + replace_with={"float_all_special": 15}, + column="float_all_special", + low=0, + high=34, + expected_average=3.0, # (26+15+15+34)/30 + ), + Case("replace_all_with_none")( + # When called with no argument, replace_null_and_nan should replace all null + # values by analytics defaults, e.g. 0. + replace_with=None, + column="float_nulls", + low=0, + high=1, + expected_average=29.0 / 30, + ), + Case("replace_all_with_empty_dict")( + # Same thing with an empty dict and with nan values. 
+            replace_with={},
+            column="float_nans",
+            low=0,
+            high=1,
+            expected_average=29.0 / 30,
+        ),
+    ]
+)
+def test_replace_null_and_nan(
+    sdf_special_values: DataFrame,
+    replace_with: Optional[Dict[str, Union[int, float]]],
+    column: str,
+    low: Union[int, float],
+    high: Union[int, float],
+    expected_average: Union[int, float],
+):
+    inf_budget = PureDPBudget(float("inf"))
+    sess = Session.from_dataframe(
+        inf_budget,
+        "private",
+        sdf_special_values,
+        AddOneRow(),
+    )
+    base_query = QueryBuilder("private")
+    query = base_query.replace_null_and_nan(replace_with).average(column, low, high)
+    result = sess.evaluate(query, inf_budget)
+    expected_df = pd.DataFrame([[expected_average]], columns=[column + "_average"])
+    assert_frame_equal_with_sort(result.toPandas(), expected_df)
+
+
+@parametrize(
+    [
+        # All columns have 30 rows, all non-special values are equal to 1.
+        Case("int_noop")(
+            # Column "int_no_null" has only regular values.
+            affected_columns=["int_no_null"],
+            measure_column="int_no_null",
+            low=0,
+            high=1,
+            expected_sum=30,
+        ),
+        Case("int_drop_nulls")(
+            # Column "int_nulls" has one null value and 29 1s.
+            affected_columns=["int_nulls"],
+            measure_column="int_nulls",
+            low=0,
+            high=100,
+            expected_sum=29,
+        ),
+        Case("float_drop_nulls")(
+            # Column "float_nulls" has one null value and 29 1s.
+            affected_columns=["float_nulls"],
+            measure_column="float_nulls",
+            low=0,
+            high=100,
+            expected_sum=29,
+        ),
+        Case("float_drop_nan")(
+            # Column "float_nans" has one NaN value and 29 1s.
+            affected_columns=["float_nans"],
+            measure_column="float_nans",
+            low=0,
+            high=100,
+            expected_sum=29,
+        ),
+        Case("float_drop_both")(
+            # Column "float_all_special" has 26 1s, one null, one NaN, one negative
+            # infinity (clamped to 0), one positive infinity (clamped to 100).
+            affected_columns=["float_all_special"],
+            measure_column="float_all_special",
+            low=0,
+            high=100,
+            expected_sum=126,
+        ),
+        Case("drop_other_columns")(
+            # Column "float_infs" has 26 1s, two negative infinities (clamped to 0)
+            # and two positive infinities (clamped to 100). But dropping rows from
+            # columns "string_nulls", "float_nulls", "float_nans", "date_nulls" and
+            # "time_nulls" should remove five rows, leaving just 21 normal values.
+            affected_columns=[
+                "string_nulls",
+                "float_nulls",
+                "float_nans",
+                "date_nulls",
+                "time_nulls",
+            ],
+            measure_column="float_infs",
+            low=0,
+            high=100,
+            expected_sum=221,
+        ),
+        Case("drop_all_with_none")(
+            # When called with no argument, drop_null_and_nan should drop all rows
+            # that have null/NaN values anywhere, which leaves 24 1s even if we're
+            # summing a column without nulls.
+            affected_columns=None,
+            measure_column="int_no_null",
+            low=0,
+            high=1,
+            expected_sum=24,
+        ),
+        Case("drop_all_with_empty_list")(
+            # Same thing with an empty list.
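+            # Dropping the six rows that contain a null or NaN anywhere leaves
+            # 24 1s in "float_nulls".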
+            affected_columns=[],
+            measure_column="float_nulls",
+            low=0,
+            high=1,
+            expected_sum=24.0,
+        ),
+    ]
+)
+def test_drop_null_and_nan(
+    sdf_special_values: DataFrame,
+    affected_columns: Optional[List[str]],
+    measure_column: str,
+    low: Union[int, float],
+    high: Union[int, float],
+    expected_sum: Union[int, float],
+):
+    inf_budget = PureDPBudget(float("inf"))
+    sess = Session.from_dataframe(
+        inf_budget,
+        "private",
+        sdf_special_values,
+        AddOneRow(),
+    )
+    base_query = QueryBuilder("private")
+    query = base_query.drop_null_and_nan(affected_columns).sum(
+        measure_column, low, high
+    )
+    result = sess.evaluate(query, inf_budget)
+    expected_df = pd.DataFrame([[expected_sum]], columns=[measure_column + "_sum"])
+    assert_frame_equal_with_sort(result.toPandas(), expected_df)
+
+
+@parametrize(
+    # All these tests compute the average of the "float_infs" column of the input
+    # table, in which there are:
+    # - 26 non-infinity values, all equal to 1
+    # - two negative infinities
+    # - two positive infinities
+    # We test this using average and not sum, to distinguish infinities being
+    # removed from infinities being replaced with 0.
+    [
+        Case("replace_no_clamp")(
+            replace_with={"float_infs": (0, 17)},
+            low=-100,
+            high=100,
+            # 26+0+0+17+17 = 60, divided by 30 is 2
+            expected_average=2.0,
+        ),
+        Case("replace_clamp")(
+            replace_with={"float_infs": (-4217, 300)},
+            low=-5,
+            high=22,
+            # 26-5-5+22+22 = 60, divided by 30 is 2
+            expected_average=2.0,
+        ),
+        Case("replace_unrelated_column")(
+            # If we don't explicitly replace infinity in the measure column, then
+            # infinities should be clamped to the bounds.
+            replace_with={"float_all_special": (-4217, 300)},
+            low=-10,
+            high=27,
+            # 26-10-10+27+27 = 60, divided by 30 is 2
+            expected_average=2.0,
+        ),
+        Case("replace_with_none")(
+            # If used without any argument, replace_infinity transforms all infinity
+            # values in all columns of the table to 0.
+            replace_with=None,
+            low=-10,
+            high=10,
+            expected_average=26.0 / 30.0,
+        ),
+        Case("replace_with_empty_dict")(
+            # Same with an empty dict.
+            replace_with={},
+            low=-10,
+            high=10,
+            expected_average=26.0 / 30.0,
+        ),
+    ]
+)
+def test_replace_infinity_average(
+    sdf_special_values: DataFrame,
+    replace_with: Optional[Dict[str, Tuple[float, float]]],
+    low: float,
+    high: float,
+    expected_average: float,
+):
+    inf_budget = PureDPBudget(float("inf"))
+    sess = Session.from_dataframe(
+        inf_budget,
+        "private",
+        sdf_special_values,
+        AddOneRow(),
+    )
+    base_query = QueryBuilder("private")
+    query = base_query.replace_infinity(replace_with).average("float_infs", low, high)
+    result = sess.evaluate(query, inf_budget)
+    expected_df = pd.DataFrame([[expected_average]], columns=["float_infs_average"])
+    assert_frame_equal_with_sort(result.toPandas(), expected_df)
+
+
+@parametrize(
+    [
+        Case("all_ones")(
+            replace_with={"float_infs": (1, 1)},
+            expected_sum=30.0,
+            expected_stdev=0,
+            expected_variance=0,
+        ),
+        Case("one_zero_one_one")(
+            # If we don't replace infinities in the measure column, then infinity
+            # values should be clamped to the bounds, namely 0 and 1.
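+            # The clamped data is 28 1s and two 0s, so the mean is 28/30 and the
+            # sample variance is (2*(28/30)^2 + 28*(2/30)^2)/29.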
+ replace_with={"float_all_special": (1, 1)}, + expected_sum=28.0, + expected_stdev=sqrt((2 * (28.0 / 30) ** 2 + 28 * (2.0 / 30) ** 2) / 29), + expected_variance=(2 * (28.0 / 30) ** 2 + 28 * (2.0 / 30) ** 2) / 29, + ), + Case("all_zeroes")( + # Without argument, all infinities are replaced by 0 + replace_with=None, + expected_sum=26.0, + expected_stdev=sqrt((4 * (26.0 / 30) ** 2 + 26 * (4.0 / 30) ** 2) / 29), + expected_variance=(4 * (26.0 / 30) ** 2 + 26 * (4.0 / 30) ** 2) / 29, + ), + ] +) +def test_replace_infinity_other_aggregations( + sdf_special_values: DataFrame, + replace_with: Optional[Dict[str, Tuple[float, float]]], + expected_sum: float, + expected_stdev: float, + expected_variance: float, +): + inf_budget = PureDPBudget(float("inf")) + sess = Session.from_dataframe( + inf_budget, + "private", + sdf_special_values, + protected_change=AddOneRow(), + ) + + query_sum = ( + QueryBuilder("private").replace_infinity(replace_with).sum("float_infs", 0, 1) + ) + result_sum = sess.evaluate(query_sum, inf_budget) + expected_df = pd.DataFrame([[expected_sum]], columns=["float_infs_sum"]) + assert_frame_equal_with_sort(result_sum.toPandas(), expected_df) + + query_stdev = ( + QueryBuilder("private").replace_infinity(replace_with).stdev("float_infs", 0, 1) + ) + result_stdev = sess.evaluate(query_stdev, inf_budget) + expected_df = pd.DataFrame([[expected_stdev]], columns=["float_infs_stdev"]) + assert_frame_equal_with_sort(result_stdev.toPandas(), expected_df) + + query_variance = ( + QueryBuilder("private") + .replace_infinity(replace_with) + .variance("float_infs", 0, 1) + ) + result_variance = sess.evaluate(query_variance, inf_budget) + expected_df = pd.DataFrame([[expected_variance]], columns=["float_infs_variance"]) + assert_frame_equal_with_sort(result_variance.toPandas(), expected_df) + + +@parametrize( + # All these tests compute the sum of the "float_infs" column of the input table. + [ + Case("drop_rows_in_column")( + # There are 26 non-infinity values in the "float_infs" column. + columns=["float_infs"], + expected_sum=26.0, + ), + Case("drop_no_rows")( + # The call to drop_infinity is a no-op. In the "float_infs" column, there + # are two rows with positive infinities (clamped to 1), and two with + # negative infinities (clamped to 0). + columns=["float_no_special"], + expected_sum=28.0, + ), + Case("drop_some_rows_due_to_other_columns")( + # Two rows with infinite values in the "float_infs" column also have + # infinite values in the "float_all_special" column. We end up with one + # positive infinity value, clamped to 1, and one negative, clamped to 0. + columns=["float_all_special"], + expected_sum=27.0, + ), + Case("drop_rows_in_all_columns")( + # If used without any argument, drop_infinity removes all infinity values in + # all columns of the table. 
+            columns=None,
+            expected_sum=26.0,
+        ),
+    ]
+)
+def test_drop_infinity(
+    sdf_special_values: DataFrame,
+    columns: Optional[List[str]],
+    expected_sum: float,
+):
+    inf_budget = PureDPBudget(float("inf"))
+    sess = Session.from_dataframe(
+        inf_budget,
+        "private",
+        sdf_special_values,
+        AddOneRow(),
+    )
+    base_query = QueryBuilder("private")
+    query = base_query.drop_infinity(columns).sum("float_infs", 0, 1)
+    result = sess.evaluate(query, inf_budget)
+    expected_df = pd.DataFrame([[expected_sum]], columns=["float_infs_sum"])
+    assert_frame_equal_with_sort(result.toPandas(), expected_df)
+
+
+@parametrize(
+    [
+        Case("works_with_nulls")(
+            # get_bounds doesn't explode when called on a column with nulls
+            query=QueryBuilder("private").get_bounds("int_nulls"),
+            expected_df=pd.DataFrame(
+                [[-1, 1]],
+                columns=["int_nulls_lower_bound", "int_nulls_upper_bound"],
+            ),
+        ),
+        Case("works_with_nan")(
+            # Same with NaNs
+            query=QueryBuilder("private").get_bounds("float_nans"),
+            expected_df=pd.DataFrame(
+                [[-1, 1]],
+                columns=["float_nans_lower_bound", "float_nans_upper_bound"],
+            ),
+        ),
+        Case("works_with_infinity")(
+            # Same with infinities
+            query=QueryBuilder("private").get_bounds("float_infs"),
+            expected_df=pd.DataFrame(
+                [[-1, 1]],
+                columns=["float_infs_lower_bound", "float_infs_upper_bound"],
+            ),
+        ),
+        Case("drop_and_replace")(
+            # Dropping nulls & NaNs removes 6/30 values; replacing the 4 infinity
+            # values using (-3, 3) ensures that get_bounds returns the interval
+            # corresponding to the next power of 2, namely (-4, 4).
+            query=(
+                QueryBuilder("private")
+                .drop_null_and_nan()
+                .replace_infinity({"float_infs": (-3, 3)})
+                .get_bounds("float_infs")
+            ),
+            expected_df=pd.DataFrame(
+                [[-4, 4]],
+                columns=["float_infs_lower_bound", "float_infs_upper_bound"],
+            ),
+        ),
+    ]
+)
+def test_get_bounds(
+    sdf_special_values: DataFrame, query: Query, expected_df: pd.DataFrame
+):
+    inf_budget = PureDPBudget(float("inf"))
+    sess = Session.from_dataframe(
+        inf_budget,
+        "private",
+        sdf_special_values,
+        AddOneRow(),
+    )
+    result = sess.evaluate(query, inf_budget)
+    assert_frame_equal_with_sort(result.toPandas(), expected_df)
+
+
+@parametrize(
+    [
+        Case("normal_case_explicit")(
+            # Column "float_all_special" has 26 1s, one null (replaced by 100), one
+            # NaN (replaced by 100), and two infinities (dropped).
+            query=(
+                QueryBuilder("private")
+                .enforce(MaxRowsPerID(1))
+                .replace_null_and_nan({"float_all_special": 100.0})
+                .drop_infinity(["float_all_special"])
+                .sum("float_all_special", 0, 100)
+            ),
+            expected_df=pd.DataFrame(
+                [[226]],  # 26+100+100
+                columns=["float_all_special_sum"],
+            ),
+        ),
+        Case("normal_case_implicit")(
+            # Column "float_all_special" has 26 1s, one null, one NaN, one negative
+            # infinity (clamped to -50), one positive infinity (clamped to 100).
+            query=(
+                QueryBuilder("private")
+                .enforce(MaxRowsPerID(1))
+                .sum("float_all_special", -50, 100)
+            ),
+            expected_df=pd.DataFrame(
+                [[76]],  # 26-50+100
+                columns=["float_all_special_sum"],
+            ),
+        ),
+        Case("nulls_are_not_dropped_in_id_column")(
+            # When called with no argument, drop_null_and_nan should drop all rows
+            # that have null/NaN values anywhere, except in the privacy ID column.
+            # This should leave 25 1s even if we're summing a column without nulls.
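+            # The row whose privacy ID (string_nulls) is null is kept; only the five
+            # rows with nulls or NaNs in other columns are dropped.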
+            query=(
+                QueryBuilder("private")
+                .drop_null_and_nan()
+                .enforce(MaxRowsPerID(1))
+                .sum("int_no_null", 0, 1)
+            ),
+            expected_df=pd.DataFrame([[25]], columns=["int_no_null_sum"]),
+        ),
+    ]
+)
+def test_privacy_ids(
+    sdf_special_values: DataFrame, query: Query, expected_df: pd.DataFrame
+):
+    inf_budget = PureDPBudget(float("inf"))
+    sess = Session.from_dataframe(
+        inf_budget,
+        "private",
+        sdf_special_values,
+        AddRowsWithID("string_nulls"),
+    )
+    result = sess.evaluate(query, inf_budget)
+    assert_frame_equal_with_sort(result.toPandas(), expected_df)
+
+
+@pytest.fixture(name="sdf_for_joins", scope="module")
+def dataframe_for_join(spark):
+    """Set up test data for join tests.
+
+    This data is then joined with the ``sdf_special_values`` dataframe used previously
+    in this test suite.
+    """
+    sdf_col_types = {
+        "string_nulls": ColumnDescriptor(ColumnType.VARCHAR, allow_null=True),
+        "int_nulls": ColumnDescriptor(ColumnType.INTEGER, allow_null=True),
+        "float_all_special": ColumnDescriptor(
+            ColumnType.DECIMAL,
+            allow_null=True,
+            allow_nan=True,
+            allow_inf=True,
+        ),
+        "date_nulls": ColumnDescriptor(ColumnType.DATE, allow_null=True),
+        "time_nulls": ColumnDescriptor(ColumnType.TIMESTAMP, allow_null=True),
+        "new_int": ColumnDescriptor(ColumnType.INTEGER, allow_null=False),
+    }
+    date = datetime.date(2000, 1, 1)
+    time = datetime.datetime(2020, 1, 1)
+    sdf = spark.createDataFrame(
+        [
+            # Normal row
+            ("normal_0", 1, 1.0, date, time, 1),
+            # Rows with nulls: some whose values appear in `sdf_special_values`…
+            (None, 1, 1.0, date, time, 1),
+            ("u2", None, 1.0, date, time, 1),
+            ("u3", 1, None, date, time, 1),
+            # … and two identical rows, where the combination of nulls does not appear
+            # in `sdf_special_values`.
+            ("u4", 1, 1.0, None, None, 1),
+            ("u5", 1, 1.0, None, None, 1),
+            # Row with NaNs
+            ("a6", 1, float("nan"), date, time, 1),
+            # Rows with infinities
+            ("i7", 1, float("inf"), date, time, 1),
+            ("i8", 1, -float("inf"), date, time, 1),
+        ],
+        schema=analytics_to_spark_schema(Schema(sdf_col_types)),
+    )
+    return sdf
+
+
+@parametrize(
+    [
+        Case("public_join_inner_all_match")(
+            # Joining on the first three columns, every row of the right table should
+            # match exactly one left row, without duplicates. This checks that tables
+            # are joined on all three kinds of special values.
+            protected_change=AddOneRow(),
+            query=(
+                QueryBuilder("private")
+                .join_public(
+                    "public",
+                    ["string_nulls", "int_nulls", "float_all_special"],
+                    "inner",
+                )
+                .sum("new_int", 0, 1)
+            ),
+            expected_df=pd.DataFrame(
+                [[9]],
+                columns=["new_int_sum"],
+            ),
+        ),
+        Case("public_join_inner_duplicates")(
+            # Joining on the date and time columns only creates matches for the rows
+            # where both are specified: 28 in the left table and 7 in the right table.
+            protected_change=AddOneRow(),
+            query=(
+                QueryBuilder("private")
+                .join_public("public", ["date_nulls", "time_nulls"], "inner")
+                .sum("new_int", 0, 1)
+            ),
+            expected_df=pd.DataFrame(
+                [[28 * 7]],
+                columns=["new_int_sum"],
+            ),
+        ),
+        Case("public_join_left_duplicates")(
+            # Same as before, except we do a left join, so 2 rows in the original
+            # table are preserved in the join.
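+            # Those 2 rows are the left-table rows (u4 and u5) that have a null date
+            # or time and therefore match nothing in the inner join.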
+ protected_change=AddOneRow(), + query=( + QueryBuilder("private") + .join_public("public", ["date_nulls", "time_nulls"], "left") + .count() + ), + expected_df=pd.DataFrame( + [[28 * 7 + 2]], + columns=["count"], + ), + ), + Case("private_join_add_rows")( + # Private joins without duplicates should work the same way as the inner + # public join above, leaving the 9 rows in common between the two tables. + protected_change=AddOneRow(), + query=( + QueryBuilder("private") + .join_private( + "private_2", + join_columns=["string_nulls", "int_nulls", "float_all_special"], + truncation_strategy_left=TruncationStrategy.DropNonUnique(), + truncation_strategy_right=TruncationStrategy.DropNonUnique(), + ) + .count() + ), + expected_df=pd.DataFrame([[9]], columns=["count"]), + ), + Case("private_join_ids")( + # Same with a privacy ID column. + protected_change=AddRowsWithID("string_nulls"), + query=( + QueryBuilder("private") + .join_private( + "private_2", + join_columns=["string_nulls", "int_nulls", "float_all_special"], + ) + .enforce(MaxRowsPerID(1)) + .count() + ), + expected_df=pd.DataFrame([[9]], columns=["count"]), + ), + Case("private_join_preserves_special_values")( + # After the join, "float_all_special" should have the same data as in the + # table used for the join: 5 1s, one null (replaced by 100), one nan + # (replaced by 100), and two infinities (dropped). + protected_change=AddRowsWithID("string_nulls"), + query=( + QueryBuilder("private") + .join_private( + "private_2", + join_columns=["string_nulls", "int_nulls", "float_all_special"], + ) + .enforce(MaxRowsPerID(1)) + .drop_infinity(["float_all_special"]) + .replace_null_and_nan({"float_all_special": 100.0}) + .sum("float_all_special", 0, 200) + ), + expected_df=pd.DataFrame( + [[5 + 100 + 100]], columns=["float_all_special_sum"] + ), + ), + Case("public_join_preserves_special_values")( + # Same with a public join. 
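+            # The public join on the same three columns matches the same 9 rows as
+            # the private join, so the sum is again 5 + 100 + 100.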
+            protected_change=AddOneRow(),
+            query=(
+                QueryBuilder("private")
+                .join_public(
+                    "public",
+                    join_columns=["string_nulls", "int_nulls", "float_all_special"],
+                )
+                .drop_infinity(["float_all_special"])
+                .replace_null_and_nan({"float_all_special": 100.0})
+                .sum("float_all_special", 0, 200)
+            ),
+            expected_df=pd.DataFrame(
+                [[5 + 100 + 100]], columns=["float_all_special_sum"]
+            ),
+        ),
+    ]
+)
+def test_joins(
+    sdf_special_values: DataFrame,
+    sdf_for_joins: DataFrame,
+    protected_change: ProtectedChange,
+    query: Query,
+    expected_df: pd.DataFrame,
+):
+    inf_budget = PureDPBudget.inf()
+    sess = (
+        Session.Builder()
+        .with_id_space("default_id_space")
+        .with_private_dataframe("private", sdf_special_values, protected_change)
+        .with_private_dataframe("private_2", sdf_for_joins, protected_change)
+        .with_public_dataframe("public", sdf_for_joins)
+        .with_privacy_budget(inf_budget)
+        .build()
+    )
+    result = sess.evaluate(query, inf_budget)
+    assert_frame_equal_with_sort(result.toPandas(), expected_df)
+
+
+@parametrize(
+    [
+        Case("private_int_remove_nulls")(
+            # Nulls joined with no nulls = no nulls
+            protected_change=AddOneRow(),
+            query=(
+                QueryBuilder("private")
+                .rename({"int_no_null": "int_joined"})
+                .join_private(
+                    QueryBuilder("private_2").rename({"int_nulls": "int_joined"}),
+                    join_columns=["int_joined"],
+                    truncation_strategy_left=TruncationStrategy.DropExcess(30),
+                    truncation_strategy_right=TruncationStrategy.DropExcess(30),
+                )
+            ),
+            expected_col=(
+                "int_joined",
+                ColumnDescriptor(ColumnType.INTEGER, allow_null=False),
+            ),
+        ),
+        Case("private_float_remove_both")(
+            # A column allowing only infinities joined with one allowing only nulls
+            # and NaNs = no special values allowed at all
+            protected_change=AddOneRow(),
+            query=(
+                QueryBuilder("private")
+                .drop_null_and_nan(["float_all_special"])
+                .join_private(
+                    QueryBuilder("private").drop_infinity(["float_all_special"]),
+                    join_columns=["float_all_special"],
+                    truncation_strategy_left=TruncationStrategy.DropExcess(30),
+                    truncation_strategy_right=TruncationStrategy.DropExcess(30),
+                )
+            ),
+            expected_col=(
+                "float_all_special",
+                ColumnDescriptor(
+                    ColumnType.DECIMAL,
+                    allow_null=False,
+                    allow_nan=False,
+                    allow_inf=False,
+                ),
+            ),
+        ),
+        Case("public_int_remove_nulls_from_right")(
+            # No nulls joined with nulls = no nulls (public version)
+            protected_change=AddOneRow(),
+            query=(
+                QueryBuilder("private")
+                .select(["int_no_null"])
+                .rename({"int_no_null": "int_nulls"})
+                .join_public(
+                    "public",
+                    join_columns=["int_nulls"],
+                )
+            ),
+            expected_col=(
+                "int_nulls",
+                ColumnDescriptor(ColumnType.INTEGER, allow_null=False),
+            ),
+        ),
+        Case("public_int_remove_nulls_from_left")(
+            # Nulls joined with no nulls = no nulls (reverse)
+            protected_change=AddOneRow(),
+            query=(
+                QueryBuilder("private")
+                .rename({"int_nulls": "new_int"})
+                .join_public(
+                    "public",
+                    join_columns=["new_int"],
+                )
+            ),
+            expected_col=(
+                "new_int",
+                ColumnDescriptor(ColumnType.INTEGER, allow_null=False),
+            ),
+        ),
+        Case("public_int_keep_null_on_left_join")(
+            # Nulls *left* joined with no nulls = nulls
+            protected_change=AddOneRow(),
+            query=(
+                QueryBuilder("private")
+                .rename({"int_nulls": "new_int"})
+                .join_public(
+                    "public",
+                    join_columns=["new_int"],
+                    how="left",
+                )
+            ),
+            expected_col=(
+                "int_nulls",
+                ColumnDescriptor(ColumnType.INTEGER, allow_null=True),
+            ),
+        ),
+    ]
+)
+def test_join_schema(
+    sdf_special_values: DataFrame,
+    sdf_for_joins: DataFrame,
+    protected_change: ProtectedChange,
+    query: Query,
+    expected_col: Tuple[str, ColumnDescriptor],
+):
+    inf_budget =
PureDPBudget.inf() + sess = ( + Session.Builder() + .with_id_space("default_id_space") + .with_private_dataframe("private", sdf_special_values, protected_change) + .with_private_dataframe("private_2", sdf_for_joins, protected_change) + .with_public_dataframe("public", sdf_for_joins) + .with_privacy_budget(inf_budget) + .build() + ) + sess.create_view(query, "view", cache=False) + schema = sess.get_schema("view") + assert expected_col in schema.items() diff --git a/test/unit/query_expr_compiler/test_measurement_visitor.py b/test/unit/query_expr_compiler/test_measurement_visitor.py index e8791f98..7be38fa6 100644 --- a/test/unit/query_expr_compiler/test_measurement_visitor.py +++ b/test/unit/query_expr_compiler/test_measurement_visitor.py @@ -635,24 +635,6 @@ def test_visit_groupby_count_distinct( ] ), ), - ( - QueryBuilder("private").quantile( - low=-100, - high=100, - name="custom_output_column", - column="null_and_nan", - quantile=0.1, - ), - PureDP(), - NoiseInfo( - [ - { - "noise_mechanism": _NoiseMechanism.EXPONENTIAL, - "noise_parameter": 3.3333333333333326, - } - ] - ), - ), ( QueryBuilder("private") .groupby(KeySet.from_dict({"B": [0, 1]})) @@ -673,26 +655,6 @@ def test_visit_groupby_count_distinct( ] ), ), - ( - QueryBuilder("private") - .groupby(KeySet.from_dict({"B": [0, 1]})) - .quantile( - column="null_and_inf", - name="quantile", - low=123.345, - high=987.65, - quantile=0.25, - ), - PureDP(), - NoiseInfo( - [ - { - "noise_mechanism": _NoiseMechanism.EXPONENTIAL, - "noise_parameter": 3.3333333333333326, - } - ] - ), - ), ( QueryBuilder("private") .groupby(KeySet.from_dict({"A": ["zero"]})) @@ -707,20 +669,6 @@ def test_visit_groupby_count_distinct( ] ), ), - ( - QueryBuilder("private") - .groupby(KeySet.from_dict({"A": ["zero"]})) - .quantile(quantile=0.5, low=0, high=1, column="nan_and_inf"), - RhoZCDP(), - NoiseInfo( - [ - { - "noise_mechanism": _NoiseMechanism.EXPONENTIAL, - "noise_parameter": 2.9814239699997196, - } - ] - ), - ), ( QueryBuilder("private") .groupby(KeySet.from_dict({"A": ["zero"]})) diff --git a/test/unit/query_expr_compiler/test_rewrite_rules.py b/test/unit/query_expr_compiler/test_rewrite_rules.py index 8e253e77..c5b4a249 100644 --- a/test/unit/query_expr_compiler/test_rewrite_rules.py +++ b/test/unit/query_expr_compiler/test_rewrite_rules.py @@ -13,6 +13,9 @@ AverageMechanism, CountDistinctMechanism, CountMechanism, + DropInfinity, + DropNullAndNan, + GetBounds, GroupByBoundedAverage, GroupByBoundedStdev, GroupByBoundedSum, @@ -22,6 +25,7 @@ PrivateSource, QueryExpr, QueryExprVisitor, + ReplaceInfinity, SingleChildQueryExpr, StdevMechanism, SumMechanism, @@ -30,9 +34,10 @@ ) from tmlt.analytics._query_expr_compiler._rewrite_rules import ( CompilationInfo, + add_special_value_handling, select_noise_mechanism, ) -from tmlt.analytics._schema import ColumnDescriptor, ColumnType, Schema +from tmlt.analytics._schema import ColumnDescriptor, ColumnType, FrozenDict, Schema # SPDX-License-Identifier: Apache-2.0 # Copyright Tumult Labs 2025 @@ -391,3 +396,241 @@ def test_recursive_noise_selection(catalog: Catalog) -> None: info = CompilationInfo(output_measure=ApproxDP(), catalog=catalog) got_expr = select_noise_mechanism(info)(expr) assert got_expr == expected_expr + + +@parametrize( + [ + Case()(agg="count"), + Case()(agg="count_distinct"), + ] +) +@parametrize( + [ + Case()(col_desc=ColumnDescriptor(ColumnType.INTEGER, allow_null=False)), + Case()(col_desc=ColumnDescriptor(ColumnType.INTEGER, allow_null=True)), + Case()( + col_desc=ColumnDescriptor( + 
ColumnType.DECIMAL, allow_null=False, allow_nan=False, allow_inf=False + ) + ), + Case()( + col_desc=ColumnDescriptor( + ColumnType.DECIMAL, allow_null=True, allow_nan=True, allow_inf=True + ) + ), + ] +) +def test_special_value_handling_count_unaffected( + agg: str, + col_desc: ColumnDescriptor, +) -> None: + (AggExpr, AggMech) = AGG_CLASSES[agg] + expr = AggExpr( + child=BASE_EXPR, + groupby_keys=KeySet.from_dict({}), + mechanism=AggMech["DEFAULT"], + ) + catalog = Catalog() + catalog.add_private_table("private", {"col": col_desc}) + info = CompilationInfo(output_measure=PureDP(), catalog=catalog) + got_expr = add_special_value_handling(info)(expr) + assert got_expr == expr + + +@parametrize( + [ + # Columns with no special values should be unaffected + Case(f"no_op_null_{col_type}")( + col_desc=ColumnDescriptor( + col_type, allow_null=False, allow_nan=False, allow_inf=False + ), + new_child=BASE_EXPR, + ) + for col_type in [ + ColumnType.INTEGER, + ColumnType.DECIMAL, + ColumnType.DATE, + ColumnType.TIMESTAMP, + ] + ] + + [ + # NaNs and infinities do not matter for non-floats + Case(f"no_op_nan_inf_{col_type}")( + col_desc=ColumnDescriptor( + col_type, allow_null=False, allow_nan=True, allow_inf=True + ), + new_child=BASE_EXPR, + ) + for col_type in [ColumnType.INTEGER, ColumnType.DATE, ColumnType.TIMESTAMP] + ] + + [ + # Nulls must be dropped if needed + Case(f"drop_null_{col_type}")( + col_desc=ColumnDescriptor( + col_type, allow_null=True, allow_nan=False, allow_inf=False + ), + new_child=DropNullAndNan(child=BASE_EXPR, columns=("col",)), + ) + for col_type in [ + ColumnType.INTEGER, + ColumnType.DECIMAL, + ColumnType.DATE, + ColumnType.TIMESTAMP, + ] + ] + + [ + # NaNs must also be dropped if needed + Case("drop_nan")( + col_desc=ColumnDescriptor( + ColumnType.DECIMAL, allow_null=False, allow_nan=True, allow_inf=False + ), + new_child=DropNullAndNan(child=BASE_EXPR, columns=("col",)), + ), + # Only one pass is enough to drop both nulls and NaNs + Case("drop_both")( + col_desc=ColumnDescriptor( + ColumnType.DECIMAL, allow_null=True, allow_nan=True, allow_inf=False + ), + new_child=DropNullAndNan(child=BASE_EXPR, columns=("col",)), + ), + # If not handled, infinities must be clamped to the clamping bounds + Case("clamp_inf")( + col_desc=ColumnDescriptor( + ColumnType.DECIMAL, allow_null=False, allow_nan=False, allow_inf=True + ), + new_child=ReplaceInfinity( + child=BASE_EXPR, replace_with=FrozenDict.from_dict({"col": (0, 1)}) + ), + ), + # Handling both kinds of special values at once. This would fail if the two + # value handling exprs are in the wrong order; this is not ideal, but ah well. 
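+        # The expected order is ReplaceInfinity(DropNullAndNan(...)): nulls and NaNs
+        # are dropped first, then the remaining infinities are replaced.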
+ Case("drop_nan_clamp_inf")( + col_desc=ColumnDescriptor( + ColumnType.DECIMAL, allow_null=True, allow_nan=True, allow_inf=True + ), + new_child=ReplaceInfinity( + child=DropNullAndNan(child=BASE_EXPR, columns=("col",)), + replace_with=FrozenDict.from_dict({"col": (0, 1)}), + ), + ), + ] +) +@parametrize( + [ + Case()(agg="sum"), + Case()(agg="average"), + Case()(agg="stdev"), + Case()(agg="variance"), + ] +) +def test_special_value_handling_numeric_aggregations( + agg: str, + col_desc: ColumnDescriptor, + new_child: QueryExpr, +) -> None: + (AggExpr, AggMech) = AGG_CLASSES[agg] + expr = AggExpr( + child=BASE_EXPR, + measure_column="col", + low=0, + high=1, + groupby_keys=KeySet.from_dict({}), + mechanism=AggMech["DEFAULT"], + ) + catalog = Catalog() + catalog.add_private_table("private", {"col": col_desc}) + info = CompilationInfo(output_measure=PureDP(), catalog=catalog) + got_expr = add_special_value_handling(info)(expr) + assert got_expr == replace( + expr, + child=new_child, + ) + + +@parametrize( + [ + # Columns with no special values should be unaffected + Case(f"no-op-{col_type}")( + col_desc=ColumnDescriptor( + col_type, allow_null=False, allow_nan=False, allow_inf=False + ), + new_child=BASE_EXPR, + ) + for col_type in [ + ColumnType.INTEGER, + ColumnType.DECIMAL, + ColumnType.DATE, + ColumnType.TIMESTAMP, + ] + ] + + [ + # NaNs and infinities do not matter for non-floats + Case(f"no-op-nan-inf-{col_type}")( + col_desc=ColumnDescriptor( + col_type, allow_null=False, allow_nan=True, allow_inf=True + ), + new_child=BASE_EXPR, + ) + for col_type in [ColumnType.INTEGER, ColumnType.DATE, ColumnType.TIMESTAMP] + ] + + [ + # Nulls must be dropped if needed + Case(f"drop-nulls-{col_type}")( + col_desc=ColumnDescriptor( + col_type, allow_null=True, allow_nan=False, allow_inf=False + ), + new_child=DropNullAndNan(child=BASE_EXPR, columns=("col",)), + ) + for col_type in [ + ColumnType.INTEGER, + ColumnType.DECIMAL, + ColumnType.DATE, + ColumnType.TIMESTAMP, + ] + ] + + [ + # NaNs must also be dropped if needed + Case("drop-nan")( + col_desc=ColumnDescriptor( + ColumnType.DECIMAL, allow_null=False, allow_nan=True, allow_inf=False + ), + new_child=DropNullAndNan(child=BASE_EXPR, columns=("col",)), + ), + # Same for infinities (contrary to other aggregations which use clamping) + Case("drop-inf")( + col_desc=ColumnDescriptor( + ColumnType.DECIMAL, allow_null=False, allow_nan=False, allow_inf=True + ), + new_child=DropInfinity(child=BASE_EXPR, columns=("col",)), + ), + # And both kinds of special values must be handled + Case("drop-nan-and-inf")( + col_desc=ColumnDescriptor( + ColumnType.DECIMAL, allow_null=True, allow_nan=True, allow_inf=True + ), + new_child=DropInfinity( + child=DropNullAndNan(child=BASE_EXPR, columns=("col",)), + columns=("col",), + ), + ), + ] +) +def test_special_value_handling_get_bounds( + col_desc: ColumnDescriptor, + new_child: QueryExpr, +) -> None: + expr = GetBounds( + child=BASE_EXPR, + measure_column="col", + groupby_keys=KeySet.from_dict({}), + lower_bound_column="lower", + upper_bound_column="upper", + ) + catalog = Catalog() + catalog.add_private_table("private", {"col": col_desc}) + info = CompilationInfo(output_measure=PureDP(), catalog=catalog) + got_expr = add_special_value_handling(info)(expr) + assert got_expr == replace( + expr, + child=new_child, + ) From eb0e25b67e38d33a8fb03f4d559dea7e29673e55 Mon Sep 17 00:00:00 2001 From: Ted Date: Tue, 4 Nov 2025 09:34:34 +0100 Subject: [PATCH 22/25] Simplify assert_frame_equal_with_sort (#113) Co-authored-by: 
Damien Desfontaines --- test/conftest.py | 4 +--- test/system/session/rows/test_add_max_rows.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index 4984872c..9df58735 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -229,7 +229,6 @@ def assert_frame_equal_with_sort( first_df: pd.DataFrame, second_df: pd.DataFrame, sort_columns: Optional[Sequence[str]] = None, - **kwargs: Any, ): """Asserts that the two Pandas DataFrames are equal. @@ -240,7 +239,6 @@ def assert_frame_equal_with_sort( first_df: First dataframe to compare. second_df: Second dataframe to compare. sort_columns: Names of column to sort on. By default sorts by all columns. - **kwargs: Keyword arguments that will be passed to assert_frame_equal(). """ if sorted(first_df.columns) != sorted(second_df.columns): raise ValueError( @@ -258,7 +256,7 @@ def assert_frame_equal_with_sort( # We explicitly pass check_dtype=False the equality check, so that identical # DataFrames which differ only in dtypes (like one with an int64 column and # the other with an Int64 column) are considered equal. - pd.testing.assert_frame_equal(first_df, second_df, check_dtype=False, **kwargs) + pd.testing.assert_frame_equal(first_df, second_df, check_dtype=False) def create_mock_measurement( diff --git a/test/system/session/rows/test_add_max_rows.py b/test/system/session/rows/test_add_max_rows.py index a85b36fc..3b7cbb2c 100644 --- a/test/system/session/rows/test_add_max_rows.py +++ b/test/system/session/rows/test_add_max_rows.py @@ -634,7 +634,7 @@ def test_create_view_with_stability( f=lambda row: [{}, {}], new_column_types={}, augment=True, - max_rows=1, + max_rows=2, ) session.create_view(transformation_query, "flatmap_transformation", cache=False) @@ -644,7 +644,7 @@ def test_create_view_with_stability( .sum("X", 0, 3, name="sum") ) actual = session.evaluate(sum_query, privacy_budget) - assert_frame_equal_with_sort(actual.toPandas(), expected, rtol=1) + assert_frame_equal_with_sort(actual.toPandas(), expected) @pytest.mark.parametrize( "starting_budget,partition_budget", From 679f85543c9f05ad2ab8f52ff4db72803ea29b26 Mon Sep 17 00:00:00 2001 From: Ted Date: Sun, 9 Nov 2025 22:13:33 +0100 Subject: [PATCH 23/25] Allow single strings in QueryBuilder method arguments that take a list of strings (#115) * add single-columns logic in the querybuilder * add tests * add changelog * also simplify types for GroupByCountDistinct * make mypy happy * fix test, review comments --------- Co-authored-by: Damien Desfontaines --- CHANGELOG.rst | 12 ++- src/tmlt/analytics/_query_expr.py | 11 ++- src/tmlt/analytics/query_builder.py | 49 +++++----- test/unit/test_query_builder.py | 133 ++++++++++++++++++++-------- 4 files changed, 139 insertions(+), 66 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0d5ccec5..ceb4cec1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -15,10 +15,14 @@ Changed - Dropped support for Python 3.9, as it has reached end-of-life. - Dropped support for pyspark <3.5.0 on Macs after discovering that these configurations frequently crash. Older versions of the library may also be affected. - Aggregation mechanisms can now be specified as strings instead of enums, e.g. ``"laplace"`` instead of ``CountMechanism.LAPLACE`` or ``SumMechanism.LAPLACE``. -- Removed previously deprecated argument ``max_num_rows`` to ``flat_map``. Use ``max_rows`` instead. -- Removed previously deprecated argument ``cols`` to ``count_distinct``. Use ``columns`` instead. 
-- Infinity values are now automatically dropped before a floating-point column is passed to `get_bounds`. (The documentation previously claimed that this was done, but this was not the case.) -- Fixed the documentation of the behavior of some numeric aggregations (`sum`, `average`, `stdev`, `variance`, `quantile`) to match the actual behavior: infinity values are clamped using the specified bounds before being passed to the aggregation function, not dropped. +- Removed previously deprecated argument ``max_num_rows`` to :meth:`~tmlt.analytics.QueryBuilder.flat_map`. Use ``max_rows`` instead. +- Removed previously deprecated argument ``cols`` to :meth:`~tmlt.analytics.QueryBuilder.count_distinct`. Use ``columns`` instead. +- Infinity values are now automatically dropped before a floating-point column is passed to :meth:`~tmlt.analytics.QueryBuilder.get_bounds`. (The documentation previously claimed that this was done, but this was not the case.) +- Fixed the documentation of the behavior of some numeric aggregations (:meth:`~tmlt.analytics.QueryBuilder.sum`, :meth:`~tmlt.analytics.QueryBuilder.average`, :meth:`~tmlt.analytics.QueryBuilder.stdev`, :meth:`~tmlt.analytics.QueryBuilder.variance`, :meth:`~tmlt.analytics.QueryBuilder.quantile`) to match the actual behavior: infinity values are clamped using the specified bounds before being passed to the aggregation function, not dropped. +- Single column names can now be passed as a string to :meth:`~tmlt.analytics.QueryBuilder.groupby`, :meth:`~tmlt.analytics.QueryBuilder.get_bounds`, :meth:`~tmlt.analytics.QueryBuilder.select`, :meth:`~tmlt.analytics.QueryBuilder.drop_infinity`, and :meth:`~tmlt.analytics.QueryBuilder.drop_null_and_nan`. + + + .. _v0.20.2: diff --git a/src/tmlt/analytics/_query_expr.py b/src/tmlt/analytics/_query_expr.py index 54a8874d..f5a4213e 100644 --- a/src/tmlt/analytics/_query_expr.py +++ b/src/tmlt/analytics/_query_expr.py @@ -245,17 +245,16 @@ def accept(self, visitor: "QueryExprVisitor") -> Any: class GetGroups(SingleChildQueryExpr): """Returns groups based on the geometric partition selection for these columns.""" - columns: Optional[Tuple[str, ...]] = None + columns: Tuple[str, ...] = tuple() """The columns used for geometric partition selection. - If empty or none are provided, will use all of the columns in the table - for partition selection. + If empty, will use all of the columns in the table for partition selection. """ def __post_init__(self): """Checks arguments to constructor.""" check_type(self.child, QueryExpr) - check_type(self.columns, Optional[Tuple[str, ...]]) + check_type(self.columns, Tuple[str, ...]) def _validate(self, input_schema: Schema): """Validation checks for this QueryExpr.""" @@ -1667,7 +1666,7 @@ class GroupByCountDistinct(SingleChildQueryExpr): groupby_keys: Union[KeySet, Tuple[str, ...]] """The keys, or columns list to collect keys from, to be grouped on.""" - columns_to_count: Optional[Tuple[str, ...]] = None + columns_to_count: Tuple[str, ...] = tuple() """The columns that are compared when determining if two rows are distinct. If empty, will count all distinct rows. 
@@ -1687,7 +1686,7 @@ def __post_init__(self): if isinstance(self.groupby_keys, tuple): config.features.auto_partition_selection.raise_if_disabled() check_type(self.child, QueryExpr) - check_type(self.columns_to_count, Optional[Tuple[str, ...]]) + check_type(self.columns_to_count, Tuple[str, ...]) check_type(self.groupby_keys, (KeySet, Tuple[str, ...])) check_type(self.output_column, str) check_type(self.mechanism, CountDistinctMechanism) diff --git a/src/tmlt/analytics/query_builder.py b/src/tmlt/analytics/query_builder.py index 478e9f33..c879bd23 100644 --- a/src/tmlt/analytics/query_builder.py +++ b/src/tmlt/analytics/query_builder.py @@ -764,7 +764,9 @@ def replace_infinity( ) return self - def drop_null_and_nan(self, columns: Optional[List[str]] = None) -> "QueryBuilder": + def drop_null_and_nan( + self, columns: Optional[List[str] | str] = None + ) -> "QueryBuilder": """Removes rows containing null or NaN values. .. note:: @@ -862,14 +864,16 @@ def drop_null_and_nan(self, columns: Optional[List[str]] = None) -> "QueryBuilde """ if columns is None: columns = [] - if columns is None: - raise AnalyticsInternalError("columns parameter is None.") + if isinstance(columns, str): + columns = [columns] self._query_expr = DropNullAndNan( child=self._query_expr, columns=tuple(columns) ) return self - def drop_infinity(self, columns: Optional[List[str]] = None) -> "QueryBuilder": + def drop_infinity( + self, columns: Optional[List[str] | str] = None + ) -> "QueryBuilder": """Remove rows containing infinite values. .. @@ -952,9 +956,8 @@ def drop_infinity(self, columns: Optional[List[str]] = None) -> "QueryBuilder": """ if columns is None: columns = [] - if columns is None: - raise AnalyticsInternalError("columns parameter is None.") - + if isinstance(columns, str): + columns = [columns] self._query_expr = DropInfinity(child=self._query_expr, columns=tuple(columns)) return self @@ -1086,7 +1089,7 @@ def filter(self, condition: str) -> "QueryBuilder": self._query_expr = Filter(child=self._query_expr, condition=condition) return self - def select(self, columns: Sequence[str]) -> "QueryBuilder": + def select(self, columns: Sequence[str] | str) -> "QueryBuilder": """Selects the specified columns, dropping the others. .. @@ -1133,9 +1136,11 @@ def select(self, columns: Sequence[str]) -> "QueryBuilder": Args: columns: The columns to select. """ + if isinstance(columns, str): + columns = [columns] self._query_expr = Select( child=self._query_expr, - columns=tuple(columns) if columns is not None else None, + columns=tuple(columns), ) return self @@ -1725,7 +1730,7 @@ def enforce(self, constraint: Constraint) -> "QueryBuilder": self._query_expr = EnforceConstraint(self._query_expr, constraint) return self - def get_groups(self, columns: Optional[List[str]] = None) -> Query: + def get_groups(self, columns: Optional[List[str] | str] = None) -> Query: """Returns a query that gets combinations of values in the listed columns. .. note:: @@ -1780,8 +1785,11 @@ def get_groups(self, columns: Optional[List[str]] = None) -> Query: :class:`~tmlt.analytics.ProtectedChange` of :class:`~tmlt.analytics.AddRowsWithID`. 
""" - cols = tuple(columns) if columns is not None else None - query_expr = GetGroups(child=self._query_expr, columns=cols) + if columns is None: + columns = [] + if isinstance(columns, str): + columns = [columns] + query_expr = GetGroups(child=self._query_expr, columns=tuple(columns)) return Query(query_expr) def get_bounds( @@ -2078,7 +2086,7 @@ def count( def count_distinct( self, - columns: Optional[List[str]] = None, + columns: Optional[List[str] | str] = None, name: Optional[str] = None, mechanism: Union[ CountDistinctMechanism, Literal["default", "laplace", "gaussian"] @@ -2894,7 +2902,7 @@ def count( def count_distinct( self, - columns: Optional[List[str]] = None, + columns: Optional[List[str] | str] = None, name: Optional[str] = None, mechanism: Union[ CountDistinctMechanism, Literal["default", "laplace", "gaussian"] @@ -2958,12 +2966,13 @@ def count_distinct( mechanism: Choice of noise mechanism (case-insensitive). By default, the framework automatically selects an appropriate mechanism. """ - columns_to_count: Optional[List[str]] = None - if columns is not None and len(columns) > 0: - columns_to_count = list(columns) + if columns is None: + columns = [] + if isinstance(columns, str): + columns = [columns] if not name: - if columns_to_count: - name = f"count_distinct({', '.join(columns_to_count)})" + if columns: + name = f"count_distinct({', '.join(columns)})" else: name = "count_distinct" if isinstance(mechanism, str): @@ -2976,7 +2985,7 @@ def count_distinct( ) from e query_expr = GroupByCountDistinct( child=self._query_expr, - columns_to_count=tuple(columns_to_count) if columns_to_count else None, + columns_to_count=tuple(columns), groupby_keys=self._groupby_keys, output_column=name, mechanism=mechanism, diff --git a/test/unit/test_query_builder.py b/test/unit/test_query_builder.py index 66f6bc06..8abeabfc 100644 --- a/test/unit/test_query_builder.py +++ b/test/unit/test_query_builder.py @@ -10,7 +10,7 @@ import datetime import re from dataclasses import FrozenInstanceError -from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Union +from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast import pandas as pd import pytest @@ -33,6 +33,7 @@ DropNullAndNan, Filter, FlatMap, + GetGroups, GroupByBoundedAverage, GroupByBoundedStdev, GroupByBoundedSum, @@ -252,9 +253,9 @@ def test_filter(): assert root_expr.source_id == PRIVATE_ID -def test_select(): +@pytest.mark.parametrize("columns", ["A", ["A"]]) +def test_select(columns: List[str] | str): """QueryBuilder select works as expected.""" - columns = ["A"] query = ( root_builder() .select(columns) @@ -270,7 +271,7 @@ def test_select(): select_expr = query_expr.child assert isinstance(select_expr, Select) - assert select_expr.columns == tuple(columns) + assert select_expr.columns == ("A",) root_expr = select_expr.child assert isinstance(root_expr, PrivateSource) @@ -732,46 +733,90 @@ def test_replace_infinity( assert replace_expr.replace_with == FrozenDict.from_dict(expected_replace_with) -@pytest.mark.parametrize("columns", [([]), (None), (["A"]), (["A", "B"])]) -def test_drop_null_and_nan(columns: Optional[List[str]]) -> None: +@pytest.mark.parametrize( + "columns,expected_columns", + [ + ([], tuple()), + (None, tuple()), + ("A", ("A",)), + (["A"], ("A",)), + (["A", "B"], ("A", "B")), + ], +) +def test_drop_null_and_nan( + columns: Optional[List[str] | str], expected_columns: Tuple[str, ...] 
+) -> None:
     """QueryBuilder.drop_null_and_nan works as expected."""
     query = root_builder().drop_null_and_nan(columns).count()
     assert isinstance(query, Query)
+
     query_expr = query._query_expr
     assert isinstance(query_expr, GroupByCount)
+
     drop_expr = query_expr.child
     assert isinstance(drop_expr, DropNullAndNan)
+    assert drop_expr.columns == expected_columns
 
     root_expr = drop_expr.child
     assert isinstance(root_expr, PrivateSource)
     assert root_expr.source_id == PRIVATE_ID
 
-    expected_columns: List[str] = []
-    if columns is not None:
-        expected_columns = columns
-    assert drop_expr.columns == tuple(expected_columns)
-
 
-@pytest.mark.parametrize("columns", [([]), (None), (["A"]), (["A", "B"])])
-def test_drop_infinity(columns: Optional[List[str]]) -> None:
+@pytest.mark.parametrize(
+    "columns,expected_columns",
+    [
+        ([], tuple()),
+        (None, tuple()),
+        ("A", ("A",)),
+        (["A"], ("A",)),
+        (["A", "B"], ("A", "B")),
+    ],
+)
+def test_drop_infinity(
+    columns: Optional[List[str] | str], expected_columns: Tuple[str, ...]
+) -> None:
     """QueryBuilder.drop_infinity works as expected."""
     query = root_builder().drop_infinity(columns).count()
     assert isinstance(query, Query)
 
-    query_expr = query._query_expr
+    query_expr = query._query_expr
     assert isinstance(query_expr, GroupByCount)
+
     drop_expr = query_expr.child
     assert isinstance(drop_expr, DropInfinity)
+    assert drop_expr.columns == expected_columns
 
     root_expr = drop_expr.child
     assert isinstance(root_expr, PrivateSource)
     assert root_expr.source_id == PRIVATE_ID
 
-    expected_columns: List[str] = []
-    if columns is not None:
-        expected_columns = columns
-    assert drop_expr.columns == tuple(expected_columns)
+
+@pytest.mark.parametrize(
+    "columns,expected_columns",
+    [
+        ([], tuple()),
+        (None, tuple()),
+        ("A", ("A",)),
+        (["A"], ("A",)),
+        (["A", "B"], ("A", "B")),
+    ],
+)
+def test_get_groups(
+    columns: Optional[List[str] | str], expected_columns: Tuple[str, ...]
+) -> None:
+    """QueryBuilder.get_groups works as expected."""
+    query = root_builder().get_groups(columns)
+
+    assert isinstance(query, Query)
+
+    query_expr = query._query_expr
+    assert isinstance(query_expr, GetGroups)
+    assert query_expr.columns == expected_columns
+
+    root_expr = query_expr.child
+    assert isinstance(root_expr, PrivateSource)
+    assert root_expr.source_id == PRIVATE_ID
 
 
 class _TestAggregationsData:
         self,
         query: QueryExpr,
         expected_groupby_keys: KeySet,
-        expected_columns: List[str],
+        expected_columns: Optional[Tuple[str, ...]],
         expected_output_column: str,
     ):
         """Confirm that a count_distinct query is constructed correctly."""
         assert isinstance(query, GroupByCountDistinct)
-        if expected_columns:
-            assert query.columns_to_count == tuple(expected_columns)
-        else:
-            assert query.columns_to_count is None
+        assert query.columns_to_count == expected_columns
         assert query.groupby_keys == expected_groupby_keys
         assert query.output_column == expected_output_column
         self.assert_root_expr(query.child)
 
     @pytest.mark.parametrize(
-        "name,expected_name,columns",
+        "name,expected_name,columns,expected_columns",
         [
-            (None, "count_distinct", None),
-            ("total", "total", ["Col1", "Col2"]),
-            (None, "count_distinct(A, B)", ["A", "B"]),
+            (None, "count_distinct", None, tuple()),
+            ("total", "total", [], tuple()),
+            ("total", "total", "Col1", ("Col1",)),
+            ("total", "total", ["Col1", "Col2"], ("Col1", "Col2")),
+            (None, "count_distinct(A, B)", ["A", "B"], ("A", "B")),
         ],
     )
     def test_count_distinct_ungrouped(
-        self, spark, name: Optional[str], expected_name: str, columns: List[str]
+        self,
+        spark,
+        name: Optional[str],
+        expected_name: str,
+        columns: Optional[List[str] | str],
+        expected_columns: Tuple[str, ...],
     ):
         """Query returned by ungrouped count_distinct is correct."""
         query = root_builder().count_distinct(columns=columns, name=name)
         assert isinstance(query, Query)
         query_expr = query._query_expr
         self.assert_count_distinct_query_correct(
             query_expr,
             self._keys_from_pandas(spark, pd.DataFrame()),
-            columns,
+            expected_columns,
             expected_name,
         )
 
     @pytest.mark.parametrize(
-        "keys_df,name,expected_name,columns",
+        "keys_df,name,expected_name,columns,expected_columns",
         (
             (keys_df, *options)
             for keys_df in _TestAggregationsData.keyset_test_cases
-            for options in (
-                (None, "count_distinct", None),
-                ("total", "total", ["Col1", "Col2"]),
-                (None, "count_distinct(X, Y)", ["X", "Y"]),
+            # mypy is being a pain here, requiring an explicit cast for some reason
+            for options in cast(
+                List[
+                    Tuple[
+                        Optional[str], str, Optional[List[str] | str], Tuple[str, ...]
+                    ]
+                ],
+                [
+                    (None, "count_distinct", None, tuple()),
+                    (None, "count_distinct", [], tuple()),
+                    ("total", "total", [], tuple()),
+                    (None, "count_distinct(Col1)", "Col1", ("Col1",)),
+                    ("total", "total", "Col1", ("Col1",)),
+                    (None, "count_distinct(A, B)", ["A", "B"], ("A", "B")),
+                ],
             )
         ),
     )
@@ -922,7 +982,8 @@ def test_count_distinct_keyset(
         keys_df: pd.DataFrame,
         name: Optional[str],
         expected_name: str,
-        columns: List[str],
+        columns: Optional[List[str] | str],
+        expected_columns: Tuple[str, ...],
     ):
         """Query returned by groupby with KeySet and count_distinct is correct."""
         keys = self._keys_from_pandas(spark, keys_df)
@@ -930,7 +991,7 @@ def test_count_distinct_keyset(
         assert isinstance(query, Query)
         query_expr = query._query_expr
         self.assert_count_distinct_query_correct(
-            query_expr, keys, columns, expected_name
+            query_expr, keys, expected_columns, expected_name
         )
 
     def assert_common_query_fields_correct(
From 68c0ed799cafd2f1bdff2c015dabe0b0d6c3e433 Mon Sep 17 00:00:00 2001
From: Ted
Date: Wed, 12 Nov 2025 16:25:27 +0100
Subject: [PATCH 24/25] Use Core's util for comparing DataFrames everywhere in
 tests (#117)

* Replace the local assert_frame_equal_with_sort helper with Core's
  assert_dataframe_equal throughout the tests

* Fix an affected test

---------

Co-authored-by: Damien Desfontaines
---
 test/conftest.py                              | 36 +--------------
 .../ids/queries/test_flat_map_by_id.py        | 10 ++---
 .../ids/test_count_distinct_optimization.py   |  7 ++--
 test/system/session/ids/test_partition.py     | 18 +++-----
 test/system/session/rows/test_add_max_rows.py | 31 ++++++--------
 test/system/session/test_special_values.py    | 26 ++++++------
 test/unit/keysets/test_keyset.py              | 42 ++++++++-----------
 test/unit/keysets/test_product_keyset.py      | 19 ++++-----
 .../test_measurement_visitor.py               |  5 ++-
 .../transformation_visitor/conftest.py        |  5 +--
 test/unit/test_query_builder.py               |  7 +---
 test/unit/test_query_expr_compiler.py         | 19 ++++-----
 test/unit/test_query_expression.py            |  4 +-
 test/unit/test_session.py                     | 41 +++++++++---------
 14 files changed, 103 insertions(+), 167 deletions(-)

diff --git a/test/conftest.py b/test/conftest.py
index 9df58735..cef9a322 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -6,7 +6,7 @@
 # TODO(#2206): Import these fixtures from core once it is rewritten
 
 import logging
-from typing import Any, Dict, List, Optional, Sequence, TypeVar, Union, cast, overload
+from typing import Any, Dict, List, TypeVar, Union, cast, overload
 from unittest.mock import Mock, create_autospec
 
 import numpy as np
@@ -225,40 +225,6 @@ def pyspark_with_progress():
     return spark
 
 
-def assert_frame_equal_with_sort(
-    first_df: pd.DataFrame,
-    second_df: pd.DataFrame,
-    sort_columns: Optional[Sequence[str]] = None,
-):
-    """Asserts that the two Pandas DataFrames are equal.
-
-    Wrapper around pandas test function. Both dataframes are sorted
-    since the ordering in Spark is not guaranteed.
-
-    Args:
-        first_df: First dataframe to compare.
-        second_df: Second dataframe to compare.
-        sort_columns: Names of column to sort on. By default sorts by all columns.
-    """
-    if sorted(first_df.columns) != sorted(second_df.columns):
-        raise ValueError(
-            "DataFrames must have matching columns. "
-            f"first_df: {sorted(first_df.columns)}. "
-            f"second_df: {sorted(second_df.columns)}."
- ) - if first_df.empty and second_df.empty: - return - if sort_columns is None: - sort_columns = list(first_df.columns) - if sort_columns: - first_df = first_df.set_index(sort_columns).sort_index().reset_index() - second_df = second_df.set_index(sort_columns).sort_index().reset_index() - # We explicitly pass check_dtype=False the equality check, so that identical - # DataFrames which differ only in dtypes (like one with an int64 column and - # the other with an Int64 column) are considered equal. - pd.testing.assert_frame_equal(first_df, second_df, check_dtype=False) - - def create_mock_measurement( input_domain: Domain = NumpyIntegerDomain(), input_metric: Metric = AbsoluteDifference(), diff --git a/test/system/session/ids/queries/test_flat_map_by_id.py b/test/system/session/ids/queries/test_flat_map_by_id.py index 114f3219..a42b6f26 100644 --- a/test/system/session/ids/queries/test_flat_map_by_id.py +++ b/test/system/session/ids/queries/test_flat_map_by_id.py @@ -9,6 +9,7 @@ import pytest from py4j.protocol import Py4JJavaError from pyspark.sql import Row +from tmlt.core.utils.testing import assert_dataframe_equal from tmlt.analytics import ( AddRowsWithID, @@ -19,7 +20,6 @@ QueryBuilder, ) -from .....conftest import assert_frame_equal_with_sort from .conftest import make_session @@ -42,7 +42,7 @@ def test_simple(spark): ) expected = pd.DataFrame({"sum": [15]}) result = sess.evaluate(q, budget) - assert_frame_equal_with_sort(result.toPandas(), expected) + assert_dataframe_equal(result, expected) def test_map_inputs(spark): @@ -67,7 +67,7 @@ def f(rows): q = QueryBuilder("t").flat_map_by_id(f, {}).enforce(MaxRowsPerID(1)).count() expected = pd.DataFrame({"count": [0]}) result = sess.evaluate(q, budget) - assert_frame_equal_with_sort(result.toPandas(), expected) + assert_dataframe_equal(result, expected) def test_id_conflict(spark): @@ -318,7 +318,7 @@ def test_nulls_nans_infs_allowed( ) result_df = sess.evaluate(base_query.enforce(MaxRowsPerID(10)).count(), budget) - assert_frame_equal_with_sort(result_df.toPandas(), expected_df) + assert_dataframe_equal(result_df, expected_df) @pytest.mark.xfail(reason="tumult-labs/tumult#3298") @@ -380,4 +380,4 @@ def test_no_output_columns(spark): ) expected = pd.DataFrame({"count": [15]}) result = sess.evaluate(q, budget) - assert_frame_equal_with_sort(result.toPandas(), expected) + assert_dataframe_equal(result, expected) diff --git a/test/system/session/ids/test_count_distinct_optimization.py b/test/system/session/ids/test_count_distinct_optimization.py index 0fa36d88..f2b265bc 100644 --- a/test/system/session/ids/test_count_distinct_optimization.py +++ b/test/system/session/ids/test_count_distinct_optimization.py @@ -6,6 +6,7 @@ import pandas as pd import pytest +from tmlt.core.utils.testing import assert_dataframe_equal from tmlt.analytics import ( KeySet, @@ -17,7 +18,6 @@ ) from tmlt.analytics._query_expr import QueryExpr -from ....conftest import assert_frame_equal_with_sort from ..conftest import INF_BUDGET, INF_BUDGET_ZCDP _KEYSET = KeySet.from_dict({"group": ["A", "B"]}) @@ -68,9 +68,8 @@ def test_id_only(base_query: QueryBuilder, session): ) def test_id_only_grouped(query: QueryBuilder, expected_res: pd.DataFrame, session): """Test grouped inference of count-distinct constraints.""" - res = session.evaluate(query, session.remaining_privacy_budget).toPandas() - - assert_frame_equal_with_sort(res, expected_res) + res = session.evaluate(query, session.remaining_privacy_budget) + assert_dataframe_equal(res, expected_res) 
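
The change applied at every call site in this patch follows one mechanical
pattern. A minimal sketch, assuming (as the hunks above suggest) that Core's
assert_dataframe_equal takes the Spark DataFrame directly, compares it against
a pandas DataFrame of expected values, and, like the old helper, does not
depend on row order; `result` and `expected` are illustrative names:

    from tmlt.core.utils.testing import assert_dataframe_equal

    # Before: convert the Spark result to pandas, then compare using the
    # sort-based helper that used to live in test/conftest.py:
    #     assert_frame_equal_with_sort(result.toPandas(), expected)

    # After: hand the Spark DataFrame straight to Core's utility.
    assert_dataframe_equal(result, expected)
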
@pytest.mark.parametrize( diff --git a/test/system/session/ids/test_partition.py b/test/system/session/ids/test_partition.py index 1a34c649..12239bd3 100644 --- a/test/system/session/ids/test_partition.py +++ b/test/system/session/ids/test_partition.py @@ -9,6 +9,7 @@ import sympy as sp from tmlt.core.metrics import AddRemoveKeys as CoreAddRemoveKeys from tmlt.core.metrics import DictMetric, SymmetricDifference +from tmlt.core.utils.testing import assert_dataframe_equal from tmlt.analytics import ( KeySet, @@ -20,7 +21,6 @@ ) from tmlt.analytics._table_identifier import NamedTable, TableCollection -from ....conftest import assert_frame_equal_with_sort from ..conftest import INF_BUDGET, INF_BUDGET_ZCDP _KEYSET = KeySet.from_dict({"group": ["A", "B"]}) @@ -127,16 +127,12 @@ def test_partition_and_create_with_MaxRowsPerID(session, table_stability): QueryBuilder("part0").count(), session.remaining_privacy_budget, ) - assert_frame_equal_with_sort( - answer_session2.toPandas(), pd.DataFrame({"count": [4]}) - ) + assert_dataframe_equal(answer_session2, pd.DataFrame({"count": [4]})) answer_session3 = session3.evaluate( QueryBuilder("part1").count(), session.remaining_privacy_budget, ) - assert_frame_equal_with_sort( - answer_session3.toPandas(), pd.DataFrame({"count": [1]}) - ) + assert_dataframe_equal(answer_session3, pd.DataFrame({"count": [1]})) # pylint: disable=protected-access assert session2._input_metric == DictMetric( {NamedTable("part0"): SymmetricDifference()} @@ -183,16 +179,12 @@ def test_partition_and_create_with_MaxGroupsPerID(session, table_stability): QueryBuilder("part0").enforce(MaxRowsPerID(2)).count(), session.remaining_privacy_budget, ) - assert_frame_equal_with_sort( - answer_session2.toPandas(), pd.DataFrame({"count": [4]}) - ) + assert_dataframe_equal(answer_session2, pd.DataFrame({"count": [4]})) answer_session3 = session3.evaluate( QueryBuilder("part1").enforce(MaxRowsPerID(2)).count(), session.remaining_privacy_budget, ) - assert_frame_equal_with_sort( - answer_session3.toPandas(), pd.DataFrame({"count": [1]}) - ) + assert_dataframe_equal(answer_session3, pd.DataFrame({"count": [1]})) # pylint: disable=protected-access assert session2._input_metric == DictMetric( {TableCollection("a"): CoreAddRemoveKeys({NamedTable("part0"): "id"})} diff --git a/test/system/session/rows/test_add_max_rows.py b/test/system/session/rows/test_add_max_rows.py index 3b7cbb2c..7db4db75 100644 --- a/test/system/session/rows/test_add_max_rows.py +++ b/test/system/session/rows/test_add_max_rows.py @@ -13,7 +13,7 @@ from tmlt.core.measures import ApproxDP, PureDP, RhoZCDP from tmlt.core.utils.exact_number import ExactNumber from tmlt.core.utils.parameters import calculate_noise_scale -from tmlt.core.utils.testing import Case, parametrize +from tmlt.core.utils.testing import Case, assert_dataframe_equal, parametrize from tmlt.analytics import ( AddMaxRowsInMaxGroups, @@ -44,7 +44,6 @@ ) from tmlt.analytics._table_identifier import NamedTable -from ....conftest import assert_frame_equal_with_sort from .conftest import EVALUATE_TESTS Row = Dict[str, Any] @@ -110,7 +109,7 @@ def test_queries_privacy_budget_infinity_puredp( query_expr_or_builder, privacy_budget=PureDPBudget(float("inf")) ) assert isinstance(actual_sdf, DataFrame) - assert_frame_equal_with_sort(actual_sdf.toPandas(), expected_df) + assert_dataframe_equal(actual_sdf, expected_df) @pytest.mark.parametrize( "query_expr_or_builder,expected_expr,expected_df", @@ -191,7 +190,7 @@ def test_queries_privacy_budget_infinity_rhozcdp( 
query_expr_or_builder, privacy_budget=RhoZCDPBudget(float("inf")) ) assert isinstance(actual_sdf, DataFrame) - assert_frame_equal_with_sort(actual_sdf.toPandas(), expected_df) + assert_dataframe_equal(actual_sdf, expected_df) @pytest.mark.parametrize( "query_expr,session_budget,query_budget,expected", @@ -297,7 +296,7 @@ def test_private_join_privacy_budget_infinity(self, privacy_budget: PrivacyBudge ) actual_sdf = session.evaluate(query_builder, privacy_budget=privacy_budget) assert isinstance(actual_sdf, DataFrame) - assert_frame_equal_with_sort(actual_sdf.toPandas(), expected_df) + assert_dataframe_equal(actual_sdf, expected_df) @pytest.mark.parametrize( "mechanism", [(CountMechanism.DEFAULT), (CountMechanism.LAPLACE)] @@ -412,7 +411,7 @@ def duplicate_rows(_: Row) -> List[Row]: ) actual_sdf = session.evaluate(query, session.remaining_privacy_budget) assert isinstance(actual_sdf, DataFrame) - assert_frame_equal_with_sort(actual_sdf.toPandas(), expected_df) + assert_dataframe_equal(actual_sdf, expected_df) @parametrize( Case("positive")( @@ -644,7 +643,7 @@ def test_create_view_with_stability( .sum("X", 0, 3, name="sum") ) actual = session.evaluate(sum_query, privacy_budget) - assert_frame_equal_with_sort(actual.toPandas(), expected) + assert_dataframe_equal(actual, expected) @pytest.mark.parametrize( "starting_budget,partition_budget", @@ -818,16 +817,12 @@ def test_partition_and_create_correct_answer( ), inf_budget, ) - assert_frame_equal_with_sort( - answer_session2.toPandas(), pd.DataFrame({"count": [3]}) - ) + assert_dataframe_equal(answer_session2, pd.DataFrame({"count": [3]})) answer_session3 = session3.evaluate( QueryBuilder("private1").count(), inf_budget, ) - assert_frame_equal_with_sort( - answer_session3.toPandas(), pd.DataFrame({"count": [1]}) - ) + assert_dataframe_equal(answer_session3, pd.DataFrame({"count": [1]})) @pytest.mark.parametrize("output_measure", [(PureDP()), (ApproxDP()), (RhoZCDP())]) def test_partitions_composed( @@ -1193,9 +1188,9 @@ def test_create_view_composed_correct_answer( .groupby(KeySet.from_dict({})) .sum("i", low=0, high=3, mechanism=mechanism, name="sum") ) - answer = session.evaluate(sum_query, inf_budget).toPandas() + answer = session.evaluate(sum_query, inf_budget) expected = pd.DataFrame({"sum": [9]}) - assert_frame_equal_with_sort(answer, expected) + assert_dataframe_equal(answer, expected) def test_caching(self, spark): """Tests that caching works as expected.""" @@ -1262,7 +1257,7 @@ def test_filter_regression(self, spark) -> None: count_query = QueryBuilder("private").filter("B == 2").groupby(keyset).count() count_result = session.evaluate(count_query, budget_per_query) count_a_b = count_result.select("A", "B") - assert_frame_equal_with_sort(count_a_b.toPandas(), expected_a_b) + assert_dataframe_equal(count_a_b, expected_a_b) median_query = ( QueryBuilder("private") @@ -1272,7 +1267,7 @@ def test_filter_regression(self, spark) -> None: ) median_result = session.evaluate(median_query, budget_per_query) median_a_b = median_result.select("A", "B") - assert_frame_equal_with_sort(median_a_b.toPandas(), expected_a_b) + assert_dataframe_equal(median_a_b, expected_a_b) average_query = ( QueryBuilder("private") @@ -1282,7 +1277,7 @@ def test_filter_regression(self, spark) -> None: ) average_result = session.evaluate(average_query, budget_per_query) average_a_b = average_result.select("A", "B") - assert_frame_equal_with_sort(average_a_b.toPandas(), expected_a_b) + assert_dataframe_equal(average_a_b, expected_a_b) def 
test_grouping_noninteger_stability(self, spark) -> None: """Test that zCDP grouping_column and non-integer stabilities work.""" diff --git a/test/system/session/test_special_values.py b/test/system/session/test_special_values.py index 846af57f..d4e87e88 100644 --- a/test/system/session/test_special_values.py +++ b/test/system/session/test_special_values.py @@ -10,7 +10,7 @@ import pytest from numpy import sqrt from pyspark.sql import DataFrame -from tmlt.core.utils.testing import Case, parametrize +from tmlt.core.utils.testing import Case, assert_dataframe_equal, parametrize from tmlt.analytics import ( AddOneRow, @@ -28,8 +28,6 @@ ) from tmlt.analytics._schema import Schema, analytics_to_spark_schema -from ...conftest import assert_frame_equal_with_sort - @pytest.fixture(name="sdf_special_values", scope="module") def special_values_dataframe(spark): @@ -162,7 +160,7 @@ def test_default_behavior( ) result = sess.evaluate(query, inf_budget) print(expected_df) - assert_frame_equal_with_sort(result.toPandas(), expected_df) + assert_dataframe_equal(result, expected_df) @parametrize( @@ -246,7 +244,7 @@ def test_replace_null_and_nan( query = base_query.replace_null_and_nan(replace_with).average(column, low, high) result = sess.evaluate(query, inf_budget) expected_df = pd.DataFrame([[expected_average]], columns=[column + "_average"]) - assert_frame_equal_with_sort(result.toPandas(), expected_df) + assert_dataframe_equal(result, expected_df) @parametrize( @@ -351,7 +349,7 @@ def test_drop_null_and_nan( ) result = sess.evaluate(query, inf_budget) expected_df = pd.DataFrame([[expected_sum]], columns=[measure_column + "_sum"]) - assert_frame_equal_with_sort(result.toPandas(), expected_df) + assert_dataframe_equal(result, expected_df) @parametrize( @@ -421,7 +419,7 @@ def test_replace_infinity_average( query = base_query.replace_infinity(replace_with).average("float_infs", low, high) result = sess.evaluate(query, inf_budget) expected_df = pd.DataFrame([[expected_average]], columns=["float_infs_average"]) - assert_frame_equal_with_sort(result.toPandas(), expected_df) + assert_dataframe_equal(result, expected_df) @parametrize( @@ -469,14 +467,14 @@ def test_replace_infinity_other_aggregations( ) result_sum = sess.evaluate(query_sum, inf_budget) expected_df = pd.DataFrame([[expected_sum]], columns=["float_infs_sum"]) - assert_frame_equal_with_sort(result_sum.toPandas(), expected_df) + assert_dataframe_equal(result_sum, expected_df) query_stdev = ( QueryBuilder("private").replace_infinity(replace_with).stdev("float_infs", 0, 1) ) result_stdev = sess.evaluate(query_stdev, inf_budget) expected_df = pd.DataFrame([[expected_stdev]], columns=["float_infs_stdev"]) - assert_frame_equal_with_sort(result_stdev.toPandas(), expected_df) + assert_dataframe_equal(result_stdev, expected_df) query_variance = ( QueryBuilder("private") @@ -485,7 +483,7 @@ def test_replace_infinity_other_aggregations( ) result_variance = sess.evaluate(query_variance, inf_budget) expected_df = pd.DataFrame([[expected_variance]], columns=["float_infs_variance"]) - assert_frame_equal_with_sort(result_variance.toPandas(), expected_df) + assert_dataframe_equal(result_variance, expected_df) @parametrize( @@ -534,7 +532,7 @@ def test_drop_infinity( query = base_query.drop_infinity(columns).sum("float_infs", 0, 1) result = sess.evaluate(query, inf_budget) expected_df = pd.DataFrame([[expected_sum]], columns=["float_infs_sum"]) - assert_frame_equal_with_sort(result.toPandas(), expected_df) + assert_dataframe_equal(result, expected_df) 
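
The hunks that follow all convert tests with the same shape: build a session
over the special-values fixture, evaluate a single query with an infinite
budget so the answer is exact, and compare against a small pandas frame. A
rough sketch of that shape, assuming the sessions are created with
Session.from_dataframe and an infinite PureDP budget (the file's actual
fixtures and expected values may differ):

    inf_budget = PureDPBudget(float("inf"))
    sess = Session.from_dataframe(
        privacy_budget=inf_budget,
        source_id="private",
        dataframe=sdf_special_values,  # module-scoped fixture defined above
        protected_change=AddOneRow(),
    )
    # With an infinite budget no noise is added, so an exact comparison
    # against the expected pandas frame is safe.
    result = sess.evaluate(query, inf_budget)
    assert_dataframe_equal(result, expected_df)
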
@parametrize( @@ -591,7 +589,7 @@ def test_get_bounds( AddOneRow(), ) result = sess.evaluate(query, inf_budget) - assert_frame_equal_with_sort(result.toPandas(), expected_df) + assert_dataframe_equal(result, expected_df) @parametrize( @@ -649,7 +647,7 @@ def test_privacy_ids( AddRowsWithID("string_nulls"), ) result = sess.evaluate(query, inf_budget) - assert_frame_equal_with_sort(result.toPandas(), expected_df) + assert_dataframe_equal(result, expected_df) @pytest.fixture(name="sdf_for_joins", scope="module") @@ -833,7 +831,7 @@ def test_joins( .build() ) result = sess.evaluate(query, inf_budget) - assert_frame_equal_with_sort(result.toPandas(), expected_df) + assert_dataframe_equal(result, expected_df) @parametrize( diff --git a/test/unit/keysets/test_keyset.py b/test/unit/keysets/test_keyset.py index 2a716bf9..98f4f71d 100644 --- a/test/unit/keysets/test_keyset.py +++ b/test/unit/keysets/test_keyset.py @@ -22,12 +22,10 @@ StructField, StructType, ) -from tmlt.core.utils.testing import Case, parametrize +from tmlt.core.utils.testing import Case, assert_dataframe_equal, parametrize from tmlt.analytics import ColumnDescriptor, ColumnType, KeySet -from ...conftest import assert_frame_equal_with_sort - @pytest.mark.parametrize( "d,expected_df", @@ -91,7 +89,7 @@ def test_from_dict( ) -> None: """Test KeySet.from_dict works""" keyset = KeySet.from_dict(d) - assert_frame_equal_with_sort(keyset.dataframe().toPandas(), expected_df) + assert_dataframe_equal(keyset.dataframe(), expected_df) @pytest.mark.parametrize( @@ -217,7 +215,7 @@ def test_from_tuples( ): """KeySet.from_tuples works as expected""" keyset = KeySet.from_tuples(tuples, columns) - assert_frame_equal_with_sort(keyset.dataframe().toPandas(), expected_df) + assert_dataframe_equal(keyset.dataframe(), expected_df) @parametrize( @@ -299,7 +297,7 @@ def test_from_tuples_invalid_schema( def test_from_dataframe(spark, df_in: pd.DataFrame) -> None: """Test KeySet.from_dataframe works.""" keyset = KeySet.from_dataframe(spark.createDataFrame(df_in)) - assert_frame_equal_with_sort(keyset.dataframe().toPandas(), df_in) + assert_dataframe_equal(keyset.dataframe(), df_in) @pytest.mark.parametrize( @@ -315,7 +313,7 @@ def test_from_dataframe(spark, df_in: pd.DataFrame) -> None: def test_from_dataframe_nonunique(spark, df: pd.DataFrame, expected_df: pd.DataFrame): """Test KeySet.from_dataframe works on a dataframe with duplicate rows.""" keyset = KeySet.from_dataframe(spark.createDataFrame(df)) - assert_frame_equal_with_sort(keyset.dataframe().toPandas(), expected_df) + assert_dataframe_equal(keyset.dataframe(), expected_df) @pytest.mark.parametrize( @@ -345,7 +343,7 @@ def test_from_dataframe_with_null( ) -> None: """Test KeySet.from_dataframe allows nulls.""" keyset = KeySet.from_dataframe(spark.createDataFrame(df_in, schema=schema)) - assert_frame_equal_with_sort(keyset.dataframe().toPandas(), df_in) + assert_dataframe_equal(keyset.dataframe(), df_in) @pytest.mark.parametrize( @@ -398,7 +396,7 @@ def test_filter_str( """Test KeySet.filter works""" keyset = KeySet.from_dataframe(spark.createDataFrame(keyset_df)) filtered_keyset = keyset.filter(condition) - assert_frame_equal_with_sort(filtered_keyset.dataframe().toPandas(), expected_df) + assert_dataframe_equal(filtered_keyset.dataframe(), expected_df) @pytest.mark.parametrize( @@ -508,12 +506,10 @@ def test_filter_condition() -> None: expected = pd.DataFrame( [["abc", 100], ["def", 100], ["ghi", 100]], columns=["A", "B"] ) - assert_frame_equal_with_sort(filtered.dataframe().toPandas(), expected) 
+ assert_dataframe_equal(filtered.dataframe(), expected) filtered2 = keyset.filter(sf.col("A") != "string that is not there") - assert_frame_equal_with_sort( - filtered2.dataframe().toPandas(), keyset.dataframe().toPandas() - ) + assert_dataframe_equal(filtered2.dataframe(), keyset.dataframe()) # This test also uses a Column as a filter condition, and is not @@ -544,7 +540,7 @@ def test_getitem_single(col: str, expected_df: pd.DataFrame) -> None: """Test KeySet[col] returns a keyset for only the requested column.""" keyset = KeySet.from_dict({"A": ["a1", "a2"], "B": [0, 1, 2, 3]}) got = keyset[col] - assert_frame_equal_with_sort(got.dataframe().toPandas(), expected_df) + assert_dataframe_equal(got.dataframe(), expected_df) # This test is not parameterized because Python does not accept @@ -554,16 +550,14 @@ def test_getitem_multiple() -> None: keyset = KeySet.from_dict({"A": ["a1", "a2"], "B": ["b1"], "C": [0, 1]}) got_ab = keyset["A", "B"] expected_ab = pd.DataFrame([["a1", "b1"], ["a2", "b1"]], columns=["A", "B"]) - assert_frame_equal_with_sort(got_ab.dataframe().toPandas(), expected_ab) + assert_dataframe_equal(got_ab.dataframe(), expected_ab) got_bc = keyset["B", "C"] expected_bc = pd.DataFrame([["b1", 0], ["b1", 1]], columns=["B", "C"]) - assert_frame_equal_with_sort(got_bc.dataframe().toPandas(), expected_bc) + assert_dataframe_equal(got_bc.dataframe(), expected_bc) got_abc = keyset["A", "B", "C"] - assert_frame_equal_with_sort( - got_abc.dataframe().toPandas(), keyset.dataframe().toPandas() - ) + assert_dataframe_equal(got_abc.dataframe(), keyset.dataframe()) @pytest.mark.parametrize( @@ -577,7 +571,7 @@ def test_getitem_list(l: List[str], expected_df: pd.DataFrame) -> None: """Test KeySet[[col1, col2, ...]] returns a keyset for requested columns.""" keyset = KeySet.from_dict({"A": ["a1", "a2"], "B": ["b1"], "C": [0, 1]}) got = keyset[l] - assert_frame_equal_with_sort(got.dataframe().toPandas(), expected_df) + assert_dataframe_equal(got.dataframe(), expected_df) @pytest.mark.parametrize( @@ -600,8 +594,8 @@ def test_getitem_list_noncartesian( ) -> None: """Test that indexing multiple columns works on non-Cartesian KeySets.""" keyset = KeySet.from_dataframe(spark.createDataFrame(keys_df)) - actual_df = keyset[columns].dataframe().toPandas() - assert_frame_equal_with_sort(actual_df, expected_df) + actual_df = keyset[columns].dataframe() + assert_dataframe_equal(actual_df, expected_df) @pytest.mark.parametrize( @@ -678,8 +672,8 @@ def test_crossproduct(other: KeySet, expected_df: pd.DataFrame) -> None: keyset = KeySet.from_dict({"A": ["a1", "a2"], "B": [0, 1]}) product_left = keyset * other product_right = other * keyset - assert_frame_equal_with_sort(product_left.dataframe().toPandas(), expected_df) - assert_frame_equal_with_sort(product_right.dataframe().toPandas(), expected_df) + assert_dataframe_equal(product_left.dataframe(), expected_df) + assert_dataframe_equal(product_right.dataframe(), expected_df) @pytest.mark.parametrize( diff --git a/test/unit/keysets/test_product_keyset.py b/test/unit/keysets/test_product_keyset.py index 8916e6e9..6b323e16 100644 --- a/test/unit/keysets/test_product_keyset.py +++ b/test/unit/keysets/test_product_keyset.py @@ -12,11 +12,10 @@ import pytest from pyspark.sql import SparkSession from pyspark.sql.functions import lit +from tmlt.core.utils.testing import assert_dataframe_equal from tmlt.analytics import ColumnDescriptor, ColumnType, KeySet -from ...conftest import assert_frame_equal_with_sort - # pylint: disable=unused-argument @@ -31,7 +30,7 @@ 
def test_init_with_product_keyset( got = ks1 * product_2_and_3 expected_df = ks1.dataframe().crossJoin(ks2.dataframe()).crossJoin(ks3.dataframe()) - assert_frame_equal_with_sort(got.dataframe().toPandas(), expected_df.toPandas()) + assert_dataframe_equal(got.dataframe(), expected_df) def test_init_fails_with_duplicate_columns(spark: SparkSession) -> None: @@ -122,7 +121,7 @@ def test_getitem_single_column( """Test filtering with __getitem__.""" product = reduce(lambda x, y: x * y, factors) filtered_product = product[select_col] - assert_frame_equal_with_sort(filtered_product.dataframe().toPandas(), expect_df) + assert_dataframe_equal(filtered_product.dataframe(), expect_df) @pytest.mark.parametrize( @@ -172,7 +171,7 @@ def test_getitem_multiple_columns_as_tuple( product = keyset_a * keyset_b * keyset_c filtered = product[select_cols] - assert_frame_equal_with_sort(filtered.dataframe().toPandas(), expect_df) + assert_dataframe_equal(filtered.dataframe(), expect_df) def test_getitem_from_subset_of_columns(spark: SparkSession) -> None: @@ -193,7 +192,7 @@ def test_getitem_from_subset_of_columns(spark: SparkSession) -> None: product = keyset1 * keyset2 filtered = product["A"] expect_df = pd.DataFrame({"A": [1, 2, 3]}) - assert_frame_equal_with_sort(filtered.dataframe().toPandas(), expect_df) + assert_dataframe_equal(filtered.dataframe(), expect_df) filtered_ab = product["A", "B"] expect_df_ab = pd.DataFrame( @@ -204,7 +203,7 @@ def test_getitem_from_subset_of_columns(spark: SparkSession) -> None: ], columns=["A", "B"], ) - assert_frame_equal_with_sort(filtered_ab.dataframe().toPandas(), expect_df_ab) + assert_dataframe_equal(filtered_ab.dataframe(), expect_df_ab) def test_getitem_errors_with_duplicate_columns( @@ -248,7 +247,7 @@ def test_dataframe( factors = [KeySet.from_dataframe(spark.createDataFrame(df)) for df in factor_dfs] product = reduce(lambda x, y: x * y, factors) got = product.dataframe() - assert_frame_equal_with_sort(got.toPandas(), expected_df) + assert_dataframe_equal(got, expected_df) def test_complex_filter(spark: SparkSession) -> None: @@ -272,7 +271,7 @@ def test_complex_filter(spark: SparkSession) -> None: ], columns=["A", "B"], ) - assert_frame_equal_with_sort(filtered.dataframe().toPandas(), expect_df) + assert_dataframe_equal(filtered.dataframe(), expect_df) @pytest.mark.parametrize( @@ -325,7 +324,7 @@ def test_getitem_ordering(spark: SparkSession, column_ordering: Sequence[str]) - selected_keyset = product[column_ordering] assert list(column_ordering) == selected_keyset.columns() got_df = selected_keyset.dataframe() - assert_frame_equal_with_sort(got_df.toPandas(), expect_df) + assert_dataframe_equal(got_df, expect_df) assert list(column_ordering) == got_df.columns diff --git a/test/unit/query_expr_compiler/test_measurement_visitor.py b/test/unit/query_expr_compiler/test_measurement_visitor.py index 7be38fa6..126edeb1 100644 --- a/test/unit/query_expr_compiler/test_measurement_visitor.py +++ b/test/unit/query_expr_compiler/test_measurement_visitor.py @@ -1,5 +1,5 @@ """Tests for MeasurementVisitor.""" -from test.conftest import assert_frame_equal_with_sort, create_empty_input +from test.conftest import create_empty_input from typing import List, Union from unittest.mock import patch @@ -27,6 +27,7 @@ from tmlt.core.transformations.base import Transformation from tmlt.core.transformations.chaining import ChainTT from tmlt.core.utils.exact_number import ExactNumber +from tmlt.core.utils.testing import assert_dataframe_equal from tmlt.analytics import ( KeySet, @@ 
-1332,4 +1333,4 @@ def test_suppress_aggregates_correctness( self.visitor.adjusted_budget = budget measurement, _ = query.accept(self.visitor) got = measurement(input_data) - assert_frame_equal_with_sort(got.toPandas(), expected_result) + assert_dataframe_equal(got, expected_result) diff --git a/test/unit/query_expr_compiler/transformation_visitor/conftest.py b/test/unit/query_expr_compiler/transformation_visitor/conftest.py index a2c8f1ce..c0546fc5 100644 --- a/test/unit/query_expr_compiler/transformation_visitor/conftest.py +++ b/test/unit/query_expr_compiler/transformation_visitor/conftest.py @@ -33,6 +33,7 @@ from tmlt.core.metrics import AddRemoveKeys, DictMetric, SymmetricDifference from tmlt.core.transformations.base import Transformation from tmlt.core.transformations.chaining import ChainTT +from tmlt.core.utils.testing import assert_dataframe_equal from tmlt.analytics._catalog import Catalog from tmlt.analytics._query_expr_compiler._transformation_visitor import ( @@ -43,8 +44,6 @@ from tmlt.analytics._table_reference import TableReference from tmlt.analytics._transformation_utils import get_table_from_ref -from ....conftest import assert_frame_equal_with_sort - # Example date and timestamp DATE1 = datetime.date.fromisoformat("2022-01-01") TIMESTAMP1 = datetime.datetime.fromisoformat("2022-01-01T12:30:00") @@ -401,7 +400,7 @@ def _validate_result( assert isinstance(t.output_domain, DictDomain) assert isinstance(t.output_metric, (DictMetric, AddRemoveKeys)) result_df = self._get_result(t, ref) - assert_frame_equal_with_sort(result_df, transformed_df) + assert_dataframe_equal(result_df, transformed_df) @pytest.fixture(scope="class") diff --git a/test/unit/test_query_builder.py b/test/unit/test_query_builder.py index 8abeabfc..ae73adfa 100644 --- a/test/unit/test_query_builder.py +++ b/test/unit/test_query_builder.py @@ -15,6 +15,7 @@ import pandas as pd import pytest from pyspark.sql import DataFrame, SparkSession +from tmlt.core.utils.testing import assert_dataframe_equal from tmlt.analytics import ( BinningSpec, @@ -57,8 +58,6 @@ ) from tmlt.analytics._schema import FrozenDict, Schema -from ..conftest import assert_frame_equal_with_sort - PRIVATE_ID = "private" Row = Dict[str, Any] @@ -128,9 +127,7 @@ def test_join_public_dataframe(spark, join_columns: Optional[List[str]]): join_expr = query_expr.child assert isinstance(join_expr, JoinPublic) assert isinstance(join_expr.public_table, DataFrame) - assert_frame_equal_with_sort( - join_expr.public_table.toPandas(), join_table.toPandas() - ) + assert_dataframe_equal(join_expr.public_table, join_table) root_expr = join_expr.child assert isinstance(root_expr, PrivateSource) diff --git a/test/unit/test_query_expr_compiler.py b/test/unit/test_query_expr_compiler.py index 839bee34..9674dbf0 100644 --- a/test/unit/test_query_expr_compiler.py +++ b/test/unit/test_query_expr_compiler.py @@ -32,6 +32,7 @@ ) from tmlt.core.measures import PureDP, RhoZCDP from tmlt.core.metrics import DictMetric, SymmetricDifference +from tmlt.core.utils.testing import assert_dataframe_equal from tmlt.analytics import KeySet, PureDPBudget, RhoZCDPBudget, TruncationStrategy from tmlt.analytics._catalog import Catalog @@ -70,8 +71,6 @@ from tmlt.analytics._table_identifier import NamedTable from tmlt.analytics._transformation_utils import get_table_from_ref -from ..conftest import assert_frame_equal_with_sort - GROUPBY_TWO_COLUMNS = pd.DataFrame([["0", 0], ["0", 1], ["1", 1]], columns=["A", "B"]) GROUPBY_TWO_SCHEMA = StructType( [StructField("A", StringType(), 
False), StructField("B", LongType(), False)] @@ -651,7 +650,7 @@ def test_count_distinct(self, spark, query_expr: QueryExpr, expected: pd.DataFra table_constraints={t: [] for t in self.stability.keys()}, ) actual = measurement({NamedTable("private"): count_distinct_df}) - assert_frame_equal_with_sort(actual.toPandas(), expected) + assert_dataframe_equal(actual, expected) @pytest.mark.parametrize("query_expr,expected", QUERY_EXPR_COMPILER_TESTS) def test_queries(self, query_expr: QueryExpr, expected: pd.DataFrame): @@ -677,7 +676,7 @@ def test_queries(self, query_expr: QueryExpr, expected: pd.DataFrame): table_constraints={t: [] for t in self.stability.keys()}, ) actual = measurement({NamedTable("private"): self.sdf}) - assert_frame_equal_with_sort(actual.toPandas(), expected) + assert_dataframe_equal(actual, expected) @pytest.mark.parametrize( "query,output_measure,expected", @@ -1002,7 +1001,7 @@ def test_noise_param_combinations( table_constraints={t: [] for t in self.stability.keys()}, ) actual = measurement({NamedTable("private"): self.sdf}) - assert_frame_equal_with_sort(actual.toPandas(), expected) + assert_dataframe_equal(actual, expected) def test_join_public_dataframe(self, spark): """Public join works with public tables given as Spark dataframes.""" @@ -1026,8 +1025,8 @@ def test_join_public_dataframe(self, spark): source_dict = {NamedTable("private"): self.sdf} output_sdf = get_table_from_ref(transformation, reference)(source_dict) - assert_frame_equal_with_sort( - output_sdf.toPandas(), + assert_dataframe_equal( + output_sdf, pd.DataFrame( [ ("0", 0, 0.0, 0.1), @@ -1062,8 +1061,8 @@ def test_join_private(self, spark): source_dict = {NamedTable("private"): self.sdf, NamedTable("private_2"): sdf_2} output_sdf = get_table_from_ref(transformation, reference)(source_dict) - assert_frame_equal_with_sort( - output_sdf.toPandas(), + assert_dataframe_equal( + output_sdf, pd.DataFrame( [ ["0", 0, 0.0, 0], @@ -1249,7 +1248,7 @@ def test_float_groupby_sum(self, spark): ) actual = measurement({NamedTable("private"): sdf_float}) expected = pd.DataFrame({"A": ["0", "1"], "sum": [2.0, 1.0]}) - assert_frame_equal_with_sort(actual.toPandas(), expected) + assert_dataframe_equal(actual, expected) @pytest.mark.parametrize( "query_expr", diff --git a/test/unit/test_query_expression.py b/test/unit/test_query_expression.py index 5c68ab39..20c7dbf1 100644 --- a/test/unit/test_query_expression.py +++ b/test/unit/test_query_expression.py @@ -13,6 +13,7 @@ import pytest from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import BinaryType, StructField, StructType +from tmlt.core.utils.testing import assert_dataframe_equal from typeguard import TypeCheckError from tmlt.analytics import ( @@ -50,7 +51,6 @@ GROUPBY_AGGREGATION_QUERIES, NON_GROUPBY_AGGREGATION_QUERIES, SIMPLE_TRANSFORMATION_QUERIES, - assert_frame_equal_with_sort, ) """Tests for invalid attributes on dataclasses.""" @@ -542,7 +542,7 @@ def test_join_public_string_nan(spark): df = spark.createDataFrame(pd.DataFrame({"col": ["nan", "NaN", "NAN", "Nan"]})) query_expr = JoinPublic(PrivateSource("a"), df) assert isinstance(query_expr.public_table, DataFrame) - assert_frame_equal_with_sort(query_expr.public_table.toPandas(), df.toPandas()) + assert_dataframe_equal(query_expr.public_table, df) def test_join_public_dataframe_validation_column_type(spark): diff --git a/test/unit/test_session.py b/test/unit/test_session.py index f1333583..393443d7 100644 --- a/test/unit/test_session.py +++ b/test/unit/test_session.py @@ -50,6 +50,7 
@@ from tmlt.core.transformations.chaining import ChainTT from tmlt.core.transformations.spark_transformations.partition import PartitionByKeys from tmlt.core.utils.exact_number import ExactNumber +from tmlt.core.utils.testing import assert_dataframe_equal from typeguard import TypeCheckError from tmlt.analytics import ( @@ -89,8 +90,6 @@ from tmlt.analytics._table_identifier import NamedTable, TableCollection from tmlt.analytics.config import config -from ..conftest import assert_frame_equal_with_sort - # Disable redefined-outer-name because spark is used to create dataframes as test # inputs and within tests to check outputs and run queries. # pylint: disable=redefined-outer-name @@ -312,11 +311,11 @@ def test_from_dataframe( output_measure=expected_output_measure, ) mock_composition_init.return_value.assert_called() - assert_frame_equal_with_sort( + assert_dataframe_equal( mock_composition_init.return_value.mock_calls[0][1][0][ NamedTable("private") - ].toPandas(), - self.sdf.toPandas(), + ], + self.sdf, ) mock_session_init.assert_called_with( self=ANY, accountant=ANY, public_sources={} @@ -388,11 +387,11 @@ def test_from_dataframe_add_remove_keys( output_measure=expected_output_measure, ) mock_composition_init.return_value.assert_called() - assert_frame_equal_with_sort( + assert_dataframe_equal( mock_composition_init.return_value.mock_calls[0][1][0][ TableCollection("default_id_space") - ][NamedTable("private")].toPandas(), - self.sdf.toPandas(), + ][NamedTable("private")], + self.sdf, ) mock_session_init.assert_called_with( self=ANY, accountant=ANY, public_sources={} @@ -574,9 +573,9 @@ def test_add_public_dataframe(self): session = Session(accountant=mock_accountant, public_sources={}) session.add_public_dataframe(source_id="public", dataframe=self.join_df) assert "public" in session.public_source_dataframes - assert_frame_equal_with_sort( - session.public_source_dataframes["public"].toPandas(), - self.join_df.toPandas(), + assert_dataframe_equal( + session.public_source_dataframes["public"], + self.join_df, ) expected_schema = self.join_df.schema actual_schema = session.public_source_dataframes["public"].schema @@ -973,10 +972,10 @@ def test_get_groups_with_various_protected_change( actual_sdf = session.evaluate(query, session.remaining_privacy_budget) try: - assert_frame_equal_with_sort(actual_sdf.toPandas(), expected_df) + assert_dataframe_equal(actual_sdf, expected_df) except AssertionError: # Deals with the case where the DFs mismatched due to noise. 
-            assert_frame_equal_with_sort(actual_sdf.toPandas(), possible_df)
+            assert_dataframe_equal(actual_sdf, possible_df)
 
     @pytest.mark.parametrize(
         "protected_change",
@@ -1040,7 +1039,7 @@ def test_get_groups_with_add_rows_with_id(self, spark):
         query = QueryBuilder("private").enforce(MaxRowsPerID(1)).get_groups(["count"])
         expected_df = pd.DataFrame({"count": [0]})
         actual_sdf = session.evaluate(query, session.remaining_privacy_budget)
-        assert_frame_equal_with_sort(actual_sdf.toPandas(), expected_df)
+        assert_dataframe_equal(actual_sdf, expected_df)
 
     @pytest.mark.parametrize("columns", [(["B"]), (["count", "B"])])
     def test_get_groups_on_id_column(self, spark, columns: List[str]):
@@ -2321,9 +2320,9 @@ def test_build_works_correctly(
         for table_id, private_source in expected_private_sources.items():
             assert accountant._queryable is not None
             assert isinstance(accountant._queryable, SequentialQueryable)
-            assert_frame_equal_with_sort(
-                accountant._queryable._data[table_id].toPandas(),
-                private_source.toPandas(),
+            assert_dataframe_equal(
+                accountant._queryable._data[table_id],
+                private_source,
             )
 
         assert accountant.d_in == expected_stabilities
@@ -2331,9 +2330,7 @@ def test_build_works_correctly(
         public_sources = session._public_sources
         assert public_sources.keys() == expected_public_sources.keys()
         for key in public_sources:
-            assert_frame_equal_with_sort(
-                public_sources[key].toPandas(), expected_public_sources[key].toPandas()
-            )
+            assert_dataframe_equal(public_sources[key], expected_public_sources[key])
 
         assert session._output_measure == expected_output_measure
 
     @pytest.mark.parametrize("nullable", [(True), (False)])
@@ -2466,7 +2463,7 @@ def test_automatic_partitions(
         end_pd_df = end_df.toPandas()
 
         if isinstance(expected_df, pd.DataFrame):
-            assert_frame_equal_with_sort(end_pd_df, expected_df)
+            assert_dataframe_equal(end_pd_df, expected_df)
         # Else the expected_df is a range of values for a quantile query.
         else:
             for pair, values in expected_df.items():
@@ -2525,7 +2522,7 @@ def test_automatic_partitions_with_ids(
         end_pd_df = end_df.toPandas()
 
         if isinstance(expected_df, pd.DataFrame):
-            assert_frame_equal_with_sort(end_pd_df, expected_df)
+            assert_dataframe_equal(end_pd_df, expected_df)
         # Else the expected_df is a range of values for a quantile query.
         else:
             for pair, values in expected_df.items():
From 589df249046635ce400e376061fc1b2c061d8b5e Mon Sep 17 00:00:00 2001
From: Tom Magerlein
Date: Fri, 14 Nov 2025 19:48:14 -0500
Subject: [PATCH 25/25] Replace pylint, pydocstyle with ruff check (#116)

* Replace pylint, pydocstyle with ruff check

Replaces the pylint and pydocstyle linters with `ruff check`, and
configures Ruff's lint rules. All lints that cannot be auto-fixed or
trivially fixed by hand are temporarily disabled to keep the diff
smaller.
* Disable RUF010 and revert changes it introduced * Remove unneeded whitespace where pylint control comments were removed * Update pyspark intersphinx location to match Core * Fix doctests, pull in changes from opendp/tumult-core#47 --- doc/conf.py | 4 +- doc/deployment/spark.rst | 9 +- noxfile.py | 3 +- pyproject.toml | 126 ++++++++---------- src/tmlt/analytics/__init__.py | 2 - src/tmlt/analytics/_neighboring_relation.py | 16 +-- src/tmlt/analytics/_noise_info.py | 2 +- src/tmlt/analytics/_query_expr.py | 20 +-- .../_base_measurement_visitor.py | 53 ++++---- .../_base_transformation_visitor.py | 5 +- .../_measurement_visitor.py | 4 +- src/tmlt/analytics/_transformation_utils.py | 2 - src/tmlt/analytics/binning_spec.py | 25 ++-- src/tmlt/analytics/config.py | 2 +- src/tmlt/analytics/keyset/_keyset.py | 17 +-- src/tmlt/analytics/privacy_budget.py | 5 +- src/tmlt/analytics/query_builder.py | 24 ++-- src/tmlt/analytics/session.py | 33 ++--- src/tmlt/analytics/utils.py | 10 +- test/conftest.py | 2 +- .../ids/test_constraint_propagation.py | 2 - .../ids/test_count_distinct_optimization.py | 4 - .../session/ids/test_id_col_operations.py | 3 +- .../session/ids/test_l0_linf_truncation.py | 10 -- test/system/session/ids/test_l1_truncation.py | 4 - test/system/session/ids/test_partition.py | 6 +- .../session/mixed/test_mixed_session.py | 9 +- test/system/session/rows/test_add_max_rows.py | 22 +-- .../rows/test_add_max_rows_in_max_groups.py | 6 +- test/system/session/rows/test_invalid.py | 4 +- test/unit/keysets/test_cross_join.py | 2 - test/unit/keysets/test_decomposition.py | 1 - test/unit/keysets/test_detect.py | 6 +- test/unit/keysets/test_equivalence.py | 5 +- test/unit/keysets/test_filter.py | 2 - test/unit/keysets/test_join.py | 2 - test/unit/keysets/test_keyset.py | 2 +- test/unit/keysets/test_product_keyset.py | 4 +- test/unit/keysets/test_project.py | 4 +- test/unit/keysets/test_rewrite.py | 1 - test/unit/keysets/test_subtract.py | 2 - .../test_measurement_visitor.py | 4 - .../test_constraints.py | 4 +- test/unit/test_binning_spec.py | 2 - test/unit/test_config.py | 9 +- test/unit/test_privacy_budget.py | 8 -- .../test_privacy_budget_rounding_helper.py | 3 - test/unit/test_query_builder.py | 10 -- test/unit/test_query_expr_compiler.py | 3 +- test/unit/test_query_expression.py | 5 +- test/unit/test_schema.py | 4 +- test/unit/test_session.py | 38 +++--- test/unit/test_table_identifiers.py | 2 +- test/unit/test_utils.py | 2 +- uv.lock | 94 +++++-------- 55 files changed, 232 insertions(+), 421 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 0c9a899e..72bf3599 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,5 +1,3 @@ -# pylint: skip-file - # SPDX-License-Identifier: Apache-2.0 # Copyright Tumult Labs 2025 @@ -178,7 +176,7 @@ "numpy": ("https://numpy.org/doc/1.18/", None), "pandas": ("https://pandas.pydata.org/pandas-docs/version/1.2.0/", None), "sympy": ("https://docs.sympy.org/latest/", None), - "pyspark": ("https://archive.apache.org/dist/spark/docs/3.1.1/api/python/", None), + "pyspark": ("https://downloads.apache.org/spark/docs/3.5.7/api/python/", None), } # Substitutions diff --git a/doc/deployment/spark.rst b/doc/deployment/spark.rst index 24953460..fb3b70f9 100644 --- a/doc/deployment/spark.rst +++ b/doc/deployment/spark.rst @@ -44,12 +44,9 @@ database, you should use the following options when creating a Spark session: .enableHiveSupport() .getOrCreate() -To see where Hive's warehouse directory is, you can use the -`Hive CLI `_ -(or its replacement, -`Beehive `_) -to view 
the -`relevant configuration parameter `_: +To see where Hive's warehouse directory is, you can use the `Hive CLI `_ +(or its replacement, `Beeline `_) +to view the `relevant configuration parameter `_: .. code-block:: diff --git a/noxfile.py b/noxfile.py index 7da191f2..86de375d 100644 --- a/noxfile.py +++ b/noxfile.py @@ -143,9 +143,8 @@ def is_mac(): sm.black() sm.isort() +sm.ruff_check() sm.mypy() -sm.pylint() -sm.pydocstyle() sm.smoketest() sm.release_smoketest() diff --git a/pyproject.toml b/pyproject.toml index 7322cfed..ac527435 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,11 +56,10 @@ required-version = ">=0.7.0" default-groups = "all" [dependency-groups] +ruff = ["ruff >=0.14.3,<1"] black = ["black >=23.3,<24"] isort = ["isort >=5.11,<6"] mypy = ["mypy >=1.14.0"] -pylint = ["pylint >=3.2.5"] -pydocstyle = ["pydocstyle[toml] >=6.3"] test = [ "pytest", "pytest-cov >=5.0,<6", @@ -135,6 +134,60 @@ packages = ["src/tmlt"] ################################################################################ # Linter configuration +[tool.ruff.lint] +# A list of all of Ruff's rules can be found at https://docs.astral.sh/ruff/rules/ +select = [ + # Enable Ruff-specific lints plus Pylint, pydocstyle, pyflakes, and pycodestyle. + # The latter two cover many lints that we previously used pylint for, but + # because they are overlapping Ruff only implements them in one set of rules. + "RUF", "PL", "D", "F", "E", "W", + # Also enable a subset of flake8 rules, for similar reasons to pyflakes/pycodestyle. + "ISC", "SLF" +] +ignore = [ + # too-many-*: These rules are too context-dependent to be generally useful, + # we can evaluate this during code reviews. + "PLR09", + # magic-value-comparison: This rule flags a lot of constants that don't + # really make sense, we can make this call during code reviews. + "PLR2004", + # explicit-f-string-type-conversion: we don't generally use the !r syntax in + # f-strings, which this rule enforces. + "RUF010", + + # TODO: This disables every lint that is currently failing; go through and + # either fix/individually disable each instance, or choose to permanently + # ignore each one. + "PLW1641", # eq-without-hash + "PLC0206", # dict-index-missing-items + "RUF005", # collection-literal-concatenation + "RUF015", # unnecessary-iterable-allocation-for-first-element + "D415", # missing-terminal-punctuation + "RUF043", # pytest-raises-ambiguous-pattern + "D205", # missing-blank-line-after-summary + "D210", # surrounding-whitespace + "D102", # undocumented-public-method + "E501", # line-too-long + "E731", # lambda-assignment + "E741", # ambiguous-variable-name + "SLF001", # private-member-access + "RET504", # unnecessary-assign + "F401", # unused-import + "RUF009", # function-call-in-dataclass-default-argument + "E721", # type-comparison + "D103", # undocumented-public-function + "PLR0124", # comparison-with-itself +] + +# Ruff's RUF001-003 rules disallow certain Unicode characters that are easily +# confused with ASCII characters; this makes sense for the most part, but some +# of our docstrings use Greek letters that fall into that category. This allows +# those characters. 
+allowed-confusables = ['α', 'ρ', '𝝆'] + +[tool.ruff.lint.pydocstyle] +convention = "google" + [tool.black] force-exclude = "noxfile.py" @@ -162,70 +215,8 @@ module = "test.*" disallow_untyped_defs = false check_untyped_defs = true -[tool.pylint.master] -# See https://github.com/PyCQA/pylint/issues/1975#issuecomment-387924981 -extension-pkg-whitelist = ['numpy'] -load-plugins = ['pylint.extensions.docparams'] -# Only check param docs in docstrings that contain an Args: section. -# Set to "no" to show docstrings missing argument documentation. -accept-no-param-doc = true - -[tool.pylint.'MESSAGES CONTROL'] -enable = [ - # Note: there is a false positive on 'useless-suppression' when you - # use 'disable=line-too-long' at the end of a docstring. - # See: https://github.com/pylint-dev/pylint/issues/8301 - 'useless-suppression' -] -# By default, informational rules like useless-suppression don't cause PyLint to -# produce an error. -fail-on = ['useless-suppression'] -disable = [ - 'arguments-differ', - 'duplicate-code', - 'fixme', - 'invalid-name', - 'logging-format-interpolation', - 'logging-fstring-interpolation', - 'missing-function-docstring', # Redundant with pydocstyle - 'missing-raises-doc', - 'missing-return-doc', - 'no-else-return', - 'super-init-not-called', - 'too-few-public-methods', - 'too-many-ancestors', - 'too-many-arguments', - 'too-many-branches', - 'too-many-instance-attributes', - 'too-many-lines', - 'too-many-locals', - 'too-many-positional-arguments', - 'too-many-public-methods', - 'too-many-return-statements', - 'too-many-statements', - 'unbalanced-tuple-unpacking', - 'unnecessary-lambda-assignment', - 'unsubscriptable-object', - 'use-dict-literal', - # There are a lot of false positives for unsupported-binary-operation - # on Python 3.9: https://github.com/pylint-dev/pylint/issues/7381 - 'unsupported-binary-operation', - # black and isort group tmlt.core separately from tmlt.analytics, - # but pylint thinks they should both be grouped as 'tmlt'. - 'ungrouped-imports', - 'wrong-import-order', -] - -[tool.pylint.FORMAT] -max-line-length = 88 - -[tool.pydocstyle] -convention = "google" -add-ignore = [ - # `D200: One-line docstring should fit on one line with quotes` - # conflicts with pylint's `max-line-length`. - "D200", -] +################################################################################ +# Test configuration [tool.pytest.ini_options] markers = [ @@ -236,8 +227,5 @@ markers = [ # more information and a better future fix. 
addopts = ["--import-mode=importlib"] -################################################################################ -# Test configuration - [tool.coverage.run] relative_files = true diff --git a/src/tmlt/analytics/__init__.py b/src/tmlt/analytics/__init__.py index 7e86f4d4..eea8640a 100644 --- a/src/tmlt/analytics/__init__.py +++ b/src/tmlt/analytics/__init__.py @@ -38,8 +38,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright Tumult Labs 2025 -from typing import List - from tmlt.analytics._utils import AnalyticsInternalError from tmlt.analytics.binning_spec import BinningSpec, BinT from tmlt.analytics.config import Config, FeatureFlag diff --git a/src/tmlt/analytics/_neighboring_relation.py b/src/tmlt/analytics/_neighboring_relation.py index 6e28b7e3..29b1b65f 100644 --- a/src/tmlt/analytics/_neighboring_relation.py +++ b/src/tmlt/analytics/_neighboring_relation.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright Tumult Labs 2025 -# pylint: disable=protected-access from abc import ABC, abstractmethod from dataclasses import dataclass, field @@ -284,14 +283,13 @@ def _validate(self, dfs: Dict[str, DataFrame]) -> List[str]: f" has type {df_field.dataType}." ) key_type = df_field.dataType - else: - if not df_field.dataType == key_type: - raise ValueError( - f"Key column '{key_column}' has type " - f"{df_field.dataType}, but in another" - f" table it has type {key_type}. Key types" - " must match across tables" - ) + elif not df_field.dataType == key_type: + raise ValueError( + f"Key column '{key_column}' has type " + f"{df_field.dataType}, but in another" + f" table it has type {key_type}. Key types" + " must match across tables" + ) return list(self.table_to_key_column.keys()) diff --git a/src/tmlt/analytics/_noise_info.py b/src/tmlt/analytics/_noise_info.py index 2c86cb5c..cafd03a5 100644 --- a/src/tmlt/analytics/_noise_info.py +++ b/src/tmlt/analytics/_noise_info.py @@ -150,7 +150,7 @@ def _inverse_cdf(noise_info: Dict[str, Any], p: float) -> float: @singledispatch def _noise_from_info( - info: Any, # pylint: disable=unused-argument + info: Any, ) -> List[Dict[str, Any]]: """Get noise information from info (for a measurement). 
diff --git a/src/tmlt/analytics/_query_expr.py b/src/tmlt/analytics/_query_expr.py index f5a4213e..4dcec824 100644 --- a/src/tmlt/analytics/_query_expr.py +++ b/src/tmlt/analytics/_query_expr.py @@ -1147,14 +1147,14 @@ def schema(self, catalog: Catalog) -> Schema: name for name, cd in input_schema.column_descs.items() if (cd.allow_null or cd.allow_nan) - and not (name in [input_schema.grouping_column, input_schema.id_column]) + and name not in [input_schema.grouping_column, input_schema.id_column] ] return Schema( { name: ColumnDescriptor( column_type=cd.column_type, - allow_null=(cd.allow_null and not name in columns_to_change), - allow_nan=(cd.allow_nan and not name in columns_to_change), + allow_null=(cd.allow_null and name not in columns_to_change), + allow_nan=(cd.allow_nan and name not in columns_to_change), allow_inf=cd.allow_inf, ) for name, cd in input_schema.column_descs.items() @@ -1239,7 +1239,7 @@ def schema(self, catalog: Catalog) -> Schema: for name, cd in input_schema.column_descs.items() if cd.column_type == ColumnType.DECIMAL and cd.allow_inf - and not (name in [input_schema.grouping_column, input_schema.id_column]) + and name not in [input_schema.grouping_column, input_schema.id_column] ] return Schema( { @@ -1247,7 +1247,7 @@ def schema(self, catalog: Catalog) -> Schema: column_type=cd.column_type, allow_null=cd.allow_null, allow_nan=cd.allow_nan, - allow_inf=(cd.allow_inf and not name in columns_to_change), + allow_inf=(cd.allow_inf and name not in columns_to_change), ) for name, cd in input_schema.column_descs.items() }, @@ -1322,15 +1322,15 @@ def schema(self, catalog: Catalog) -> Schema: name for name, cd in input_schema.column_descs.items() if (cd.allow_null or cd.allow_nan) - and not name in [input_schema.grouping_column, input_schema.id_column] + and name not in [input_schema.grouping_column, input_schema.id_column] ) return Schema( { name: ColumnDescriptor( column_type=cd.column_type, - allow_null=(cd.allow_null and not name in columns), - allow_nan=(cd.allow_nan and not name in columns), + allow_null=(cd.allow_null and name not in columns), + allow_nan=(cd.allow_nan and name not in columns), allow_inf=(cd.allow_inf), ) for name, cd in input_schema.column_descs.items() @@ -1403,7 +1403,7 @@ def schema(self, catalog: Catalog) -> Schema: for name, cd in input_schema.column_descs.items() if cd.column_type == ColumnType.DECIMAL and cd.allow_inf - and not name in (input_schema.grouping_column, input_schema.id_column) + and name not in (input_schema.grouping_column, input_schema.id_column) ) return Schema( @@ -1412,7 +1412,7 @@ def schema(self, catalog: Catalog) -> Schema: column_type=cd.column_type, allow_null=cd.allow_null, allow_nan=cd.allow_nan, - allow_inf=(cd.allow_inf and not name in columns), + allow_inf=(cd.allow_inf and name not in columns), ) for name, cd in input_schema.column_descs.items() }, diff --git a/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py b/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py index 554c1e26..9e0c90e2 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py +++ b/src/tmlt/analytics/_query_expr_compiler/_base_measurement_visitor.py @@ -645,34 +645,33 @@ def _validate_approxDP_and_adjust_budget( f"The budget provided was {self.budget}." 
) return + elif mechanism in ( + AverageMechanism.LAPLACE, + CountDistinctMechanism.LAPLACE, + CountMechanism.LAPLACE, + StdevMechanism.LAPLACE, + SumMechanism.LAPLACE, + VarianceMechanism.LAPLACE, + ): + warnings.warn( + "When using LAPLACE with an ApproxDPBudget, the delta value of " + "the budget will be replaced with zero." + ) + self.adjusted_budget = ApproxDPBudget(epsilon, 0) + elif mechanism in ( + AverageMechanism.DEFAULT, + CountDistinctMechanism.DEFAULT, + CountMechanism.DEFAULT, + StdevMechanism.DEFAULT, + SumMechanism.DEFAULT, + VarianceMechanism.DEFAULT, + ): + self.adjusted_budget = ApproxDPBudget(epsilon, 0) + elif mechanism is None: + # Quantile has no mechanism + self.adjusted_budget = ApproxDPBudget(epsilon, 0) else: - if mechanism in ( - AverageMechanism.LAPLACE, - CountDistinctMechanism.LAPLACE, - CountMechanism.LAPLACE, - StdevMechanism.LAPLACE, - SumMechanism.LAPLACE, - VarianceMechanism.LAPLACE, - ): - warnings.warn( - "When using LAPLACE with an ApproxDPBudget, the delta value of " - "the budget will be replaced with zero." - ) - self.adjusted_budget = ApproxDPBudget(epsilon, 0) - elif mechanism in ( - AverageMechanism.DEFAULT, - CountDistinctMechanism.DEFAULT, - CountMechanism.DEFAULT, - StdevMechanism.DEFAULT, - SumMechanism.DEFAULT, - VarianceMechanism.DEFAULT, - ): - self.adjusted_budget = ApproxDPBudget(epsilon, 0) - elif mechanism is None: - # Quantile has no mechanism - self.adjusted_budget = ApproxDPBudget(epsilon, 0) - else: - raise AnalyticsInternalError(f"Unknown mechanism {mechanism}.") + raise AnalyticsInternalError(f"Unknown mechanism {mechanism}.") def _validate_measurement(self, measurement: Measurement, mid_stability: sp.Expr): """Validate a measurement.""" diff --git a/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py b/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py index 84b41bb8..74733c20 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py +++ b/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py @@ -1082,7 +1082,7 @@ def _get_replace_with( else: # Check that all columns exist for col in replace_with: - if not col in analytics_schema: + if col not in analytics_schema: raise ValueError( f"Cannot replace values in column {col}, because it is not in" " the schema" @@ -1555,9 +1555,8 @@ def visit_enforce_constraint(self, expr: EnforceConstraint) -> Output: child_transformation, child_ref, child_constraints = self._visit_child( expr.child ) - # pylint: disable=protected-access transformation, ref = expr.constraint._enforce(child_transformation, child_ref) - # pylint: enable=protected-access + return self.Output( transformation, ref, diff --git a/src/tmlt/analytics/_query_expr_compiler/_measurement_visitor.py b/src/tmlt/analytics/_query_expr_compiler/_measurement_visitor.py index 43e6c60e..2ca65086 100644 --- a/src/tmlt/analytics/_query_expr_compiler/_measurement_visitor.py +++ b/src/tmlt/analytics/_query_expr_compiler/_measurement_visitor.py @@ -63,9 +63,7 @@ def _handle_enforce( **kwargs, ) -> Tuple[Transformation, TableReference]: """Enforce a constraint after a child transformation.""" - return constraint._enforce( # pylint: disable=protected-access - child_transformation, child_ref, **kwargs - ) + return constraint._enforce(child_transformation, child_ref, **kwargs) def visit_get_groups(self, expr: GetGroups) -> Tuple[Measurement, NoiseInfo]: """Create a measurement from a GetGroups query expression.""" diff --git 
diff --git a/src/tmlt/analytics/_transformation_utils.py b/src/tmlt/analytics/_transformation_utils.py
index bd65b8aa..8db813ec 100644
--- a/src/tmlt/analytics/_transformation_utils.py
+++ b/src/tmlt/analytics/_transformation_utils.py
@@ -3,8 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright Tumult Labs 2025
 
-# pylint: disable=unused-argument
-
 from typing import Callable, Dict, Optional, Tuple, Type, cast
 
 from tmlt.core.domains.base import Domain
diff --git a/src/tmlt/analytics/binning_spec.py b/src/tmlt/analytics/binning_spec.py
index 53b8b0e9..8d46e850 100644
--- a/src/tmlt/analytics/binning_spec.py
+++ b/src/tmlt/analytics/binning_spec.py
@@ -105,17 +105,16 @@ def _default_bin_names(
             f"({bin_edge_strs[i]}, {bin_edge_strs[i+1]}]"
             for i in range(len(bin_edges) - 1)
         ]
+    elif include_edges:
+        return [
+            f"[{bin_edge_strs[i]}, {bin_edge_strs[i+1]})"
+            for i in range(len(bin_edges) - 2)
+        ] + [f"[{bin_edge_strs[-2]}, {bin_edge_strs[-1]}]"]
     else:
-        if include_edges:
-            return [
-                f"[{bin_edge_strs[i]}, {bin_edge_strs[i+1]})"
-                for i in range(len(bin_edges) - 2)
-            ] + [f"[{bin_edge_strs[-2]}, {bin_edge_strs[-1]}]"]
-        else:
-            return [
-                f"[{bin_edge_strs[i]}, {bin_edge_strs[i+1]})"
-                for i in range(len(bin_edges) - 1)
-            ]
+        return [
+            f"[{bin_edge_strs[i]}, {bin_edge_strs[i+1]})"
+            for i in range(len(bin_edges) - 1)
+        ]
 
 
 @dataclass(frozen=True, init=False, eq=False, repr=False)
@@ -234,10 +233,8 @@ def __init__(
             raise ValueError(f"Invalid bin names: {e}") from e
         # This typecheck cannot be done safely with isinstance because datetime
         # is a subclass of date.
-        if (
-            # pylint: disable=unidiomatic-typecheck
-            nan_bin is not None
-            and type(nan_bin) != column_type_to_py_type(column_descriptor.column_type)
+        if nan_bin is not None and type(nan_bin) != column_type_to_py_type(
+            column_descriptor.column_type
         ):
             raise ValueError("NaN bin name must have the same type as other bin names")
diff --git a/src/tmlt/analytics/config.py b/src/tmlt/analytics/config.py
index 6b62fde9..ee805250 100644
--- a/src/tmlt/analytics/config.py
+++ b/src/tmlt/analytics/config.py
@@ -114,7 +114,7 @@ class Config:
 
     _instance = None
 
-    def __new__(cls, *args, **kwargs):  # noqa: D102
+    def __new__(cls, *args, **kwargs):
         # Enforces that Config is a singleton.
         # No docstring to prevent this from showing up in docs.
         if not cls._instance:
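# Aside: a minimal sketch of the singleton idiom that `Config.__new__` uses
# above (toy class, not the real Config): the first instance is cached on the
# class and every later construction returns that same object.
class Singleton:
    _instance = None

    def __new__(cls, *args, **kwargs):
        # Create the shared instance only once.
        if not cls._instance:
            cls._instance = super().__new__(cls)
        return cls._instance

assert Singleton() is Singleton()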
diff --git a/src/tmlt/analytics/keyset/_keyset.py b/src/tmlt/analytics/keyset/_keyset.py
index e7c39022..93c89e26 100644
--- a/src/tmlt/analytics/keyset/_keyset.py
+++ b/src/tmlt/analytics/keyset/_keyset.py
@@ -207,11 +207,11 @@ def _detect(columns: Sequence[str]) -> KeySetPlan:
     # Pydocstyle doesn't seem to understand overloads, so we need to disable the
     # check that a docstring exists for them.
     @overload
-    def __mul__(self, other: KeySet) -> KeySet:  # noqa: D105
+    def __mul__(self, other: KeySet) -> KeySet:
         ...
 
     @overload
-    def __mul__(self, other: KeySetPlan) -> KeySetPlan:  # noqa: D105
+    def __mul__(self, other: KeySetPlan) -> KeySetPlan:
         ...
 
     def __mul__(self, other):
@@ -320,11 +320,11 @@ def __getitem__(self, desired_columns: Union[str, Sequence[str]]) -> KeySet:
     # Pydocstyle doesn't seem to understand overloads, so we need to disable the
     # check that a docstring exists for them.
     @overload
-    def join(self, other: KeySet) -> KeySet:  # noqa: D105
+    def join(self, other: KeySet) -> KeySet:
         ...
 
     @overload
-    def join(self, other: KeySetPlan) -> KeySetPlan:  # noqa: D105
+    def join(self, other: KeySetPlan) -> KeySetPlan:
         ...
 
     def join(self, other):
@@ -353,13 +353,11 @@ def join(self, other):
         )
         if isinstance(other, KeySet):
             return KeySet(
-                # pylint: disable-next=protected-access
                 Join(self._op_tree, other._op_tree),
                 columns=list(dict.fromkeys(self.columns() + other.columns())),
             )
         else:
             return KeySetPlan(
-                # pylint: disable-next=protected-access
                 Join(self._op_tree, other._op_tree),
                 columns=list(dict.fromkeys(self.columns() + other.columns())),
             )
@@ -413,7 +411,6 @@ def columns(self) -> list[str]:
         return list(self._columns)
 
     def schema(self) -> dict[str, ColumnDescriptor]:
-        # pylint: disable=line-too-long
         """Returns the KeySet's schema.
 
         Example:
@@ -427,7 +424,6 @@ def schema(self) -> dict[str, ColumnDescriptor]:
            {'A': ColumnDescriptor(column_type=ColumnType.VARCHAR, allow_null=False, allow_nan=False, allow_inf=False), 'B': ColumnDescriptor(column_type=ColumnType.INTEGER, allow_null=True, allow_nan=False, allow_inf=False)}
         """
-        # pylint: enable=line-too-long
         schema = self._op_tree.schema()
         return {c: schema[c] for c in self.columns()}  # Reorder to match self.columns()
@@ -491,7 +487,7 @@ def is_equivalent(self, other: Union[KeySet, KeySetPlan]) -> Optional[bool]:
         if not isinstance(other, KeySet):
             return False
 
-        if self._op_tree == other._op_tree:  # pylint: disable=protected-access
+        if self._op_tree == other._op_tree:
             return True
 
         # Differing column nullability doesn't necessarily mean that two KeySets
@@ -729,7 +725,6 @@ def join(self, other: Union[KeySet, KeySetPlan]) -> KeySetPlan:
             )
 
         return KeySetPlan(
-            # pylint: disable-next=protected-access
             Join(self._op_tree, other._op_tree),
             columns=list(dict.fromkeys(self.columns() + other.columns())),
         )
@@ -777,7 +772,7 @@ def is_equivalent(self, other: Union[KeySet, KeySetPlan]) -> Optional[bool]:
         if not isinstance(other, KeySetPlan):
             return False
 
-        if self._op_tree == other._op_tree:  # pylint: disable=protected-access
+        if self._op_tree == other._op_tree:
             return True
 
         if self.columns() != other.columns():
diff --git a/src/tmlt/analytics/privacy_budget.py b/src/tmlt/analytics/privacy_budget.py
index a8d32d4d..2b9680b2 100644
--- a/src/tmlt/analytics/privacy_budget.py
+++ b/src/tmlt/analytics/privacy_budget.py
@@ -231,7 +231,7 @@ class ApproxDPBudget(PrivacyBudget):
     This privacy definition is also known as (ε, δ)-differential privacy, and the
     associated privacy parameters are epsilon and delta. The formal definition can
     be found `here `__.
-    """  # pylint: disable=line-too-long
+    """
 
     _epsilon: ExactNumber
     _delta: ExactNumber
@@ -567,7 +567,6 @@ def _get_adjusted_budget(
         requested_privacy_budget: The requested privacy budget.
         remaining_privacy_budget: How much privacy budget we have left.
     """
-    # pylint: disable=protected-access
    if isinstance(requested_privacy_budget, PureDPBudget) and isinstance(
        remaining_privacy_budget, PureDPBudget
    ):
@@ -594,7 +593,7 @@ def _get_adjusted_budget(
             requested_privacy_budget._rho, remaining_privacy_budget._rho
         )
         return RhoZCDPBudget(adjusted_rho)
-    # pylint: enable=protected-access
+
     else:
         raise ValueError(
             "Unable to compute a privacy budget with the requested budget "
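# Aside: the joins above build their output column list with
# `list(dict.fromkeys(self.columns() + other.columns()))`; a standalone
# sketch showing that this deduplicates while preserving first-seen order:
left_columns = ["A", "B"]
right_columns = ["B", "C"]
assert list(dict.fromkeys(left_columns + right_columns)) == ["A", "B", "C"]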
diff --git a/src/tmlt/analytics/query_builder.py b/src/tmlt/analytics/query_builder.py
index c879bd23..b33d1974 100644
--- a/src/tmlt/analytics/query_builder.py
+++ b/src/tmlt/analytics/query_builder.py
@@ -69,20 +69,20 @@
 # Override exported names to include ColumnType and ColumnDescriptor, as well as
 # types from _query_expr.
 __all__ = [
-    "Row",
-    "QueryBuilder",
-    "GroupedQueryBuilder",
-    "ColumnDescriptor",
-    "ColumnType",
     "AnalyticsDefault",
     "AverageMechanism",
-    "CountMechanism",
+    "ColumnDescriptor",
+    "ColumnType",
     "CountDistinctMechanism",
+    "CountMechanism",
+    "GroupbyCountQuery",
+    "GroupedQueryBuilder",
+    "Query",
+    "QueryBuilder",
+    "Row",
     "StdevMechanism",
     "SumMechanism",
     "VarianceMechanism",
-    "Query",
-    "GroupbyCountQuery",
 ]
 
 Row = Dict[str, Any]
@@ -145,7 +145,7 @@ def _is_equivalent(self, other: Any) -> bool:
             return False
 
         query = self._query_expr
-        other_query = other._query_expr  # pylint: disable=protected-access
+        other_query = other._query_expr
 
         return _query_expr_recursive_equivalence(query, other_query)
 
@@ -290,11 +290,11 @@ def __init__(self, source_id: str):
         self._source_id: str = source_id
         self._query_expr: QueryExpr = PrivateSource(source_id)
 
-    def clone(self) -> QueryBuilder:  # noqa: D102
+    def clone(self) -> QueryBuilder:
         # Returns a new QueryBuilder with the same partial query as the current one.
         # No docstring to prevent this from showing in docs.
         builder = QueryBuilder(self._source_id)
-        builder._query_expr = self._query_expr  # pylint: disable=protected-access
+        builder._query_expr = self._query_expr
         return builder
 
     def join_public(
@@ -467,7 +467,6 @@ def join_private(
         truncation_strategy_right: Optional[TruncationStrategy.Type] = None,
         join_columns: Optional[Sequence[str]] = None,
     ) -> "QueryBuilder":
-        # pylint: disable=protected-access
         """Join the table with another :class:`QueryBuilder`.
 
         The current query can also be joined with a named private table
@@ -2810,7 +2809,6 @@ def __init__(self, source_id, query_expr, groupby_keys) -> None:
 
         Do not construct directly; use :func:`~QueryBuilder.groupby`.
         """
-        # pylint: disable=pointless-string-statement
         """
         Args:
            source_id: The source id used in the query_expr.
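# Aside: a self-contained sketch of the `@overload` style retained above. With
# the pydocstyle `# noqa: D105` markers dropped, the stubs keep their bare
# `...` bodies; only the final definition exists at runtime. Toy function,
# not part of the library.
from typing import Union, overload

@overload
def double(x: int) -> int: ...
@overload
def double(x: str) -> str: ...
def double(x: Union[int, str]) -> Union[int, str]:
    """Return the argument doubled, preserving its type."""
    return x + x  # type: ignore[operator]

assert double(2) == 4
assert double("ab") == "abab"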
diff --git a/src/tmlt/analytics/session.py b/src/tmlt/analytics/session.py
index ff34208f..05bc025f 100644
--- a/src/tmlt/analytics/session.py
+++ b/src/tmlt/analytics/session.py
@@ -7,9 +7,9 @@
 from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast
 from warnings import warn
 
-import pandas as pd  # pylint: disable=unused-import
+import pandas as pd  # needed for doctests
 import sympy as sp
-from pyspark.sql import SparkSession  # pylint: disable=unused-import
+from pyspark.sql import SparkSession  # needed for doctests
 from pyspark.sql import DataFrame
 from tabulate import tabulate
 from tmlt.core.domains.collections import DictDomain
@@ -89,7 +89,7 @@
     RhoZCDPBudget,
     _get_adjusted_budget,
 )
-from tmlt.analytics.protected_change import (  # pylint: disable=unused-import
+from tmlt.analytics.protected_change import (  # AddOneRow needed for doctests
     AddMaxRows,
     AddMaxRowsInMaxGroups,
     AddOneRow,
@@ -269,7 +269,7 @@ def build(self) -> "Session":
                 source_id: dataframe
                 for source_id, (dataframe, _) in self._private_dataframes.items()
             }
-            sess = self.get_class_type()._from_neighboring_relation(  # pylint: disable=protected-access
+            sess = self.get_class_type()._from_neighboring_relation(
                 self._privacy_budget, tables, neighboring_relation
             )
             # check list of ARK identifiers against session's ID spaces
@@ -305,7 +305,6 @@ def __init__(
 
         @nodoc
         """
-        # pylint: disable=pointless-string-statement
         """
         Args documented for internal use.
             accountant: A PrivacyAccountant.
@@ -328,7 +327,6 @@ def __init__(
             NamedTable(t): [] for t in self.private_sources
         }
 
-    # pylint: disable=line-too-long
     @classmethod
     @typechecked
     def from_dataframe(
@@ -387,7 +385,6 @@ def from_dataframe(
                 specifying what changes to the input data the resulting
                 :class:`Session` should protect.
         """
-        # pylint: enable=line-too-long
         session_builder = (
             cls.Builder()
             .with_privacy_budget(privacy_budget=privacy_budget)
@@ -407,7 +404,6 @@ def _create_accountant_from_neighboring_relation(
         private_sources: Dict[str, DataFrame],
         relation: NeighboringRelation,
     ) -> Tuple[PrivacyAccountant, Any]:
-        # pylint: disable=protected-access
        output_measure: Union[PureDP, ApproxDP, RhoZCDP]
        sympy_budget: Union[sp.Expr, Tuple[sp.Expr, sp.Expr]]
        if isinstance(privacy_budget, PureDPBudget):
@@ -428,7 +424,6 @@ def _create_accountant_from_neighboring_relation(
         elif isinstance(privacy_budget, RhoZCDPBudget):
             output_measure = RhoZCDP()
             sympy_budget = privacy_budget._rho.expr
-        # pylint: enable=protected-access
         else:
             raise ValueError(
                 f"Unsupported PrivacyBudget variant: {type(privacy_budget)}"
@@ -620,7 +615,6 @@ def describe(
             obj: The table or query to be described, or None to describe the whole
                 Session.
         """
-        # pylint: disable=protected-access
         if obj is None:
             print(self._describe_self())
         elif isinstance(obj, GroupedQueryBuilder):
@@ -634,7 +628,6 @@ def describe(
             print(self._describe_query_obj(QueryBuilder(obj)._query_expr))
         else:
             assert_never(obj)
-        # pylint: enable=protected-access
 
     def _describe_self(self) -> str:
         """Describes the current state of this session."""
@@ -922,7 +915,6 @@ def _catalog(self) -> Catalog:
             )
         return catalog
 
-    # pylint: disable=line-too-long
     @typechecked
     def add_public_dataframe(self, source_id: str, dataframe: DataFrame):
         """Adds a public data source to the session.
@@ -972,7 +964,6 @@ def add_public_dataframe(self, source_id: str, dataframe: DataFrame):
             source_id: The name of the public data source.
             dataframe: The public data source corresponding to the ``source_id``.
         """
-        # pylint: enable=line-too-long
         assert_is_identifier(source_id)
         if source_id in self.public_sources or source_id in self.private_sources:
             raise ValueError(f"This session already has a table named '{source_id}'.")
@@ -1057,11 +1048,10 @@ def _noise_info(
             [{'noise_mechanism': <_NoiseMechanism.GEOMETRIC: 2>, 'noise_parameter': 2}]
         """
         if isinstance(query_expr, Query):
-            query_expr = query_expr._query_expr  # pylint: disable=protected-access
+            query_expr = query_expr._query_expr
         _, _, noise_info = self._compile_and_get_info(query_expr, privacy_budget)
         return list(iter(noise_info))
 
-    # pylint: disable=line-too-long
     def evaluate(
         self,
         query_expr: Query,
@@ -1118,9 +1108,8 @@ def evaluate(
             query_expr: One query expression to answer.
             privacy_budget: The privacy budget used for the query.
         """
-        # pylint: enable=line-too-long
         check_type(query_expr, Query)
-        query = query_expr._query_expr  # pylint: disable=protected-access
+        query = query_expr._query_expr
         measurement, adjusted_budget, _ = self._compile_and_get_info(
             query, privacy_budget
         )
@@ -1166,7 +1155,6 @@ def evaluate(
                     "for more information."
                 ) from e
 
-    # pylint: disable=line-too-long
     @typechecked
     def create_view(
         self,
@@ -1234,13 +1222,12 @@ def create_view(
             source_id: The name, or unique identifier, of the view.
             cache: Whether or not to cache the view.
         """
-        # pylint: enable=line-too-long
         assert_is_identifier(source_id)
         self._activate_accountant()
         if source_id in self.private_sources or source_id in self.public_sources:
             raise ValueError(f"Table '{source_id}' already exists.")
 
-        query = query_expr._query_expr  # pylint: disable=protected-access
+        query = query_expr._query_expr
 
         transformation, ref, constraints = QueryExprCompiler(
             self._output_measure
@@ -1315,7 +1302,7 @@ def _create_partition_constraint(
         behavior of constraints, not for code maintainability.
         """
         if isinstance(constraint, MaxGroupsPerID):
-            return constraint._enforce(  # pylint: disable=protected-access
+            return constraint._enforce(
                 child_transformation=child_transformation,
                 child_ref=child_ref,
                 update_metric=True,
@@ -1326,7 +1313,7 @@ def _create_partition_constraint(
             raise AnalyticsInternalError(
                 f"Expected MaxGroupsPerID or MaxRowsPerID constraints, but got {constraint} instead."
             )
-        return constraint._enforce(  # pylint: disable=protected-access
+        return constraint._enforce(
             child_transformation=child_transformation,
             child_ref=child_ref,
             update_metric=True,
@@ -1482,7 +1469,6 @@ def _create_partition_transformation(
         )
         return transformation
 
-    # pylint: disable=line-too-long
     @typechecked
     def partition_and_create(
         self,
@@ -1576,7 +1562,6 @@ def partition_and_create(
             splits: Mapping of split name to value of partition. Split name is
                 ``source_id`` in new session.
         """
-        # pylint: enable=line-too-long
         # If you remove this if-block, mypy will complain
         if not (
             isinstance(self._accountant.privacy_budget, ExactNumber)
diff --git a/src/tmlt/analytics/utils.py b/src/tmlt/analytics/utils.py
index 0658d62d..722e0f7c 100644
--- a/src/tmlt/analytics/utils.py
+++ b/src/tmlt/analytics/utils.py
@@ -65,9 +65,7 @@ def check_installation():
             # If Spark is broken, the Core cleanup atexit hook will fail, which
             # produces some additional output the user doesn't need to see in
             # this case.
-            atexit.unregister(
-                core_cleanup._cleanup_temp  # pylint: disable=protected-access
-            )
+            atexit.unregister(core_cleanup._cleanup_temp)
             if (
                 e.args
                 and isinstance(e.args[0], str)
@@ -124,8 +122,8 @@ def check_installation():
         )
         if (
             len(result.columns) != 2
-            or not "A" in result.columns
-            or not "count" in result.columns
+            or "A" not in result.columns
+            or "count" not in result.columns
         ):
             raise RuntimeError(
                 "Expected output to have columns 'A' and 'count', but instead it had"
@@ -153,7 +151,7 @@ def check_installation():
             "Installation check complete. Tumult Analytics appears to be properly"
             " installed."
         )
-    except Exception as e:  # pylint: disable=broad-except
+    except Exception as e:
         print("  FAILED\n")
         if not str(e).startswith("It looks like the analytics session"):
             raise RuntimeError(
diff --git a/test/conftest.py b/test/conftest.py
index cef9a322..d8944dcf 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -358,7 +358,7 @@ def create_empty_input(domain: SparkDataFrameDomain) -> DataFrame:
     ...
 
 
-def create_empty_input(domain):  # pylint: disable=missing-type-doc
+def create_empty_input(domain):
     """Returns an empty input for a given domain.
 
     Args:
diff --git a/test/system/session/ids/test_constraint_propagation.py b/test/system/session/ids/test_constraint_propagation.py
index 266af448..ec34b19b 100644
--- a/test/system/session/ids/test_constraint_propagation.py
+++ b/test/system/session/ids/test_constraint_propagation.py
@@ -32,11 +32,9 @@ def _test_propagation(query, expected_constraints, session):
     """Verify that the table resulting from a query has the expected constraints."""
     session.create_view(query, "view", cache=False)
 
-    # pylint: disable=protected-access
     assert set(session._table_constraints[NamedTable("view")]) == set(
         expected_constraints
     )
-    # pylint: enable=protected-access
 
 
 @pytest.mark.parametrize(
diff --git a/test/system/session/ids/test_count_distinct_optimization.py b/test/system/session/ids/test_count_distinct_optimization.py
index f2b265bc..432f06ce 100644
--- a/test/system/session/ids/test_count_distinct_optimization.py
+++ b/test/system/session/ids/test_count_distinct_optimization.py
@@ -143,9 +143,7 @@ def test_insufficient_constraints(query: QueryBuilder, session):
 )
 def test_noise_scale_puredp(query: QueryExpr, expected_noise: List[float], session):
     """Noise scales are adjusted correctly for different truncations with pure DP."""
-    # pylint: disable=protected-access
     noise_info = session._noise_info(query, PureDPBudget(1))
-    # pylint: enable=protected-access
     noise = [info["noise_parameter"] for info in noise_info]
     assert noise == expected_noise
 
@@ -189,8 +187,6 @@ def test_noise_scale_puredp(query: QueryExpr, expected_noise: List[float], sessi
 )
 def test_noise_scale_zcdp(query: QueryExpr, expected_noise: List[float], session):
     """Noise scales are adjusted correctly for different truncations with zCDP."""
-    # pylint: disable=protected-access
     noise_info = session._noise_info(query, RhoZCDPBudget(1))
-    # pylint: enable=protected-access
     noise = [info["noise_parameter"] for info in noise_info]
     assert noise == expected_noise
diff --git a/test/system/session/ids/test_id_col_operations.py b/test/system/session/ids/test_id_col_operations.py
index a0069deb..d5efb410 100644
--- a/test/system/session/ids/test_id_col_operations.py
+++ b/test/system/session/ids/test_id_col_operations.py
@@ -238,7 +238,7 @@ def test_various_session_builds(
             "id_b2", _session_data[df4], protected_change=AddRowsWithID("id", "b")
         )
     ).build()
-    # pylint: disable=protected-access
+
     for table_collection, ark_metric in sess._input_metric.key_to_metric.items():
         dict_domain = sess._input_domain.key_to_domain[table_collection]
         assert isinstance(ark_metric, AddRemoveKeys)
@@ -264,4 +264,3 @@ def test_various_session_builds(
         .allow_null
         for table_id, key_column in ark_metric.df_to_key_column.items()
     )
-    # pylint: enable=protected-access
diff --git a/test/system/session/ids/test_l0_linf_truncation.py b/test/system/session/ids/test_l0_linf_truncation.py
index acd5db74..24850b97 100644
--- a/test/system/session/ids/test_l0_linf_truncation.py
+++ b/test/system/session/ids/test_l0_linf_truncation.py
@@ -410,9 +410,7 @@ def test_mismatched_grouping_columns(session):
 )
 def test_noise_scale_puredp(query: QueryExpr, expected_noise: List[float], session):
     """Noise scales are adjusted correctly for different truncations with pure DP."""
-    # pylint: disable=protected-access
     noise_info = session._noise_info(query, PureDPBudget(1))
-    # pylint: enable=protected-access
     noise = [info["noise_parameter"] for info in noise_info]
     assert noise == expected_noise
 
@@ -485,9 +483,7 @@ def test_noise_scale_puredp(query: QueryExpr, expected_noise: List[float], sessi
 )
 def test_noise_scale_zcdp(query: QueryExpr, expected_noise: List[float], session):
     """Noise scales are adjusted correctly for different truncations with zCDP."""
-    # pylint: disable=protected-access
     noise_info = session._noise_info(query, RhoZCDPBudget(1))
-    # pylint: enable=protected-access
     noise = [info["noise_parameter"] for info in noise_info]
     assert noise == expected_noise
 
@@ -556,10 +552,7 @@ def test_constraint_selection_puredp(
     for c in constraints:
         base_query.enforce(c)
     query = base_query.groupby(group).count() if group else base_query.count()
-
-    # pylint: disable=protected-access
     noise_info = session._noise_info(query, PureDPBudget(1))
-    # pylint: enable=protected-access
     noise = [info["noise_parameter"] for info in noise_info]
     assert noise == expected_noise
 
@@ -641,9 +634,6 @@ def test_constraint_selection_zcdp(
     for c in constraints:
         base_query.enforce(c)
     query = base_query.groupby(group).count() if group else base_query.count()
-
-    # pylint: disable=protected-access
     noise_info = session._noise_info(query, RhoZCDPBudget(1))
-    # pylint: enable=protected-access
     noise = [info["noise_parameter"] for info in noise_info]
     assert noise == expected_noise
diff --git a/test/system/session/ids/test_l1_truncation.py b/test/system/session/ids/test_l1_truncation.py
index a1d8aa73..2f9be2dc 100644
--- a/test/system/session/ids/test_l1_truncation.py
+++ b/test/system/session/ids/test_l1_truncation.py
@@ -304,9 +304,7 @@ def test_stdev_grouped(
 )
 def test_noise_scale_puredp(query: QueryExpr, expected_noise: List[float], session):
     """Noise scales are adjusted correctly for different truncations with pure DP."""
-    # pylint: disable=protected-access
     noise_info = session._noise_info(query, PureDPBudget(1))
-    # pylint: enable=protected-access
     noise = [info["noise_parameter"] for info in noise_info]
     assert noise == expected_noise
 
@@ -338,8 +336,6 @@ def test_noise_scale_puredp(query: QueryExpr, expected_noise: List[float], sessi
 )
 def test_noise_scale_zcdp(query: QueryExpr, expected_noise: List[float], session):
     """Noise scales are adjusted correctly for different truncations with zCDP."""
-    # pylint: disable=protected-access
     noise_info = session._noise_info(query, RhoZCDPBudget(1))
-    # pylint: enable=protected-access
     noise = [info["noise_parameter"] for info in noise_info]
     assert noise == expected_noise
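# Aside: the noise-scale tests above all share one shape; a runnable sketch
# with a stubbed stand-in for `session._noise_info` (per the doctest shown
# earlier, the real method returns one dict per noisy measurement):
def fake_noise_info(query, budget):
    # Stand-in for session._noise_info(query, budget).
    return [
        {"noise_mechanism": "GEOMETRIC", "noise_parameter": 2},
        {"noise_mechanism": "GEOMETRIC", "noise_parameter": 4},
    ]

noise = [info["noise_parameter"] for info in fake_noise_info("query", 1)]
assert noise == [2, 4]  # pins down both the measurement count and the scales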
diff --git a/test/system/session/ids/test_partition.py b/test/system/session/ids/test_partition.py
index 12239bd3..158a6b47 100644
--- a/test/system/session/ids/test_partition.py
+++ b/test/system/session/ids/test_partition.py
@@ -133,7 +133,7 @@ def test_partition_and_create_with_MaxRowsPerID(session, table_stability):
         session.remaining_privacy_budget,
     )
     assert_dataframe_equal(answer_session3, pd.DataFrame({"count": [1]}))
-    # pylint: disable=protected-access
+
     assert session2._input_metric == DictMetric(
         {NamedTable("part0"): SymmetricDifference()}
     )
@@ -142,7 +142,6 @@ def test_partition_and_create_with_MaxRowsPerID(session, table_stability):
     )
     assert session2._accountant.d_in == {NamedTable("part0"): table_stability}
     assert session3._accountant.d_in == {NamedTable("part1"): table_stability}
-    # pylint: enable=protected-access
 
 
 @pytest.mark.parametrize(
@@ -185,7 +184,7 @@ def test_partition_and_create_with_MaxGroupsPerID(session, table_stability):
         session.remaining_privacy_budget,
     )
     assert_dataframe_equal(answer_session3, pd.DataFrame({"count": [1]}))
-    # pylint: disable=protected-access
+
     assert session2._input_metric == DictMetric(
         {TableCollection("a"): CoreAddRemoveKeys({NamedTable("part0"): "id"})}
     )
@@ -194,4 +193,3 @@ def test_partition_and_create_with_MaxGroupsPerID(session, table_stability):
     )
     assert session2._accountant.d_in == {TableCollection("a"): table_stability}
     assert session3._accountant.d_in == {TableCollection("a"): table_stability}
-    # pylint: enable=protected-access
diff --git a/test/system/session/mixed/test_mixed_session.py b/test/system/session/mixed/test_mixed_session.py
index 74362d73..7a5a2c66 100644
--- a/test/system/session/mixed/test_mixed_session.py
+++ b/test/system/session/mixed/test_mixed_session.py
@@ -1,7 +1,8 @@
 """Tests for Sessions that employ a mixture of IDs and non-IDs features.
 
 These are not meant to be exhaustive, but rather to ensure that the Session
-functions properly when used with a mixture of IDs and non-IDs protected changes."""
+functions properly when used with a mixture of IDs and non-IDs protected changes.
+"""
 
 # SPDX-License-Identifier: Apache-2.0
 # Copyright Tumult Labs 2025
@@ -33,18 +34,16 @@ def test_view_constraint(session):
         .enforce(MaxRowsPerGroupPerID("group", 1))
     )
     session.create_view(query, "view", cache=False)
-    # pylint: disable=protected-access
+
     assert session._table_constraints[NamedTable("view")] == [
         MaxRowsPerID(1),
         MaxGroupsPerID("group", 1),
         MaxRowsPerGroupPerID("group", 1),
     ]
-    # pylint: enable=protected-access
 
     session.delete_view("view")
-    # pylint: disable=protected-access
+
     assert NamedTable("view") not in session._table_constraints
-    # pylint: enable=protected-access
 
 
 # Test creating view, then doing (1) immediate aggregation and
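# Aside: several docstring edits above move the closing quotes of a
# multi-line docstring onto their own line (this appears consistent with
# pydocstyle's D209 convention); a minimal before/after sketch with toy
# functions:
def before():
    """Summary line that wraps
    onto a second line."""

def after():
    """Summary line that wraps
    onto a second line.
    """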
""" if expected_expr is not None: - # pylint: disable=protected-access query_expr = query_expr_or_builder._query_expr - # pylint: enable=protected-access assert query_expr == expected_expr session = Session.from_dataframe( @@ -256,9 +252,7 @@ def test_noise_info( dataframe=self.sdf, protected_change=AddOneRow(), ) - # pylint: disable=protected-access info = session._noise_info(query_expr, query_budget) - # pylint: enable=protected-access assert info == expected @pytest.mark.parametrize( @@ -441,7 +435,6 @@ def duplicate_rows(_: Row) -> List[Row]: ) def test_get_bounds_inf_budget(self, spark, data): """Test that the get_bounds produces reasonable bounds.""" - sdf = spark.createDataFrame(data) session = Session.from_dataframe( privacy_budget=PureDPBudget(float("inf")), @@ -491,7 +484,6 @@ def test_get_bounds_inf_budget(self, spark, data): ) def test_get_bounds_inf_budget_sum(self, spark, data): """Test that the bounds from get_bounds produce a reasonable sum.""" - sdf = spark.createDataFrame(data) session = Session.from_dataframe( privacy_budget=PureDPBudget(float("inf")), @@ -558,7 +550,6 @@ def test_get_bounds_invalid_columns( self, spark, data, column, error_type, message, protected_change ): """Test that get_bounds throws appropriate errors.""" - sdf = spark.createDataFrame(data) session = Session.from_dataframe( privacy_budget=PureDPBudget(float("inf")), @@ -775,8 +766,8 @@ def test_partition_and_create_approxDP_session_pureDP_partition( remaining_budget: PrivacyBudget, ): """Tests using :func:`partition_and_create` to create a new ApproxDP session - that supports PureDP partitions.""" - + that supports PureDP partitions. + """ is_approxDP_starting_budget = isinstance(starting_budget, ApproxDPBudget) if is_approxDP_starting_budget and isinstance(partition_budget, PureDPBudget): partition_budget = ApproxDPBudget(partition_budget.value, 0) @@ -980,7 +971,6 @@ def test_partition_execution_order( "X": ColumnDescriptor(ColumnType.INTEGER), } - # pylint: disable=protected-access assert session1._accountant.state == PrivacyAccountantState.WAITING_FOR_CHILDREN assert session2._accountant.state == PrivacyAccountantState.ACTIVE assert session3._accountant.state == PrivacyAccountantState.WAITING_FOR_SIBLING @@ -1014,8 +1004,6 @@ def test_partition_execution_order( ): session3.create_view(select_query3, "select_view_again", cache=False) - # pylint: enable=protected-access - @pytest.mark.parametrize( "budget", [(PureDPBudget(20)), (ApproxDPBudget(20, 0.5)), (RhoZCDPBudget(20))] ) @@ -1080,7 +1068,6 @@ def test_partition_on_nongrouping_column(self, budget: PrivacyBudget): ) def test_create_view_composed(self, budget: PrivacyBudget): """Composing views with :func:`create_view` works.""" - session = Session.from_dataframe( privacy_budget=budget, source_id="private", @@ -1094,9 +1081,7 @@ def test_create_view_composed(self, budget: PrivacyBudget): max_rows=2, ) session.create_view(transformation_query1, "flatmap1", cache=False) - # pylint: disable=protected-access assert session._accountant.d_in[NamedTable("flatmap1")] == 2 - # pylint: enable=protected-access transformation_query2 = QueryBuilder("flatmap1").flat_map( f=lambda row: [{}, {}], @@ -1105,9 +1090,7 @@ def test_create_view_composed(self, budget: PrivacyBudget): max_rows=3, ) session.create_view(transformation_query2, "flatmap2", cache=False) - # pylint: disable=protected-access assert session._accountant.d_in[NamedTable("flatmap2")] == 6 - # pylint: enable=protected-access @pytest.mark.parametrize( "budget", [(PureDPBudget(10)), 
(ApproxDPBudget(10, 0.5)), (RhoZCDPBudget(10))] @@ -1194,7 +1177,6 @@ def test_create_view_composed_correct_answer( def test_caching(self, spark): """Tests that caching works as expected.""" - # pylint: disable=protected-access session = Session.from_dataframe( privacy_budget=PureDPBudget(float("inf")), source_id="private", diff --git a/test/system/session/rows/test_add_max_rows_in_max_groups.py b/test/system/session/rows/test_add_max_rows_in_max_groups.py index 9a67b71e..4a335b9d 100644 --- a/test/system/session/rows/test_add_max_rows_in_max_groups.py +++ b/test/system/session/rows/test_add_max_rows_in_max_groups.py @@ -1,10 +1,10 @@ """Tests for Session with the AddMaxRowsInMaxGroups ProtectedChange. - Note that these tests are not intended to be exhaustive. They are intended to be a sanity check that the Session is working correctly with AddMaxRowsInMaxGroups. More thorough tests for Session are in -test/system/session/rows/test_add_max_rows.py.""" +test/system/session/rows/test_add_max_rows.py. +""" # SPDX-License-Identifier: Apache-2.0 # Copyright Tumult Labs 2025 @@ -162,7 +162,5 @@ def test_noise_info( "B", max_groups=1, max_rows_per_group=1 ), ) - # pylint: disable=protected-access info = session._noise_info(query_expr, query_budget) - # pylint: enable=protected-access assert info == expected diff --git a/test/system/session/rows/test_invalid.py b/test/system/session/rows/test_invalid.py index 8cb810b9..98a95228 100644 --- a/test/system/session/rows/test_invalid.py +++ b/test/system/session/rows/test_invalid.py @@ -55,7 +55,7 @@ def test_invalid_queries_evaluate( error_type: Type[Exception], expected_error_msg: str, ): - """evaluate raises error on invalid queries.""" + """Evaluate raises error on invalid queries.""" mock_accountant = Mock() mock_accountant.output_measure = PureDP() mock_accountant.input_metric = DictMetric( @@ -145,7 +145,7 @@ def test_format_insufficient_budget_msg( def test_invalid_privacy_budget_evaluate_and_create( self, output_measure: Union[PureDP, RhoZCDP] ): - """evaluate and create functions raise error on invalid privacy_budget.""" + """Evaluate and create functions raise error on invalid privacy_budget.""" one_budget: Union[PureDPBudget, ApproxDPBudget, RhoZCDPBudget] two_budget: Union[PureDPBudget, ApproxDPBudget, RhoZCDPBudget] if output_measure == PureDP(): diff --git a/test/unit/keysets/test_cross_join.py b/test/unit/keysets/test_cross_join.py index 022e4e85..8184af07 100644 --- a/test/unit/keysets/test_cross_join.py +++ b/test/unit/keysets/test_cross_join.py @@ -202,7 +202,6 @@ def test_valid( assert_dataframe_equal(ks.dataframe(), expected_df) -# pylint: disable=protected-access @parametrize( Case("left_plan")( left=KeySet._detect(["A"]), @@ -235,7 +234,6 @@ def test_valid( expected_columns=["B", "A"], ), ) -# pylint: enable=protected-access def test_valid_plan( left: Union[KeySet, KeySetPlan], right: Union[KeySet, KeySetPlan], diff --git a/test/unit/keysets/test_decomposition.py b/test/unit/keysets/test_decomposition.py index f247015f..1ea8d020 100644 --- a/test/unit/keysets/test_decomposition.py +++ b/test/unit/keysets/test_decomposition.py @@ -213,7 +213,6 @@ def test_valid( expected_factors: list[KeySet], expected_subtracted_values: list[KeySet], ): - # pylint: disable-next=protected-access factors, subtracted_values = ks._decompose(split_columns) _assert_keyset_sequence_equivalent( diff --git a/test/unit/keysets/test_detect.py b/test/unit/keysets/test_detect.py index c101f618..e7724ba1 100644 --- a/test/unit/keysets/test_detect.py +++ 
b/test/unit/keysets/test_detect.py @@ -14,11 +14,11 @@ def test_detect(): """KeySet.detect works as expected.""" - ks = KeySet._detect(["A", "B"]) # pylint: disable=protected-access + ks = KeySet._detect(["A", "B"]) assert isinstance(ks, KeySetPlan) assert ks.columns() == ["A", "B"] - ks = KeySet._detect(["B", "A"]) # pylint: disable=protected-access + ks = KeySet._detect(["B", "A"]) assert isinstance(ks, KeySetPlan) assert ks.columns() == ["B", "A"] @@ -44,4 +44,4 @@ def test_detect(): def test_invalid(columns: Any, expectation: ContextManager[None]): """Invalid domains are rejected.""" with expectation: - KeySet._detect(columns) # pylint: disable=protected-access + KeySet._detect(columns) diff --git a/test/unit/keysets/test_equivalence.py b/test/unit/keysets/test_equivalence.py index cbac75f3..911860f1 100644 --- a/test/unit/keysets/test_equivalence.py +++ b/test/unit/keysets/test_equivalence.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright Tumult Labs 2025 -# pylint: disable=comparison-with-itself from typing import Optional, Union @@ -258,7 +257,6 @@ def test_equivalence_different_schemas(): assert ks1.is_equivalent(ks3) is False -# pylint: disable=protected-access @parametrize( Case("detect_eq")( ks1=KeySet._detect(["A", "B"]), @@ -328,7 +326,6 @@ def test_equivalence_different_schemas(): equivalent={False, None}, ), ) -# pylint: enable=protected-access def test_plan_equivalence( ks1: KeySetPlan, ks2: KeySetPlan, equivalent: Union[None, bool, set[Optional[bool]]] ): @@ -338,7 +335,7 @@ def test_plan_equivalence( assert ks1.is_equivalent(ks1) assert ks2.is_equivalent(ks2) if isinstance(equivalent, set): - assert (ks1 == ks2) in equivalent # pylint: disable=superfluous-parens + assert (ks1 == ks2) in equivalent assert isinstance(ks1 == ks2, bool) assert ks1.is_equivalent(ks2) in equivalent assert ks2.is_equivalent(ks1) in equivalent diff --git a/test/unit/keysets/test_filter.py b/test/unit/keysets/test_filter.py index 482928a6..965a07e4 100644 --- a/test/unit/keysets/test_filter.py +++ b/test/unit/keysets/test_filter.py @@ -111,7 +111,6 @@ def test_valid( assert_dataframe_equal(ks.dataframe(), expected_df) -# pylint: disable=protected-access @parametrize( Case("one_col_str")( base=KeySet._detect(["A"]), @@ -144,7 +143,6 @@ def test_valid( expected_columns=["A", "B"], ), ) -# pylint: enable=protected-access def test_valid_plan( base: KeySetPlan, condition: Callable[[], Union[str, Column]], diff --git a/test/unit/keysets/test_join.py b/test/unit/keysets/test_join.py index 4e82206b..6a4bcc18 100644 --- a/test/unit/keysets/test_join.py +++ b/test/unit/keysets/test_join.py @@ -178,7 +178,6 @@ def test_valid( assert_dataframe_equal(ks.dataframe(), expected_df) -# pylint: disable=protected-access @parametrize( Case("left_plan")( left=KeySet._detect(["A"]), @@ -211,7 +210,6 @@ def test_valid( expected_columns=["B", "C", "A"], ), ) -# pylint: enable=protected-access def test_valid_plan( left: Union[KeySet, KeySetPlan], right: Union[KeySet, KeySetPlan], diff --git a/test/unit/keysets/test_keyset.py b/test/unit/keysets/test_keyset.py index 98f4f71d..1ae158d3 100644 --- a/test/unit/keysets/test_keyset.py +++ b/test/unit/keysets/test_keyset.py @@ -808,7 +808,7 @@ def test_size_from_df(_, spark, pd_df, expected_size, schema): @pytest.fixture(scope="module") def _eq_hashing_test_data(spark): - "Set up test data." 
+ """Set up test data.""" pdf_ab = pd.DataFrame({"A": ["a1", "a2"], "B": [0, 1]}) df_ab = spark.createDataFrame(pdf_ab) pdf_ac = pd.DataFrame({"A": ["a1", "a2"], "C": [0, 1]}) diff --git a/test/unit/keysets/test_product_keyset.py b/test/unit/keysets/test_product_keyset.py index 6b323e16..aeed1824 100644 --- a/test/unit/keysets/test_product_keyset.py +++ b/test/unit/keysets/test_product_keyset.py @@ -16,8 +16,6 @@ from tmlt.analytics import ColumnDescriptor, ColumnType, KeySet -# pylint: disable=unused-argument - def test_init_with_product_keyset( spark: SparkSession, @@ -391,7 +389,7 @@ def test_size(_, keyset: KeySet, expected: int): @pytest.fixture(scope="module") def _eq_hashing_test_data(spark): - "Set up test data." + """Set up test data.""" df_ab = spark.createDataFrame(pd.DataFrame({"A": ["a1", "a2"], "B": [0, 1]})) df_ij = spark.createDataFrame(pd.DataFrame({"I": ["i1", "i2"], "J": [0, 1]})) df_dc = spark.createDataFrame(pd.DataFrame({"D": ["d1", "d2"], "C": [0, 1]})) diff --git a/test/unit/keysets/test_project.py b/test/unit/keysets/test_project.py index 61019a2f..7b9f59be 100644 --- a/test/unit/keysets/test_project.py +++ b/test/unit/keysets/test_project.py @@ -69,7 +69,7 @@ ), Case("remove_detect_columns")( base=KeySet.from_tuples([(1, 2, 3)], ["A", "B", "C"]) - * KeySet._detect(["D", "E", "F"]), # pylint: disable=protected-access + * KeySet._detect(["D", "E", "F"]), columns=["A", "B"], expected_df=pd.DataFrame({"A": [1], "B": [2]}), expected_schema={ @@ -118,7 +118,6 @@ def test_valid( assert_dataframe_equal(ks.dataframe(), expected_df) -# pylint: disable=protected-access @parametrize( Case("one_column_str")( base=KeySet._detect(["A", "B"]), @@ -147,7 +146,6 @@ def test_valid( columns=["C", "A", "E"], ), ) -# pylint: enable=protected-access def test_valid_plan( base: KeySetPlan, columns: Union[str, Sequence[str]], diff --git a/test/unit/keysets/test_rewrite.py b/test/unit/keysets/test_rewrite.py index 83cd15f4..e0e57e87 100644 --- a/test/unit/keysets/test_rewrite.py +++ b/test/unit/keysets/test_rewrite.py @@ -200,7 +200,6 @@ def test_rewrite_equality( if not allow_unchanged: # Ensure that rewriting actually happened - # pylint: disable-next=protected-access assert ks_rewritten._op_tree != ks_original._op_tree assert ks_rewritten.columns() == ks_original.columns() diff --git a/test/unit/keysets/test_subtract.py b/test/unit/keysets/test_subtract.py index dfc68239..a449e7fc 100644 --- a/test/unit/keysets/test_subtract.py +++ b/test/unit/keysets/test_subtract.py @@ -151,7 +151,6 @@ def test_valid( assert ks.schema() == expected_schema -# pylint: disable=protected-access @parametrize( Case("single_column")( left=KeySet._detect(["A"]), @@ -177,7 +176,6 @@ def test_valid( expected_columns=["A", "B"], ), ) -# pylint: enable=protected-access def test_valid_plan( left: KeySetPlan, right: KeySet, diff --git a/test/unit/query_expr_compiler/test_measurement_visitor.py b/test/unit/query_expr_compiler/test_measurement_visitor.py index 126edeb1..8e1625fd 100644 --- a/test/unit/query_expr_compiler/test_measurement_visitor.py +++ b/test/unit/query_expr_compiler/test_measurement_visitor.py @@ -355,7 +355,6 @@ def test_validate_measurement(self): mock_measurement.privacy_function.return_value = self.visitor.budget.value mid_stability = ExactNumber(2).expr # This should finish without raising an error - # pylint: disable=protected-access self.visitor._validate_measurement(mock_measurement, mid_stability) # Change it so that the privacy function returns something else @@ -365,7 +364,6 @@ def 
test_validate_measurement(self): match="Privacy function does not match per-query privacy budget.", ): self.visitor._validate_measurement(mock_measurement, mid_stability) - # pylint: enable=protected-access def _check_measurement(self, measurement: Measurement): """Check the basic attributes of a measurement (for all query exprs). @@ -693,10 +691,8 @@ def test_visit_groupby_quantile( noise_info: NoiseInfo, ) -> None: """Test visit_groupby_quantile.""" - # pylint: disable=protected-access self.run_with_empty_data_and_check_schema(query._query_expr, output_measure) self.check_noise_info(query._query_expr, output_measure, noise_info) - # pylint: enable=protected-access @pytest.mark.parametrize( "query,output_measure,noise_info", diff --git a/test/unit/query_expr_compiler/transformation_visitor/test_constraints.py b/test/unit/query_expr_compiler/transformation_visitor/test_constraints.py index e52a7868..02e8f44d 100644 --- a/test/unit/query_expr_compiler/transformation_visitor/test_constraints.py +++ b/test/unit/query_expr_compiler/transformation_visitor/test_constraints.py @@ -56,7 +56,7 @@ def test_max_rows_per_id(self, constraint_max: int): rows_per_id = result_df.groupby("id")["id"].count() assert all( rows_per_id <= constraint_max - ), f"MaxRowsPerID constraint violated, counts were:\n{str(rows_per_id)}" + ), f"MaxRowsPerID constraint violated, counts were:\n{rows_per_id}" self._test_is_subset(input_df, result_df) @@ -79,7 +79,7 @@ def test_max_groups_per_id(self, grouping_col: str, constraint_max: int): groups_per_id = result_df.groupby("id").nunique()[grouping_col] assert all( groups_per_id <= constraint_max - ), f"MaxGroupsPerID constraint violated, counts were:\n{str(groups_per_id)}" + ), f"MaxGroupsPerID constraint violated, counts were:\n{groups_per_id}" self._test_is_subset(input_df, result_df) diff --git a/test/unit/test_binning_spec.py b/test/unit/test_binning_spec.py index c5ad4045..dde28eb9 100644 --- a/test/unit/test_binning_spec.py +++ b/test/unit/test_binning_spec.py @@ -533,7 +533,6 @@ def test_immutable(): """Checks that each binning spec attribute is immutable.""" binspec = BinningSpec([0, 1, 2]) - # pylint: disable=protected-access with pytest.raises(FrozenInstanceError): binspec.bin_edges = [0, 1, 2, 3] # type: ignore @@ -571,7 +570,6 @@ def test_immutable(): with pytest.raises(FrozenInstanceError): binspec._both_endpoints = True # type: ignore - # pylint: enable=protected-access def test_repr(): diff --git a/test/unit/test_config.py b/test/unit/test_config.py index 35bb62b0..6a6e10af 100644 --- a/test/unit/test_config.py +++ b/test/unit/test_config.py @@ -20,16 +20,14 @@ def test_config_singleton(): # Adding feature flags for use in the tests is necessary because the collection # of feature flags existing at any given time is not stable. Unfortunately doing -# so makes mypy and pylint very unhappy, so we're ignoring errors related to the -# existence of an attribute on a class for the rest of this file. +# so makes mypy very unhappy, so we're ignoring errors related to the existence +# of an attribute on a class for the rest of this file. 
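# Aside: the immutability tests above rely on frozen dataclasses raising
# FrozenInstanceError on attribute assignment; a self-contained sketch (toy
# class, not the real BinningSpec):
from dataclasses import FrozenInstanceError, dataclass

@dataclass(frozen=True)
class FrozenSpec:
    bin_edges: tuple

spec = FrozenSpec(bin_edges=(0, 1, 2))
try:
    spec.bin_edges = (0, 1, 2, 3)  # type: ignore[misc]
except FrozenInstanceError:
    pass  # assignment is rejected, which is what the tests assert via pytest.raises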
# mypy: disable-error-code=attr-defined -# pylint: disable=no-member @pytest.fixture def _with_example_features(): - # pylint: disable=protected-access """Add some example feature flags for testing.""" class _Features(Config.Features): @@ -112,7 +110,6 @@ def test_config_feature_flag_raise_if_disabled(): @pytest.mark.usefixtures("_with_example_features") def test_config_feature_flag_raise_if_disabled_snippet(): - # pylint: disable=protected-access """Feature flags' raise_if_disabled produces example code that enables flag.""" cfg = Config() @@ -130,6 +127,6 @@ def test_config_feature_flag_raise_if_disabled_snippet(): ), "No snippet to enable flag found in exception message" enable_snippet = error_message[enable_snippet_idx:] with patch("tmlt.analytics.config.config", cfg): - exec(enable_snippet, {}, {}) # pylint: disable=exec-used + exec(enable_snippet, {}, {}) assert ff, f"Flag {ff._name} did not get set by snippet from exception message" ff.disable() diff --git a/test/unit/test_privacy_budget.py b/test/unit/test_privacy_budget.py index 36b49641..8ecb2734 100644 --- a/test/unit/test_privacy_budget.py +++ b/test/unit/test_privacy_budget.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright Tumult Labs 2025 -# pylint: disable=pointless-string-statement import math from typing import List, Type, Union @@ -265,17 +264,14 @@ def test_budget_hashing(budgets: List[PrivacyBudget], equal: bool): assert budget0_hash != budget1_hash -# pylint: disable=protected-access def test_PureDPBudget_immutability(): """Tests that the PureDPBudget is immutable.""" - with pytest.raises(AttributeError): PureDPBudget(1)._epsilon = 2 # type: ignore def test_ApproxDPBudget_immutability(): """Tests that the ApproxDPBudget is immutable.""" - with pytest.raises(AttributeError): ApproxDPBudget(1, 0.1)._epsilon = 2 # type: ignore with pytest.raises(AttributeError): @@ -284,14 +280,10 @@ def test_ApproxDPBudget_immutability(): def test_RhoZCDPBudget_immutability(): """Tests that the RhoZCDPBudget is immutable.""" - with pytest.raises(AttributeError): RhoZCDPBudget(1)._rho = 2 # type: ignore -# pylint: enable=protected-access - - @pytest.mark.parametrize( "budget_a, budget_b, equal", [ diff --git a/test/unit/test_privacy_budget_rounding_helper.py b/test/unit/test_privacy_budget_rounding_helper.py index cdde7227..3da5de4c 100644 --- a/test/unit/test_privacy_budget_rounding_helper.py +++ b/test/unit/test_privacy_budget_rounding_helper.py @@ -2,8 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright Tumult Labs 2025 -# pylint: disable=pointless-string-statement - from tmlt.core.utils.exact_number import ExactNumber from typeguard import typechecked @@ -31,7 +29,6 @@ def test_int_request(): we should never run into the tolerance threshold issue. This means the requested budget should be returned in all cases. """ - adjusted = _get_adjusted_budget(PURE_DP_99, PURE_DP_100) assert adjusted == PURE_DP_99 adjusted = _get_adjusted_budget(PURE_DP_101, PURE_DP_100) diff --git a/test/unit/test_query_builder.py b/test/unit/test_query_builder.py index ae73adfa..e0057954 100644 --- a/test/unit/test_query_builder.py +++ b/test/unit/test_query_builder.py @@ -2,10 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright Tumult Labs 2025 -# pylint: disable=no-member, protected-access -# Disabling no-member because attributes of specific query types need to be referenced, -# and the general QueryExpr type doesn't have the attribute. -# Disabling protected-access to access the _query_expr attribute of Query regularly. 
import datetime import re @@ -72,7 +68,6 @@ def root_builder(): @pytest.mark.parametrize("join_columns", [(None), (["B"])]) def test_join_public(join_columns: Optional[List[str]]): """QueryBuilder.join_public works as expected with a public source ID.""" - join_table = "public" query = ( root_builder() @@ -104,7 +99,6 @@ def test_join_public(join_columns: Optional[List[str]]): @pytest.mark.parametrize("join_columns", [(None), (["B"])]) def test_join_public_dataframe(spark, join_columns: Optional[List[str]]): """QueryBuilder.join_public works as expected when used with a dataframe.""" - join_table = spark.createDataFrame(pd.DataFrame({"A": [1, 2]})) query = ( root_builder() @@ -625,7 +619,6 @@ def test_histogram(): def test_histogram_options(): """QueryBuilder.histogram works as expected, with options.""" - query = root_builder().histogram("A", [0, 5, 10], name="New") assert isinstance(query, Query) query_expr = query._query_expr @@ -1377,7 +1370,6 @@ def test_suppress_aggregates( ) def test_query_immutability(query: Query): """Tests that Query objects are immutable.""" - with pytest.raises(FrozenInstanceError): query._query_expr = QueryBuilder("testdf").count()._query_expr # type: ignore @@ -1435,9 +1427,7 @@ def test_query_immutability(query: Query): ) def test_query_fast_equality_check(query1: Query, query2: Query, equal: bool): """Tests that Query objects are equal when they should be.""" - # pylint: disable=protected-access assert query1._is_equivalent(query2) == equal - # pylint: enable=protected-access def root_grouped_builder(): diff --git a/test/unit/test_query_expr_compiler.py b/test/unit/test_query_expr_compiler.py index 9674dbf0..379549d2 100644 --- a/test/unit/test_query_expr_compiler.py +++ b/test/unit/test_query_expr_compiler.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright Tumult Labs 2025 -# pylint: disable=protected-access import datetime from typing import Dict, Union @@ -403,7 +402,7 @@ @pytest.fixture(name="test_data", scope="class") def setup(spark, request) -> None: - "Set up test data." 
+ """Set up test data.""" sdf = spark.createDataFrame( pd.DataFrame( [["0", 0, 0.0], ["0", 0, 1.0], ["0", 1, 2.0], ["1", 0, 3.0]], diff --git a/test/unit/test_query_expression.py b/test/unit/test_query_expression.py index 20c7dbf1..52238fde 100644 --- a/test/unit/test_query_expression.py +++ b/test/unit/test_query_expression.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright Tumult Labs 2025 -# pylint: disable=too-many-arguments, pointless-string-statement import datetime import re @@ -579,7 +578,7 @@ def test_join_public_dataframe_validation_column_type(spark): ], ) def test_invalid_suppress_aggregates( - spark: SparkSession, # pylint: disable=unused-argument + spark: SparkSession, child: GroupByCount, column: str, threshold: int, @@ -604,4 +603,4 @@ def test_queryexpr_hashing(queryexpr): """Tests that each query expression has enabled hashing and eq.""" test_dict = {queryexpr: 1} assert test_dict[queryexpr] == 1 - assert queryexpr == queryexpr # pylint: disable=comparison-with-itself + assert queryexpr == queryexpr diff --git a/test/unit/test_schema.py b/test/unit/test_schema.py index 56c087d4..f52de2bf 100644 --- a/test/unit/test_schema.py +++ b/test/unit/test_schema.py @@ -68,7 +68,6 @@ def test_schema_equality() -> None: def test_schema_hash() -> None: """Makes sure that schema hash is consistent.""" - columns_1 = {"a": "VARCHAR", "b": "INTEGER"} columns_2 = {"a": "VARCHAR", "b": "INTEGER"} columns_3 = {"y": "VARCHAR", "z": "INTEGER"} @@ -87,7 +86,6 @@ def test_schema_hash() -> None: def test_frozen_dict(): """FrozenDict works like an immutable dict.""" - a = FrozenDict.from_dict({"a": 1, "b": 2}) assert a["a"] == 1 assert a["b"] == 2 @@ -151,7 +149,7 @@ def test_frozen_dict_order_comparison(): fd3 = FrozenDict.from_dict({3: 4, 1: 2}) fd4 = FrozenDict.from_dict({1: 2, 3: 5}) - assert fd1 == fd1 # pylint: disable=comparison-with-itself + assert fd1 == fd1 assert fd1 == fd2 assert fd1 != fd3 assert fd1 != fd4 diff --git a/test/unit/test_session.py b/test/unit/test_session.py index 393443d7..8f6e4442 100644 --- a/test/unit/test_session.py +++ b/test/unit/test_session.py @@ -3,8 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright Tumult Labs 2025 -# pylint: disable=protected-access - import re from typing import Any, Dict, List, Tuple, Type, Union from unittest.mock import ANY, Mock, patch @@ -92,7 +90,6 @@ # Disable redefined-outer-name because spark is used to create dataframes as test # inputs and within tests to check outputs and run queries. -# pylint: disable=redefined-outer-name def _privacy_budget_to_exact_number( @@ -452,7 +449,6 @@ def test_from_neighboring_relation_single( """Tests that :func:`Session._from_neighboring_relation` works as expected with a single relation. """ - sess = Session._from_neighboring_relation( privacy_budget=budget, private_sources={"private": self.sdf}, @@ -493,7 +489,6 @@ def test_from_neighboring_relation_add_remove_keys( """Tests that :func:`Session._from_neighboring_relation` works as expected with a single AddRemoveKeys relation. """ - sess = Session._from_neighboring_relation( privacy_budget=budget, private_sources={"private": self.sdf}, @@ -584,7 +579,8 @@ def test_add_public_dataframe(self): @pytest.mark.parametrize("d_in", [(sp.Integer(1)), (sp.sqrt(sp.Integer(2)))]) def test_evaluate_puredp_session_approxdp_query(self, spark, d_in): """Confirm that using an approxdp query on a puredp accountant raises an - error.""" + error. 
+ """ with patch.object( QueryExprCompiler, "__call__", autospec=True ) as mock_compiler, patch( @@ -1165,7 +1161,7 @@ def test_describe(self, spark): ], ) ) - # pylint: enable=line-too-long + session.describe() mock_print.assert_called_with(expected) @@ -1237,13 +1233,12 @@ def test_describe_with_constraints(self, spark): ], ) ) + session.describe() - # pylint: enable=line-too-long mock_print.assert_called_with(expected) def test_describe_with_id_column(self, spark): """Test :func:`_describe` with a table with an ID column.""" - with patch("builtins.print") as mock_print, patch( "tmlt.core.measurements.interactive_measurements.PrivacyAccountant" ) as mock_accountant: @@ -1311,7 +1306,7 @@ def test_describe_with_id_column(self, spark): ], ) ) - # pylint: enable=line-too-long + session.describe() mock_print.assert_called_with(expected) @@ -1507,8 +1502,8 @@ def test_describe_table_with_constraints( + """\n\tConstraints:\n""" + expected_output ) + session.describe("private") - # pylint: enable=line-too-long mock_print.assert_called_with(expected) def test_supported_spark_types(self, spark): @@ -1652,7 +1647,7 @@ def _setup_accountant(self, mock_accountant) -> None: mock_accountant.d_in = {NamedTable("private"): sp.Integer(1)} def test_invalid_dataframe_initialization(self): - """session raises error on invalid dataframe type""" + """Session raises error on invalid dataframe type""" with patch( "tmlt.core.measurements.interactive_measurements.PrivacyAccountant" ) as mock_accountant: @@ -1678,7 +1673,7 @@ def test_invalid_dataframe_initialization(self): session.add_public_dataframe(source_id="public", dataframe=self.pdf) def test_invalid_data_properties(self, spark): - """session raises error on invalid data properties""" + """Session raises error on invalid data properties""" with patch( "tmlt.core.measurements.interactive_measurements.PrivacyAccountant" ) as mock_accountant: @@ -1814,7 +1809,7 @@ def test_invalid_key_column(self) -> None: def test_invalid_source_id( self, source_id: str, exception_type: Type[Exception], expected_error_msg: str ): - """session raises error on invalid source_id.""" + """Session raises error on invalid source_id.""" with patch( "tmlt.core.measurements.interactive_measurements.PrivacyAccountant" ) as mock_accountant: @@ -1873,7 +1868,7 @@ def test_invalid_public_source(self): "query_expr", [(["filter private A == 0"]), ([QueryBuilder("private")])] ) def test_invalid_queries_evaluate(self, query_expr: Any): - """evaluate raises error on invalid queries.""" + """Evaluate raises error on invalid queries.""" with patch( "tmlt.core.measurements.interactive_measurements.PrivacyAccountant" ) as mock_accountant: @@ -1903,7 +1898,7 @@ def test_invalid_queries_create( exception_type: Type[Exception], expected_error_msg: str, ): - """create functions raise error on invalid input queries.""" + """Create functions raise error on invalid input queries.""" with patch( "tmlt.core.measurements.interactive_measurements.PrivacyAccountant" ) as mock_accountant: @@ -2137,7 +2132,8 @@ def test_duplicate_source_id(self): def test_build_invalid_identifier(self): """Tests that build fails if protected change does - not have associated ID space.""" + not have associated ID space. 
+ """ builder = ( Session.Builder() .with_private_dataframe( @@ -2243,7 +2239,7 @@ def test_build_with_id_and_only_one_df(self) -> None: @pytest.mark.parametrize( "builder,expected_sympy_budget,expected_output_measure," - + "private_dataframes,public_dataframes", + "private_dataframes,public_dataframes", [ ( Session.Builder().with_privacy_budget(PureDPBudget(10)), @@ -2445,7 +2441,6 @@ def test_automatic_partitions( protected_change: ProtectedChange, ): """Tests that partition selection is automatically called with correct queries.""" - # Turning on experimental features for this test. with config.features.auto_partition_selection.enabled(): spark = SparkSession.builder.getOrCreate() @@ -2558,7 +2553,6 @@ def test_automatic_partition_selection_invalid_budget( expected_error: str, ): """Test that Automatic Partition Selection queries with an invalid budget error.""" - with config.features.auto_partition_selection.enabled(): spark = SparkSession.builder.getOrCreate() test_df = spark.createDataFrame(input_data) @@ -2593,8 +2587,8 @@ def test_automatic_partition_selection_invalid_budget( ) def test_automatic_partition_null_keyset(query_expr: Query, expected_columns: List): """Tests that automatic partition selection with null keyset raises a warning and - completes with an output dataframe with len(0) but the correct schema.""" - + completes with an output dataframe with len(0) but the correct schema. + """ with config.features.auto_partition_selection.enabled(): spark = SparkSession.builder.getOrCreate() # An empty DF ensures that automatic partition selection returns a null keyset. diff --git a/test/unit/test_table_identifiers.py b/test/unit/test_table_identifiers.py index 72396170..84a24464 100644 --- a/test/unit/test_table_identifiers.py +++ b/test/unit/test_table_identifiers.py @@ -20,7 +20,7 @@ def test_table_equality(): assert TableCollection(name="private1") != TableCollection(name="private2") temp_table = TemporaryTable() - assert temp_table == temp_table # pylint: disable=comparison-with-itself + assert temp_table == temp_table assert temp_table != TemporaryTable() diff --git a/test/unit/test_utils.py b/test/unit/test_utils.py index be4b0f01..baa0785e 100644 --- a/test/unit/test_utils.py +++ b/test/unit/test_utils.py @@ -8,6 +8,6 @@ ### Test for tmlt.analytics.utils.check_installation() # We want the `spark` argument here so that the test will use the # (session-wide, pytest-provided) spark session. 
-def test_check_installation(spark) -> None:  # pylint: disable=unused-argument
+def test_check_installation(spark) -> None:
     """Test that check_installation works (doesn't raise an error)."""
     check_installation()
diff --git a/uv.lock b/uv.lock
index 2180c3e0..99176380 100644
--- a/uv.lock
+++ b/uv.lock
@@ -379,15 +379,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/99/c7/d1ec24fb280caa5a79b6b950db565dab30210a66259d17d5bb2b3a9f878d/dependency_groups-1.3.1-py3-none-any.whl", hash = "sha256:51aeaa0dfad72430fcfb7bcdbefbd75f3792e5919563077f30bc0d73f4493030", size = 8664, upload-time = "2025-05-02T00:34:27.085Z" },
 ]
 
-[[package]]
-name = "dill"
-version = "0.4.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/12/80/630b4b88364e9a8c8c5797f4602d0f76ef820909ee32f0bacb9f90654042/dill-0.4.0.tar.gz", hash = "sha256:0633f1d2df477324f53a895b02c901fb961bdbf65a17122586ea7019292cbcf0", size = 186976, upload-time = "2025-04-16T00:41:48.867Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/50/3d/9373ad9c56321fdab5b41197068e1d8c25883b3fea29dd361f9b55116869/dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049", size = 119668, upload-time = "2025-04-16T00:41:47.671Z" },
-]
-
 [[package]]
 name = "distlib"
 version = "0.3.9"
@@ -739,15 +730,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6a/b9/59e120d24a2ec5fc2d30646adb2efb4621aab3c6d83d66fb2a7a182db032/matplotlib-3.10.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb73d8aa75a237457988f9765e4dfe1c0d2453c5ca4eabc897d4309672c8e014", size = 8594298, upload-time = "2025-05-08T19:10:51.738Z" },
 ]
 
-[[package]]
-name = "mccabe"
-version = "0.7.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e7/ff/0ffefdcac38932a54d2b5eed4e0ba8a408f215002cd178ad1df0f2806ff8/mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325", size = 9658, upload-time = "2022-01-24T01:14:51.113Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/27/1a/1f68f9ba0c207934b35b86a8ca3aad8395a3d6dd7921c0686e23853ff5a9/mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e", size = 7350, upload-time = "2022-01-24T01:14:49.62Z" },
-]
-
 [[package]]
 name = "mdurl"
 version = "0.1.2"
@@ -1308,23 +1290,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e2/0d/8ba33fa83a7dcde13eb3c1c2a0c1cc29950a048bfed6d9b0d8b6bd710b4c/pydata_sphinx_theme-0.16.1-py3-none-any.whl", hash = "sha256:225331e8ac4b32682c18fcac5a57a6f717c4e632cea5dd0e247b55155faeccde", size = 6723264, upload-time = "2024-12-17T10:53:35.645Z" },
 ]
 
-[[package]]
-name = "pydocstyle"
-version = "6.3.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "snowballstemmer" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/e9/5c/d5385ca59fd065e3c6a5fe19f9bc9d5ea7f2509fa8c9c22fb6b2031dd953/pydocstyle-6.3.0.tar.gz", hash = "sha256:7ce43f0c0ac87b07494eb9c0b462c0b73e6ff276807f204d6b53edc72b7e44e1", size = 36796, upload-time = "2023-01-17T20:29:19.838Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/36/ea/99ddefac41971acad68f14114f38261c1f27dac0b3ec529824ebc739bdaa/pydocstyle-6.3.0-py3-none-any.whl", hash = "sha256:118762d452a49d6b05e194ef344a55822987a462831ade91ec5c06fd2169d019", size = 38038, upload-time = "2023-01-17T20:29:18.094Z" },
-]
-
-[package.optional-dependencies]
-toml = [
-    { name = "tomli", marker = "python_full_version < '3.11'" },
-]
-
 [[package]]
 name = "pygments"
 version = "2.19.2"
@@ -1334,25 +1299,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" },
 ]
 
-[[package]]
-name = "pylint"
-version = "3.3.7"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "astroid" },
-    { name = "colorama", marker = "sys_platform == 'win32'" },
-    { name = "dill" },
-    { name = "isort" },
-    { name = "mccabe" },
-    { name = "platformdirs" },
-    { name = "tomli", marker = "python_full_version < '3.11'" },
-    { name = "tomlkit" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/1c/e4/83e487d3ddd64ab27749b66137b26dc0c5b5c161be680e6beffdc99070b3/pylint-3.3.7.tar.gz", hash = "sha256:2b11de8bde49f9c5059452e0c310c079c746a0a8eeaa789e5aa966ecc23e4559", size = 1520709, upload-time = "2025-05-04T17:07:51.089Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/e8/83/bff755d09e31b5d25cc7fdc4bf3915d1a404e181f1abf0359af376845c24/pylint-3.3.7-py3-none-any.whl", hash = "sha256:43860aafefce92fca4cf6b61fe199cdc5ae54ea28f9bf4cd49de267b5195803d", size = 522565, upload-time = "2025-05-04T17:07:48.714Z" },
-]
-
 [[package]]
 name = "pyparsing"
 version = "3.2.3"
@@ -1575,6 +1521,32 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/0d/9b/63f4c7ebc259242c89b3acafdb37b41d1185c07ff0011164674e9076b491/rich-14.0.0-py3-none-any.whl", hash = "sha256:1c9491e1951aac09caffd42f448ee3d04e58923ffe14993f6e83068dc395d7e0", size = 243229, upload-time = "2025-03-30T14:15:12.283Z" },
 ]
 
+[[package]]
+name = "ruff"
+version = "0.14.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/df/55/cccfca45157a2031dcbb5a462a67f7cf27f8b37d4b3b1cd7438f0f5c1df6/ruff-0.14.4.tar.gz", hash = "sha256:f459a49fe1085a749f15414ca76f61595f1a2cc8778ed7c279b6ca2e1fd19df3", size = 5587844, upload-time = "2025-11-06T22:07:45.033Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/17/b9/67240254166ae1eaa38dec32265e9153ac53645a6c6670ed36ad00722af8/ruff-0.14.4-py3-none-linux_armv6l.whl", hash = "sha256:e6604613ffbcf2297cd5dcba0e0ac9bd0c11dc026442dfbb614504e87c349518", size = 12606781, upload-time = "2025-11-06T22:07:01.841Z" },
+    { url = "https://files.pythonhosted.org/packages/46/c8/09b3ab245d8652eafe5256ab59718641429f68681ee713ff06c5c549f156/ruff-0.14.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d99c0b52b6f0598acede45ee78288e5e9b4409d1ce7f661f0fa36d4cbeadf9a4", size = 12946765, upload-time = "2025-11-06T22:07:05.858Z" },
+    { url = "https://files.pythonhosted.org/packages/14/bb/1564b000219144bf5eed2359edc94c3590dd49d510751dad26202c18a17d/ruff-0.14.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:9358d490ec030f1b51d048a7fd6ead418ed0826daf6149e95e30aa67c168af33", size = 11928120, upload-time = "2025-11-06T22:07:08.023Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/92/d5f1770e9988cc0742fefaa351e840d9aef04ec24ae1be36f333f96d5704/ruff-0.14.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81b40d27924f1f02dfa827b9c0712a13c0e4b108421665322218fc38caf615c2", size = 12370877, upload-time = "2025-11-06T22:07:10.015Z" },
"https://files.pythonhosted.org/packages/e2/29/e9282efa55f1973d109faf839a63235575519c8ad278cc87a182a366810e/ruff-0.14.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f5e649052a294fe00818650712083cddc6cc02744afaf37202c65df9ea52efa5", size = 12408538, upload-time = "2025-11-06T22:07:13.085Z" }, + { url = "https://files.pythonhosted.org/packages/8e/01/930ed6ecfce130144b32d77d8d69f5c610e6d23e6857927150adf5d7379a/ruff-0.14.4-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa082a8f878deeba955531f975881828fd6afd90dfa757c2b0808aadb437136e", size = 13141942, upload-time = "2025-11-06T22:07:15.386Z" }, + { url = "https://files.pythonhosted.org/packages/6a/46/a9c89b42b231a9f487233f17a89cbef9d5acd538d9488687a02ad288fa6b/ruff-0.14.4-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:1043c6811c2419e39011890f14d0a30470f19d47d197c4858b2787dfa698f6c8", size = 14544306, upload-time = "2025-11-06T22:07:17.631Z" }, + { url = "https://files.pythonhosted.org/packages/78/96/9c6cf86491f2a6d52758b830b89b78c2ae61e8ca66b86bf5a20af73d20e6/ruff-0.14.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a9f3a936ac27fb7c2a93e4f4b943a662775879ac579a433291a6f69428722649", size = 14210427, upload-time = "2025-11-06T22:07:19.832Z" }, + { url = "https://files.pythonhosted.org/packages/71/f4/0666fe7769a54f63e66404e8ff698de1dcde733e12e2fd1c9c6efb689cb5/ruff-0.14.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:95643ffd209ce78bc113266b88fba3d39e0461f0cbc8b55fb92505030fb4a850", size = 13658488, upload-time = "2025-11-06T22:07:22.32Z" }, + { url = "https://files.pythonhosted.org/packages/ee/79/6ad4dda2cfd55e41ac9ed6d73ef9ab9475b1eef69f3a85957210c74ba12c/ruff-0.14.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:456daa2fa1021bc86ca857f43fe29d5d8b3f0e55e9f90c58c317c1dcc2afc7b5", size = 13354908, upload-time = "2025-11-06T22:07:24.347Z" }, + { url = "https://files.pythonhosted.org/packages/b5/60/f0b6990f740bb15c1588601d19d21bcc1bd5de4330a07222041678a8e04f/ruff-0.14.4-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:f911bba769e4a9f51af6e70037bb72b70b45a16db5ce73e1f72aefe6f6d62132", size = 13587803, upload-time = "2025-11-06T22:07:26.327Z" }, + { url = "https://files.pythonhosted.org/packages/c9/da/eaaada586f80068728338e0ef7f29ab3e4a08a692f92eb901a4f06bbff24/ruff-0.14.4-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:76158a7369b3979fa878612c623a7e5430c18b2fd1c73b214945c2d06337db67", size = 12279654, upload-time = "2025-11-06T22:07:28.46Z" }, + { url = "https://files.pythonhosted.org/packages/66/d4/b1d0e82cf9bf8aed10a6d45be47b3f402730aa2c438164424783ac88c0ed/ruff-0.14.4-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:f3b8f3b442d2b14c246e7aeca2e75915159e06a3540e2f4bed9f50d062d24469", size = 12357520, upload-time = "2025-11-06T22:07:31.468Z" }, + { url = "https://files.pythonhosted.org/packages/04/f4/53e2b42cc82804617e5c7950b7079d79996c27e99c4652131c6a1100657f/ruff-0.14.4-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c62da9a06779deecf4d17ed04939ae8b31b517643b26370c3be1d26f3ef7dbde", size = 12719431, upload-time = "2025-11-06T22:07:33.831Z" }, + { url = "https://files.pythonhosted.org/packages/a2/94/80e3d74ed9a72d64e94a7b7706b1c1ebaa315ef2076fd33581f6a1cd2f95/ruff-0.14.4-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5a443a83a1506c684e98acb8cb55abaf3ef725078be40237463dae4463366349", size = 13464394, upload-time = "2025-11-06T22:07:35.905Z" }, + { url = 
"https://files.pythonhosted.org/packages/54/1a/a49f071f04c42345c793d22f6cf5e0920095e286119ee53a64a3a3004825/ruff-0.14.4-py3-none-win32.whl", hash = "sha256:643b69cb63cd996f1fc7229da726d07ac307eae442dd8974dbc7cf22c1e18fff", size = 12493429, upload-time = "2025-11-06T22:07:38.43Z" }, + { url = "https://files.pythonhosted.org/packages/bc/22/e58c43e641145a2b670328fb98bc384e20679b5774258b1e540207580266/ruff-0.14.4-py3-none-win_amd64.whl", hash = "sha256:26673da283b96fe35fa0c939bf8411abec47111644aa9f7cfbd3c573fb125d2c", size = 13635380, upload-time = "2025-11-06T22:07:40.496Z" }, + { url = "https://files.pythonhosted.org/packages/30/bd/4168a751ddbbf43e86544b4de8b5c3b7be8d7167a2a5cb977d274e04f0a1/ruff-0.14.4-py3-none-win_arm64.whl", hash = "sha256:dd09c292479596b0e6fec8cd95c65c3a6dc68e9ad17b8f2382130f87ff6a75bb", size = 12663065, upload-time = "2025-11-06T22:07:42.603Z" }, +] + [[package]] name = "scipy" version = "1.15.3" @@ -1975,11 +1947,8 @@ isort = [ mypy = [ { name = "mypy" }, ] -pydocstyle = [ - { name = "pydocstyle", extra = ["toml"] }, -] -pylint = [ - { name = "pylint" }, +ruff = [ + { name = "ruff" }, ] scripting = [ { name = "nox" }, @@ -2031,8 +2000,7 @@ docs-examples = [ ] isort = [{ name = "isort", specifier = ">=5.11,<6" }] mypy = [{ name = "mypy", specifier = ">=1.14.0" }] -pydocstyle = [{ name = "pydocstyle", extras = ["toml"], specifier = ">=6.3" }] -pylint = [{ name = "pylint", specifier = ">=3.2.5" }] +ruff = [{ name = "ruff", specifier = ">=0.14.3,<1" }] scripting = [ { name = "nox", specifier = ">=2024.3.2" }, { name = "tmlt-nox-utils", git = "https://github.com/opendp/tumult-tools.git?subdirectory=nox-utils" }, @@ -2078,8 +2046,8 @@ wheels = [ [[package]] name = "tmlt-nox-utils" -version = "0.0.0.post23+df6aa1c0" -source = { git = "https://github.com/opendp/tumult-tools.git?subdirectory=nox-utils#df6aa1c070f189fc6aad44fdde812781951300a9" } +version = "0.0.0.post30+8504968" +source = { git = "https://github.com/opendp/tumult-tools.git?subdirectory=nox-utils#0850496832ff017c00046a8bb8ae331945812879" } dependencies = [ { name = "gitpython" }, { name = "nox" },