2 changes: 1 addition & 1 deletion .github/workflows/merge_queue.yml
@@ -37,7 +37,7 @@ jobs:
name: dist
path: dist
- run: uv lock --check
- run: uv run --only-group scripting nox -t lint
- run: uv run --only-group scripting nox -t lint -- --check
Test:
runs-on: ubuntu-latest
needs: Package
2 changes: 0 additions & 2 deletions benchmark/benchmarking_utils.py
@@ -3,8 +3,6 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright Tumult Labs 2025

# pylint: disable=attribute-defined-outside-init

import time
import pandas as pd
from pathlib import Path
12 changes: 6 additions & 6 deletions benchmark/count_sum.py
@@ -113,7 +113,7 @@ def main():

# Single Groupby Column of varying domain sizes (1 row/group)
for domain_size in [100, 400, 10000, 40000, 160000, 640000]:
df = spark.createDataFrame( # pylint: disable=no-member
df = spark.createDataFrame(
spark.sparkContext.parallelize(
[(i, randint(0, 1)) for i in range(domain_size)]
),
@@ -140,7 +140,7 @@ def main():

# Single groupby column, group size = 1M
for size in [100000, 900000, 10000000]:
df = spark.createDataFrame( # pylint: disable=no-member
df = spark.createDataFrame(
spark.sparkContext.parallelize(
[
(i, randint(0, 1))
@@ -171,7 +171,7 @@ def main():

# Group size = 10K
for size in [10000, 100000, 1000000, 10000000]:
df = spark.createDataFrame( # pylint: disable=no-member
df = spark.createDataFrame(
spark.sparkContext.parallelize(
[
(i, randint(0, 1))
@@ -202,7 +202,7 @@ def main():

# Group size = 100
for size in [10000, 40000, 160000, 640000, 2560000]:
df = spark.createDataFrame( # pylint: disable=no-member
df = spark.createDataFrame(
spark.sparkContext.parallelize(
[(i, randint(0, 1)) for j in range(100) for i in range(int(size / 100))]
),
@@ -244,7 +244,7 @@ def main():
for i in range(num_cols)
]
)
sdf = spark.createDataFrame( # pylint: disable=no-member
sdf = spark.createDataFrame(
spark.sparkContext.parallelize(
np.repeat(
np.transpose(
@@ -296,7 +296,7 @@ def main():
for i in range(num_cols)
]
)
sdf = spark.createDataFrame( # pylint: disable=no-member
sdf = spark.createDataFrame(
spark.sparkContext.parallelize(
np.repeat(
np.transpose(
6 changes: 3 additions & 3 deletions benchmark/noise_mechanism.py
@@ -56,7 +56,7 @@ def main():

for size in [100, 400, 10000, 40000, 160000, 640000]:
df = pd.DataFrame({"count": [0] * size})
sdf = spark.createDataFrame(df) # pylint: disable=no-member
sdf = spark.createDataFrame(df)
running_time = evaluate_runtime(
input_domain=input_domain,
measure_column="count",
@@ -72,7 +72,7 @@ def main():

for size in [100, 400, 10000, 40000, 160000, 640000]:
df = pd.DataFrame({"count": [0] * size})
sdf = spark.createDataFrame(df) # pylint: disable=no-member
sdf = spark.createDataFrame(df)
running_time = evaluate_runtime(
input_domain=input_domain,
measure_column="count",
@@ -90,7 +90,7 @@ def main():

for size in [100, 400, 10000, 40000, 160000, 640000]:
df = pd.DataFrame({"count": [0] * size})
sdf = spark.createDataFrame(df) # pylint: disable=no-member
sdf = spark.createDataFrame(df)
running_time = evaluate_runtime(
input_domain=input_domain,
measure_column="count",
2 changes: 1 addition & 1 deletion benchmark/private_join.py
@@ -189,7 +189,7 @@ def generate_dataframe(
for i in range(group_count)
for _ in range(next(group_size_factory) + randint(-fuzz, fuzz))
]
df = spark.createDataFrame( # pylint: disable=no-member
df = spark.createDataFrame(
spark.sparkContext.parallelize(data), schema=list(dom.schema)
)
return df, dom, len(data)
12 changes: 6 additions & 6 deletions benchmark/public_join.py
@@ -75,7 +75,7 @@ def __call__(
domain_size = 2
rows_in_private = 100
input_domain = SparkDataFrameDomain(schema=self.schema)
private_df = self.spark.createDataFrame( # pylint: disable=no-member
private_df = self.spark.createDataFrame(
pd.DataFrame(
[[1.2, i] for i in range(domain_size)]
* int(rows_in_private / domain_size),
@@ -121,7 +121,7 @@ def __call__(
columns=["B", "C"],
)
for rows in rows_private:
private_df = self.spark.createDataFrame( # pylint: disable=no-member
private_df = self.spark.createDataFrame(
pd.DataFrame(
[[1.2, i] for i in range(domain_size)]
* int(rows / domain_size),
@@ -157,7 +157,7 @@ def __call__(
rows_in_public = 10000
rows_in_private = 100
input_domain = SparkDataFrameDomain(schema=self.schema)
private_df = self.spark.createDataFrame( # pylint: disable=no-member
private_df = self.spark.createDataFrame(
pd.DataFrame(
[[10.0, i] for i in range(domain_size)]
* int(rows_in_private / domain_size),
@@ -205,7 +205,7 @@ def __call__(
)
for cols in columns_private:
schema = {f"Col_{i}": SparkFloatColumnDescriptor() for i in range(cols)}
private_df = self.spark.createDataFrame( # pylint: disable=no-member
private_df = self.spark.createDataFrame(
pd.DataFrame(
[tuple(range(cols))] * rows_in_private, columns=schema.keys()
)
@@ -248,7 +248,7 @@ def __call__(
schema = {
f"Col_{i}": SparkStringColumnDescriptor() for i in range(num_cols)
}
private_df = self.spark.createDataFrame( # pylint: disable=no-member
private_df = self.spark.createDataFrame(
pd.DataFrame(data, columns=columns)
)
private_df = private_df.withColumn("B", lit("B"))
@@ -281,7 +281,7 @@ def __call__(
rows, join_columns = 4000, 1
input_domain = SparkDataFrameDomain(schema=self.schema)
for size in domain_sizes:
private_df = self.spark.createDataFrame( # pylint: disable=no-member
private_df = self.spark.createDataFrame(
pd.DataFrame(
[[10.0, i] for i in range(size)] * int(rows / size),
columns=["A", "B"],
4 changes: 1 addition & 3 deletions benchmark/quantile.py
@@ -86,7 +86,6 @@ def wrap_evaluation_multiple_group_counts(
group_counts: List[int],
benchmark_result: pd.DataFrame,
) -> pd.DataFrame:
# pylint: disable=unused-variable
"""Evaluate quantile runtime over multiple sizes = group_counts. Returns the
resulting benchmarking information as a pandas dataframe.

@@ -119,7 +118,7 @@ def wrap_evaluation_multiple_group_counts(
_ = df.collect() # Help spark warm up.
else:
groupby_domains = {"A": list(range(int(size / group_size)))}
df = spark.createDataFrame( # pylint: disable=no-member
df = spark.createDataFrame(
spark.sparkContext.parallelize(
[
(i, randint(lower, upper))
@@ -154,7 +153,6 @@ def wrap_evaluation_multiple_group_counts(
def benchmark_groupby_quantile(
spark: SparkSession, quantile: float, epsilon: ExactNumberInput
) -> pd.DataFrame:
# pylint: disable=unused-variable
"""Evaluate quantile runtime with various params. Return the resulting
pandas dataframe.

8 changes: 4 additions & 4 deletions benchmark/sparkflatmap.py
@@ -84,7 +84,7 @@ def main():
# Various rows
max_num_row = 10
for size in [100, 10000, 100000]:
sdf = spark.createDataFrame( # pylint: disable=no-member
sdf = spark.createDataFrame(
spark.sparkContext.parallelize(
[(i, choice(["X", "Y"])) for i in range(size)]
),
@@ -146,7 +146,7 @@ def main():
schema = StructType(
[StructField("Col_{}".format(i), IntegerType(), True) for i in range(cols)]
)
sdf = spark.createDataFrame( # pylint: disable=no-member
sdf = spark.createDataFrame(
spark.sparkContext.parallelize([tuple(range(cols))] * rows), schema=schema
)
augment = False
@@ -201,7 +201,7 @@ def main():
schema = StructType(
[StructField("Col_{}".format(i), IntegerType(), True) for i in range(cols)]
)
sdf = spark.createDataFrame( # pylint: disable=no-member
sdf = spark.createDataFrame(
spark.sparkContext.parallelize([tuple(range(cols))] * rows), schema=schema
)
for max_num_rows in [1, 10, 50]:
@@ -269,7 +269,7 @@ def my_map(row):
StructField("B", IntegerType(), True),
]
)
sdf = spark.createDataFrame( # pylint: disable=no-member
sdf = spark.createDataFrame(
spark.sparkContext.parallelize([(i, randint(0, 1)) for i in range(10000)]),
schema=schema,
)
8 changes: 4 additions & 4 deletions benchmark/sparkmap.py
@@ -94,15 +94,15 @@ def main():
schema = StructType(
[StructField("A", IntegerType(), True), StructField("B", IntegerType(), True)]
)
sdf = spark.createDataFrame( # pylint: disable=no-member
sdf = spark.createDataFrame(
spark.sparkContext.parallelize([(i, randint(0, 1)) for i in range(1250000)]),
schema=schema,
)
_ = sdf.collect() # Help spark warm up.

# various rows
for size in [100, 400, 10000, 40000, 160000, 320000]:
sdf = spark.createDataFrame( # pylint: disable=no-member
sdf = spark.createDataFrame(
spark.sparkContext.parallelize([(i, randint(0, 1)) for i in range(size)]),
schema=schema,
)
@@ -161,7 +161,7 @@ def main():
schema = StructType(
[StructField(f"Col_{i}", IntegerType(), True) for i in range(size)]
)
sdf = spark.createDataFrame( # pylint: disable=no-member
sdf = spark.createDataFrame(
spark.sparkContext.parallelize([tuple(range(size))] * 10000), schema=schema
)
augment = False
@@ -237,7 +237,7 @@ def my_map(row):
StructField("B", IntegerType(), True),
]
)
sdf = spark.createDataFrame( # pylint: disable=no-member
sdf = spark.createDataFrame(
spark.sparkContext.parallelize([(i, randint(0, 1)) for i in range(10000)]),
schema=schema,
)
4 changes: 1 addition & 3 deletions doc/conf.py
@@ -1,5 +1,3 @@
# pylint: skip-file

# SPDX-License-Identifier: Apache-2.0
# Copyright Tumult Labs 2025

@@ -173,7 +171,7 @@ def autoapi_prepare_jinja_env(jinja_env):
"numpy": ("https://numpy.org/doc/1.18/", None),
"pandas": ("https://pandas.pydata.org/pandas-docs/version/1.2.0/", None),
"sympy": ("https://docs.sympy.org/latest/", None),
"pyspark": ("https://spark.apache.org/docs/3.5.1/api/python/", None),
"pyspark": ("https://downloads.apache.org/spark/docs/3.5.7/api/python/", None),
}

# The ACM website seems to have some sort of protection that foils the linkchecker.
2 changes: 1 addition & 1 deletion doc/index.rst
@@ -20,7 +20,7 @@ Introduction to Tumult Core

Tumult Core is a collection of composable components for implementing
algorithms to perform differentially private computations. The design of Tumult Core
is based on the design proposed in the `OpenDP White Paper <https://projects.iq.harvard.edu/files/opendifferentialprivacy/files/opendp_white_paper_11may2020.pdf>`__,
is based on the design proposed in the `OpenDP White Paper <https://web.archive.org/web/20250505214711/https://projects.iq.harvard.edu/files/opendp/files/opendp_programming_framework_11may2020_1_01.pdf>`_,
and can automatically verify the privacy properties of algorithms constructed
from Tumult Core components. Tumult Core is scalable, includes a wide variety of components,
and supports multiple privacy definitions.
2 changes: 1 addition & 1 deletion doc/topic-guides/architecture.rst
@@ -24,7 +24,7 @@ Tumult Core Architecture
Tumult Core is a collection of composable components for implementing
algorithms to perform differentially private computations. The design of Tumult Core
is based on the design proposed in the
`OpenDP White Paper <https://projects.iq.harvard.edu/files/opendp/files/opendp_programming_framework_11may2020_1_01.pdf>`_.
`OpenDP White Paper <https://web.archive.org/web/20250505214711/https://projects.iq.harvard.edu/files/opendp/files/opendp_programming_framework_11may2020_1_01.pdf>`_.
On this page, we give an overview of this design. Readers who want more
information should refer to the linked white paper.

9 changes: 3 additions & 6 deletions doc/topic-guides/spark.rst
@@ -42,12 +42,9 @@ database, you should use the following options when creating a Spark session:
.enableHiveSupport()
.getOrCreate()

To see where Hive's warehouse directory is, you can use the
`Hive CLI <https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Cli#LanguageManualCli-HiveInteractiveShellCommands>`_
(or its replacement,
`Beehive <https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Clients#HiveServer2Clients-BeelineHiveCommands>`_)
to view the
`relevant configuration parameter <https://cwiki.apache.org/confluence/display/Hive/AdminManual+Metastore+3.0+Administration#AdminManualMetastore3.0Administration-GeneralConfiguration>`_:
To see where Hive's warehouse directory is, you can use the `Hive CLI <https://hive.apache.org/docs/latest/language/languagemanual-cli/#hive-interactive-shell-commands>`_
(or its replacement, `Beeline <https://hive.apache.org/docs/latest/user/hiveserver2-clients/#beeline-hive-commands>`_)
to view the `relevant configuration parameter <https://hive.apache.org/docs/latest/admin/adminmanual-metastore-3-0-administration/#general-configuration>`_:

.. code-block::

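Not part of this diff, but as a side note on the Hive section above: the warehouse location can also be checked directly from a Hive-enabled PySpark session rather than the Hive CLI or Beeline. A minimal sketch, assuming a session created as shown earlier in that guide (the configuration keys are standard Spark/Hive ones; the exact command in the guide's code block may differ):

```python
# Illustrative only: inspect the warehouse location from PySpark itself.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("warehouse-check")
    .enableHiveSupport()
    .getOrCreate()
)

# Spark's own view of the warehouse directory.
print(spark.conf.get("spark.sql.warehouse.dir"))

# The Hive configuration parameter referenced in the guide, queried via SQL.
spark.sql("SET hive.metastore.warehouse.dir").show(truncate=False)
```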
3 changes: 1 addition & 2 deletions noxfile.py
@@ -178,9 +178,8 @@ def build(session):

sm.black()
sm.isort()
sm.ruff_check()
sm.mypy()
sm.pylint()
sm.pydocstyle()

sm.smoketest()
sm.release_smoketest()
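With this change, linting in `noxfile.py` goes through `sm.ruff_check()` instead of `sm.pylint()` and `sm.pydocstyle()`, and the merge-queue workflow now forwards `-- --check` through nox, presumably so the lint tools only report problems in CI rather than fixing them. The project's actual `SessionManager` wiring is not shown in this diff; the following is a minimal, hypothetical sketch of a plain nox session with equivalent behavior (the session name, tag, and ruff arguments are illustrative, not taken from the repository):

```python
# Hypothetical sketch of a check-aware lint session; not the project's
# actual SessionManager implementation.
import nox


@nox.session(tags=["lint"])
def ruff_check(session: nox.Session) -> None:
    """Run ruff, honoring a `--check` posarg as used by the merge-queue job."""
    session.install("ruff")
    if "--check" in session.posargs:
        # CI mode (`nox -t lint -- --check`): report violations, change nothing.
        session.run("ruff", "check", ".")
        session.run("ruff", "format", "--check", ".")
    else:
        # Local mode (`nox -t lint`): apply autofixes and reformat in place.
        session.run("ruff", "check", "--fix", ".")
        session.run("ruff", "format", ".")
```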