-
Notifications
You must be signed in to change notification settings - Fork 3
Closed
Description
sdf = spark.createDataFrame([(1.,), (float('inf'),), (float('nan'),)], ["col"])
sess = Session.from_dataframe(PureDPBudget(1), "data", sdf, AddOneRow())
sess.create_view(
QueryBuilder("data").drop_null_and_nan(["col"])
.join_private(
QueryBuilder("data").drop_infinity(["col"]),
TruncationStrategy.DropNonUnique(),
TruncationStrategy.DropNonUnique(),
),
"view",
cache=False,
)

Fails with:
---------------------------------------------------------------------------
AnalyticsInternalError Traceback (most recent call last)
Cell In[11], line 1
----> 1 sess.create_view(
2 QueryBuilder("data").drop_null_and_nan(["col"])
3 .join_private(
4 QueryBuilder("data").drop_infinity(["col"]),
5 TruncationStrategy.DropNonUnique(),
6 TruncationStrategy.DropNonUnique(),
7 ),
8 "view",
9 cache=False,
10 )
File ~/Documents/Prog/Python/tumult-analytics/src/tmlt/analytics/session.py:1245, in create_view(self, query_expr, source_id, cache)
1241 raise ValueError(f"Table '{source_id}' already exists.")
1243 query = query_expr._query_expr # pylint: disable=protected-access
-> 1245 transformation, ref, constraints = QueryExprCompiler(
1246 self._output_measure
1247 ).build_transformation(
1248 query=query,
1249 input_domain=self._input_domain,
1250 input_metric=self._input_metric,
1251 public_sources=self._public_sources,
1252 catalog=self._catalog,
1253 table_constraints=self._table_constraints,
1254 )
1255 if cache:
1256 transformation, ref = persist_table(
1257 base_transformation=transformation, base_ref=ref
1258 )
File ~/Documents/Prog/Python/tumult-analytics/src/tmlt/analytics/_query_expr_compiler/_compiler.py:236, in QueryExprCompiler.build_transformation(self, query, input_domain, input_metric, public_sources, catalog, table_constraints)
232 if not isinstance(transformation, Transformation):
233 raise AnalyticsInternalError(
234 "Unable to create transformation for this query."
235 )
--> 236 transformation_visitor.validate_transformation(
237 query, transformation, reference, catalog
238 )
239 return transformation, reference, constraints
File ~/Documents/Prog/Python/tumult-analytics/src/tmlt/analytics/_query_expr_compiler/_base_transformation_visitor.py:274, in BaseTransformationVisitor.validate_transformation(self, query, transformation, reference, catalog)
268 expected_output_metric = SymmetricDifference()
270 if (
271 lookup_domain(transformation.output_domain, reference)
272 != expected_output_domain
273 ):
--> 274 raise AnalyticsInternalError(
275 f"Expected output domain {expected_output_domain}, "
276 f"but got {lookup_domain(transformation.output_domain, reference)}."
277 )
278 if (
279 lookup_metric(transformation.output_metric, reference)
280 != expected_output_metric
281 ):
282 raise AnalyticsInternalError(
283 f"Expected output metric {expected_output_metric}, "
284 f"but got {lookup_metric(transformation.output_metric, reference)}."
285 )
AnalyticsInternalError: Expected output domain SparkDataFrameDomain(schema={'col': SparkFloatColumnDescriptor(allow_nan=False, allow_inf=True, allow_null=False, size=64)}), but got SparkDataFrameDomain(schema={'col': SparkFloatColumnDescriptor(allow_nan=False, allow_inf=False, allow_null=False, size=64)}).
This is probably a bug! Please let us know about it at:
https://github.com/opendp/tumult-analytics/issues/new
The join should work, and the joined column should be free of nulls, NaN values, and infinities.
Alternatively, maybe nobody should ever join on a floating-point column, and we should just forbid it?
Metadata
Metadata
Assignees
Labels
No labels