From 0a0f48020db5b5aedff516bd220c268b1ad77da9 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Mon, 6 Dec 2021 23:17:52 -0500 Subject: [PATCH] is keyword fix for integers etc. --- tuplex/codegen/src/BlockGeneratorVisitor.cc | 32 ++++++++++++++++++++- tuplex/test/core/DataFrameOperations.cc | 24 ++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/tuplex/codegen/src/BlockGeneratorVisitor.cc b/tuplex/codegen/src/BlockGeneratorVisitor.cc index 1c6a7fa5a..cbcffab6d 100644 --- a/tuplex/codegen/src/BlockGeneratorVisitor.cc +++ b/tuplex/codegen/src/BlockGeneratorVisitor.cc @@ -961,12 +961,42 @@ namespace tuplex { assert(R); if(tt == TokenType::IS || tt == TokenType::ISNOT) { + + // special case: left/right is not boolean + // --> Python allows that, it's bad coding style though. + // compile and hint. + if(leftType != python::Type::BOOLEAN || rightType != python::Type::BOOLEAN) { + _logger.warn("SyntaxWarning: UDF contains is comparison between " + leftType.desc() + " and " + + rightType.desc() + ". Better avoid, is should be only used to test against booleans or None."); + + // though upcast may be defined, for is this will be ignored. + + // only for integers there's actual code. Else, assume always false due to memory + // address issue + if(leftType == rightType && leftType == python::Type::I64) { + _logger.warn("SyntaxWarning: Emitting code for integer is comparison, i.e. for integers in range [-5, 256] is behaves like =="); + + // result is: L == R && -5 <= L <= 256 + assert(L && R); + auto equal = builder.CreateICmpEQ(L, R); + auto upperBound = builder.CreateICmpSLE(L, _env->i64Const(256)); + auto lowerBound = builder.CreateICmpSGE(L, _env->i64Const(-5)); + // could short-circuit here, but & does fine as well... + auto resValue = builder.CreateAnd(equal, builder.CreateAnd(upperBound, lowerBound)); + return _env->upcastToBoolean(builder, resValue); + } else { + return _env->boolConst(false); + } + } + + // rest of the code is for the boolean case assert(leftType == python::Type::BOOLEAN || rightType == python::Type::BOOLEAN); + // one of the types must be boolean, otherwise compareInst with _isnull would've taken care. if((leftType == python::Type::BOOLEAN) != (rightType == python::Type::BOOLEAN)) { // one of the types is boolean, other isn't. comparison results in false. return _env->boolConst(tt == TokenType::ISNOT); - } + } // both must be boolean. auto cmpPredicate = (tt == TokenType::ISNOT) ? llvm::CmpInst::Predicate::ICMP_NE : llvm::CmpInst::Predicate::ICMP_EQ; diff --git a/tuplex/test/core/DataFrameOperations.cc b/tuplex/test/core/DataFrameOperations.cc index 8d018da45..fdd21f55a 100644 --- a/tuplex/test/core/DataFrameOperations.cc +++ b/tuplex/test/core/DataFrameOperations.cc @@ -407,4 +407,28 @@ TEST_F(DataFrameTest, RenameColumns) { // check now fuzzy matching auto& err_ds = ds3.renameColumn("secund", "+1"); EXPECT_TRUE(err_ds.isError()); +} + +TEST_F(DataFrameTest, IsKeywordAndFilter) { + // Following causes a bug (https://github.com/tuplex/tuplex/issues/54), this test is to fix it. + // c = Context() + // c.parallelize([1, 2, 3]).filter(lambda x: x is 2).collect() + + using namespace tuplex; + Context c(microTestOptions()); + + // rename test, position based: + auto& ds = c.parallelize({Row(1), Row(2), Row(3)}).filter(UDF("lambda x: x is 2")); + ASSERT_FALSE(ds.isError()); + + // for integers -5 <= x <= 256 python is weird, is acts like equality! + auto v = ds.collectAsVector(); + ASSERT_EQ(v.size(), 1); + EXPECT_EQ(v.front().getInt(0), 2); + + // also check here floats to be sure the filter doesn't screw things up. + ds = c.parallelize({Row(1.0), Row(2.0), Row(3.0)}).filter(UDF("lambda x: x is 2.0")); + ASSERT_FALSE(ds.isError()); + v = ds.collectAsVector(); + ASSERT_EQ(v.size(), 0); } \ No newline at end of file