From 22f52432de7fbd91b2d31763f832c9bcfc909369 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Thu, 22 Sep 2022 19:20:50 -0400 Subject: [PATCH 01/14] preparing v0.3.4 release --- doc/source/conf.py | 2 +- scripts/set_version.py | 2 +- setup.py | 2 +- tuplex/historyserver/thserver/version.py | 2 +- tuplex/python/setup.py | 2 +- tuplex/python/tuplex/utils/version.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 29e9d36a6..52275332d 100755 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -36,7 +36,7 @@ # The short X.Y version version="0.3" # The full version, including alpha/beta/rc tags -release="0.3.4dev" +release="0.3.4" # -- General configuration --------------------------------------------------- diff --git a/scripts/set_version.py b/scripts/set_version.py index 9ca7eac05..d1b6a08c0 100755 --- a/scripts/set_version.py +++ b/scripts/set_version.py @@ -15,7 +15,7 @@ def LooseVersion(v): # to create a testpypi version use X.Y.devN -version = '0.3.4dev' +version = '0.3.4' # https://pypi.org/simple/tuplex/ # or https://test.pypi.org/simple/tuplex/ diff --git a/setup.py b/setup.py index 82646935e..6f1144b0e 100644 --- a/setup.py +++ b/setup.py @@ -653,7 +653,7 @@ def tplx_package_data(): # logic and declaration, and simpler if you include description/version in a file. 
setup(name="tuplex", python_requires='>=3.7.0', - version="0.3.4dev", + version="0.3.4", author="Leonhard Spiegelberg", author_email="tuplex@cs.brown.edu", description="Tuplex is a novel big data analytics framework incorporating a Python UDF compiler based on LLVM " diff --git a/tuplex/historyserver/thserver/version.py b/tuplex/historyserver/thserver/version.py index be3ddfc75..1f80cd420 100644 --- a/tuplex/historyserver/thserver/version.py +++ b/tuplex/historyserver/thserver/version.py @@ -1,2 +1,2 @@ # (c) L.Spiegelberg 2017 - 2022 -__version__="0.3.4dev" \ No newline at end of file +__version__="0.3.4" \ No newline at end of file diff --git a/tuplex/python/setup.py b/tuplex/python/setup.py index 2d5eeac45..512413d6e 100644 --- a/tuplex/python/setup.py +++ b/tuplex/python/setup.py @@ -29,7 +29,7 @@ setup( name="Tuplex", - version="0.3.4dev", + version="0.3.4", packages=find_packages(), package_data={ # include libs in libexec diff --git a/tuplex/python/tuplex/utils/version.py b/tuplex/python/tuplex/utils/version.py index be3ddfc75..1f80cd420 100644 --- a/tuplex/python/tuplex/utils/version.py +++ b/tuplex/python/tuplex/utils/version.py @@ -1,2 +1,2 @@ # (c) L.Spiegelberg 2017 - 2022 -__version__="0.3.4dev" \ No newline at end of file +__version__="0.3.4" \ No newline at end of file From 2bbdbf50d7f9cb847dba3543a881228f758f5256 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 30 Sep 2022 11:56:34 -0400 Subject: [PATCH 02/14] add helper script for debug python and fix expect_eq to assert_eq within looptest --- scripts/install_debug_python.sh | 30 +++++++ tuplex/test/core/LoopTest.cc | 142 ++++++++++++++++---------------- 2 files changed, 101 insertions(+), 71 deletions(-) create mode 100755 scripts/install_debug_python.sh diff --git a/scripts/install_debug_python.sh b/scripts/install_debug_python.sh new file mode 100755 index 000000000..209ebeb9f --- /dev/null +++ b/scripts/install_debug_python.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +PYTHON_VERSION="3.9.14" 
+URL="https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tar.xz" + +# specify here with what flags to build the version (cf. https://pythonextensionpatterns.readthedocs.io/en/latest/debugging/debug_in_ide.html) +# and https://pythonextensionpatterns.readthedocs.io/en/latest/debugging/debug_python.html +DEBUG_OPTIONS="--with-pydebug --without-pymalloc --with-valgrind" +PREFIX="$HOME/.local/python${PYTHON_VERSION}-dbg" + +[ -d .cache ] && rm -rf .cache +mkdir -p .cache + +# save current working dir +CWD=$PWD + +cd .cache + +# download python +echo ">>> downloading python ${PYTHON_VERSION}" +wget $URL +echo ">>> extracting python" +tar xf Python-${PYTHON_VERSION}.tar.xz +cd Python-${PYTHON_VERSION} + +mkdir -p debug && cd debug && ../configure --prefix=${PREFIX} ${DEBUG_OPTIONS} && make -j$(nproc) && make test + + +cd $CWD + diff --git a/tuplex/test/core/LoopTest.cc b/tuplex/test/core/LoopTest.cc index f14217382..8e3d3a745 100644 --- a/tuplex/test/core/LoopTest.cc +++ b/tuplex/test/core/LoopTest.cc @@ -133,7 +133,7 @@ TEST_F(LoopTest, CodegenTestListDict) { Row(10) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(27)); } @@ -150,7 +150,7 @@ TEST_F(LoopTest, CodegenTestRange) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(10)); } @@ -167,7 +167,7 @@ TEST_F(LoopTest, CodegenTestEmptyString) { Row("should be the same") }).map(UDF(func_empty)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], std::string("should be the same")); } @@ -185,7 +185,7 @@ TEST_F(LoopTest, CodegenTestString) { Row("") }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 2); + ASSERT_EQ(v.size(), 2); EXPECT_EQ(v[0], std::string("test12ab")); EXPECT_EQ(v[1], std::string("12ab")); } @@ -204,7 +204,7 @@ TEST_F(LoopTest, CodegenTestExprIsId) { Row(10) }).map(UDF(func)).collectAsVector(); - 
EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(20)); } @@ -221,7 +221,7 @@ TEST_F(LoopTest, CodegenTestEmptyTuple) { Row(10) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(10)); } @@ -238,7 +238,7 @@ TEST_F(LoopTest, CodegenTestSingleElementTuple) { Row("Num is ") }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], std::string("Num is 100")); } @@ -255,7 +255,7 @@ TEST_F(LoopTest, CodegenTestTupleSameType) { Row(0) }).map(UDF(func1)).collectAsVector(); - EXPECT_EQ(v1.size(), 1); + ASSERT_EQ(v1.size(), 1); EXPECT_EQ(v1[0], Row(180)); auto func2 = "def f(x):\n" @@ -267,7 +267,7 @@ TEST_F(LoopTest, CodegenTestTupleSameType) { Row(0) }).map(UDF(func2)).collectAsVector(); - EXPECT_EQ(v2.size(), 1); + ASSERT_EQ(v2.size(), 1); EXPECT_EQ(v2[0], Row(14)); } @@ -284,7 +284,7 @@ TEST_F(LoopTest, CodegenTestTupleMixedType) { Row(2) }).map(UDF(func1)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(7)); auto func2 = "def f(x):\n" @@ -296,7 +296,7 @@ TEST_F(LoopTest, CodegenTestTupleMixedType) { Row("0") }).map(UDF(func2)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v2[0], std::string("012345.6{}")); } @@ -315,7 +315,7 @@ TEST_F(LoopTest, CodegenTestTupleIsId) { Row("Expression is ") }).map(UDF(func1)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], std::string("Expression is 5 + 100.5 - 1234")); } @@ -336,7 +336,7 @@ TEST_F(LoopTest, CodegenTestNested) { Row(2) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(625579680)); auto func2 = "def f(x):\n" @@ -351,7 +351,7 @@ TEST_F(LoopTest, CodegenTestNested) { Row(200) }).map(UDF(func2)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v2[0], Row(23400)); } @@ -374,7 +374,7 @@ 
TEST_F(LoopTest, CodegenTestNestedMixedType) { Row(2) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(8472)); } @@ -395,7 +395,7 @@ TEST_F(LoopTest, CodegenTestForWithIf) { Row(1) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(11)); } @@ -414,7 +414,7 @@ TEST_F(LoopTest, CodegenTestForElse) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(20)); } @@ -434,7 +434,7 @@ TEST_F(LoopTest, CodegenTestForSimpleContinue) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(11)); } @@ -459,7 +459,7 @@ TEST_F(LoopTest, CodegenTestForNestedContinue) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(94)); } @@ -477,7 +477,7 @@ TEST_F(LoopTest, CodegenTestForSimpleBreak) { Row(0) }).map(UDF(func1)).collectAsVector(); - EXPECT_EQ(v1.size(), 1); + ASSERT_EQ(v1.size(), 1); EXPECT_EQ(v1[0], Row(0)); } @@ -503,7 +503,7 @@ TEST_F(LoopTest, CodegenTestForNestedBreak) { Row(-1000) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(-14364)); } @@ -524,7 +524,7 @@ TEST_F(LoopTest, CodegenTestForBreakElse) { Row(10) }).map(UDF(func1)).collectAsVector(); - EXPECT_EQ(v1.size(), 1); + ASSERT_EQ(v1.size(), 1); EXPECT_EQ(v1[0], Row(25)); auto func2 = "def f(x):\n" @@ -540,7 +540,7 @@ TEST_F(LoopTest, CodegenTestForBreakElse) { Row("string: ") }).map(UDF(func2)).collectAsVector(); - EXPECT_EQ(v2.size(), 1); + ASSERT_EQ(v2.size(), 1); EXPECT_EQ(v2[0], std::string("string: abcd!")); } @@ -559,7 +559,7 @@ TEST_F(LoopTest, CodegenTestWhile) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(10)); } @@ -579,7 +579,7 @@ TEST_F(LoopTest, CodegenTestWhileElse) { 
Row("") }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], std::string("abababababEND")); } @@ -603,7 +603,7 @@ TEST_F(LoopTest, CodegenTestWhileSimpleContinue) { Row(1) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(4452810)); } @@ -632,7 +632,7 @@ TEST_F(LoopTest, CodegenTestWhileNestedContinue) { Row(0.0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(-1131.5)); } @@ -654,7 +654,7 @@ TEST_F(LoopTest, CodegenTestWhileSimpleBreak) { Row(2000) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(840)); } @@ -681,7 +681,7 @@ TEST_F(LoopTest, CodegenTestWhileNestedBreak) { Row(10) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(6088)); } @@ -705,7 +705,7 @@ TEST_F(LoopTest, CodegenTestNestedForWhile) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(2049)); } @@ -724,7 +724,7 @@ TEST_F(LoopTest, CodegenTestForTupleElse) { Row("") }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], std::string("{}val1234tuple.")); } @@ -743,7 +743,7 @@ TEST_F(LoopTest, CodegenTestForTupleBreak) { Row(100) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(127)); } @@ -765,7 +765,7 @@ TEST_F(LoopTest, CodegenTestForTupleNestedBreak) { Row(10) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(249)); } @@ -784,7 +784,7 @@ TEST_F(LoopTest, CodegenTestForTupleContinue) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(14)); } @@ -811,7 +811,7 @@ TEST_F(LoopTest, CodegenTestForTupleNestedContinue) { 
Row("") }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], std::string("a:1245, b:1245, c:1245, d:1245")); } @@ -830,7 +830,7 @@ TEST_F(LoopTest, CodegenTestForEmptyExprWithElse) { Row(0) }).map(UDF(func1)).collectAsVector(); - EXPECT_EQ(v1.size(), 1); + ASSERT_EQ(v1.size(), 1); EXPECT_EQ(v1[0], Row(-1)); auto func2 = "def f(x):\n" @@ -844,7 +844,7 @@ TEST_F(LoopTest, CodegenTestForEmptyExprWithElse) { Row(0) }).map(UDF(func2)).collectAsVector(); - EXPECT_EQ(v2.size(), 1); + ASSERT_EQ(v2.size(), 1); EXPECT_EQ(v2[0], Row(10)); auto func3 = "def f(x):\n" @@ -884,7 +884,7 @@ TEST_F(LoopTest, CodegenTestGeneralI) { Row("0") }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], std::string("01a151a141a131a121a11a[]51a[]41a[]31a[]21a[]1b151b141b131b121b11b[]51b[]41b[" "]31b[]21b[]2a152a142a132a12a112a[]52a[]42a[]32a[]2a[]12a123452a123442a123432a12342a123" "412a.52a.42a.32a.2a.12b152b142b132b12b112b[]52b[]42b[]32b[]2b[]12b123452b123442b123432" @@ -924,7 +924,7 @@ TEST_F(LoopTest, CodegenTestGeneralII) { Row(1000) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], 4898); } @@ -942,7 +942,7 @@ TEST_F(LoopTest, CodegenTestMultiIdTupleI) { Row(10) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(110)); } @@ -959,7 +959,7 @@ TEST_F(LoopTest, CodegenTestMultiIdTupleII) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(21)); } @@ -977,7 +977,7 @@ TEST_F(LoopTest, CodegenTestListofTuple) { Row(10.0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(171.0)); } @@ -994,7 +994,7 @@ TEST_F(LoopTest, CodegenTestMultiIdListI) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], 
Row(100)); } @@ -1011,7 +1011,7 @@ TEST_F(LoopTest, CodegenTestMultiIdListII) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(21)); auto func2 = "def f(x):\n" @@ -1024,7 +1024,7 @@ TEST_F(LoopTest, CodegenTestMultiIdListII) { Row(0) }).map(UDF(func2)).collectAsVector(); - EXPECT_EQ(v2.size(), 1); + ASSERT_EQ(v2.size(), 1); EXPECT_EQ(v2[0], Row(-15)); auto func3 = "def f(x):\n" @@ -1037,7 +1037,7 @@ TEST_F(LoopTest, CodegenTestMultiIdListII) { Row(0) }).map(UDF(func3)).collectAsVector(); - EXPECT_EQ(v3.size(), 1); + ASSERT_EQ(v3.size(), 1); EXPECT_EQ(v3[0], Row(10)); } @@ -1058,7 +1058,7 @@ TEST_F(LoopTest, CodegenTestListAsExprlistI) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(280)); } @@ -1076,7 +1076,7 @@ TEST_F(LoopTest, CodegenTestListAsExprlistII) { Row(-5) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(10)); } @@ -1100,7 +1100,7 @@ TEST_F(LoopTest, CodegenTestLoopInIf) { Row(-8) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 2); + ASSERT_EQ(v.size(), 2); EXPECT_EQ(v[0], Row(20)); EXPECT_EQ(v[1], Row(2)); } @@ -1118,7 +1118,7 @@ TEST_F(LoopTest, CodegenTestExprWithoutParentheses) { Row(10) }).map(UDF(func1)).collectAsVector(); - EXPECT_EQ(v1.size(), 1); + ASSERT_EQ(v1.size(), 1); EXPECT_EQ(v1[0], Row(20)); auto func2 = "def f(x):\n" @@ -1147,7 +1147,7 @@ TEST_F(LoopTest, CodegenTestLoopWithIterIteratorI) { Row(1) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(11)); } @@ -1165,7 +1165,7 @@ TEST_F(LoopTest, CodegenTestLoopWithIterIteratorII) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(1704)); } @@ -1184,7 +1184,7 @@ TEST_F(LoopTest, CodegenTestLoopWithEnumerateIterator) { Row(0) }).map(UDF(func)).collectAsVector(); 
- EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(14, "abcd")); } @@ -1203,7 +1203,7 @@ TEST_F(LoopTest, CodegenTestLoopWithZipIterator) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(50, "ab")); } @@ -1223,7 +1223,7 @@ TEST_F(LoopTest, CodegenTestLoopWithIteratorGeneralI) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(2, 10)); } @@ -1245,7 +1245,7 @@ TEST_F(LoopTest, CodegenTestLoopWithIteratorGeneralII) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(4, Tuple(2, 30))); } @@ -1266,7 +1266,7 @@ TEST_F(LoopTest, CodegenTestLoopWithIteratorGeneralIII) { Row(6000.0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(5000.5, 6000.0)); } @@ -1293,7 +1293,7 @@ TEST_F(LoopTest, CodegenTestLoopWithIteratorGeneralIV) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row("!", "d", Tuple(17, "!"))); } @@ -1321,7 +1321,7 @@ TEST_F(LoopTest, CodegenTestLoopOverInputDataI) { Row(10, List(1, 2, 3, 4)) }).map(UDF(func, "", ce)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_DOUBLE_EQ(v[0].getDouble(0), sqrt(1.25)); } @@ -1339,7 +1339,7 @@ TEST_F(LoopTest, CodegenTestLoopOverInputDataII) { Row(List("a", "bc", "def", "ghij", "k")) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row("abcdefghijk")); } @@ -1360,7 +1360,7 @@ TEST_F(LoopTest, CodegenTestLoopTracingWithContinue) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(4.0)); } @@ -1381,7 +1381,7 @@ TEST_F(LoopTest, CodegenTestLoopTracingWithBreak) { Row(10) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 2); + 
ASSERT_EQ(v.size(), 2); EXPECT_EQ(v[0], Row(10.0)); EXPECT_EQ(v[1], Row(1.0)); } @@ -1403,7 +1403,7 @@ TEST_F(LoopTest, CodegenTestLoopTypeSpeculationGeneralI) { Row(4) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(8.0)); } @@ -1422,7 +1422,7 @@ TEST_F(LoopTest, CodegenTestLoopTypeSpeculationGeneralII) { Row(10) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(20.0)); } @@ -1446,7 +1446,7 @@ TEST_F(LoopTest, CodegenTestLoopTypeSpeculationGeneralIII) { Row(1) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 3); + ASSERT_EQ(v.size(), 3); EXPECT_EQ(v[0], Row(10)); EXPECT_EQ(v[1], Row(10)); EXPECT_EQ(v[2], Row(11.0)); @@ -1472,7 +1472,7 @@ TEST_F(LoopTest, CodegenTestLoopTypeSpeculationGeneralIV) { Row(0) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 3); + ASSERT_EQ(v.size(), 3); EXPECT_EQ(v[0], Row(10)); EXPECT_EQ(v[1], Row(12.0)); EXPECT_EQ(v[2], Row(12.0)); @@ -1499,7 +1499,7 @@ TEST_F(LoopTest, CodegenTestLoopTypeSpeculationGeneralV) { Row(7) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 3); + ASSERT_EQ(v.size(), 3); EXPECT_EQ(v[0], Row(101)); EXPECT_EQ(v[1], Row(4096.0)); EXPECT_EQ(v[2], Row(16384.0)); @@ -1524,7 +1524,7 @@ TEST_F(LoopTest, CodegenTestLoopTypeSpeculationGeneralVI) { Row(3) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 4); + ASSERT_EQ(v.size(), 4); EXPECT_EQ(v[0], Row(6600.0)); EXPECT_EQ(v[1], Row(-49476.0)); EXPECT_EQ(v[2], Row(-109120.0)); @@ -1548,7 +1548,7 @@ TEST_F(LoopTest, CodegenTestLoopTypeSpeculationGeneralVII) { Row(100) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 1); + ASSERT_EQ(v.size(), 1); EXPECT_EQ(v[0], Row(365.0)); } @@ -1573,7 +1573,7 @@ TEST_F(LoopTest, CodegenTestLoopTypeSpeculationGeneralVIII) { Row(30) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 3); + ASSERT_EQ(v.size(), 3); EXPECT_EQ(v[0], Row(2264921595.0)); EXPECT_EQ(v[1], 
Row(8304717290.0)); EXPECT_EQ(v[2], Row(18119387085.0)); @@ -1602,7 +1602,7 @@ TEST_F(LoopTest, CodegenTestLoopTypeSpeculationGeneralIX) { Row(30) }).map(UDF(func)).collectAsVector(); - EXPECT_EQ(v.size(), 3); + ASSERT_EQ(v.size(), 3); EXPECT_EQ(v[0], Row(2068.5)); EXPECT_EQ(v[1], Row(16286.0)); EXPECT_EQ(v[2], Row(54653.5)); From f760e2eccc423f8db3a1113f49e6084a9a68e90c Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 30 Sep 2022 13:20:29 -0400 Subject: [PATCH 03/14] linking util (ptty) for python debug version --- tuplex/core/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tuplex/core/CMakeLists.txt b/tuplex/core/CMakeLists.txt index d14ad349f..da224299a 100755 --- a/tuplex/core/CMakeLists.txt +++ b/tuplex/core/CMakeLists.txt @@ -83,4 +83,5 @@ target_link_libraries(libcore Boost::thread Boost::system Boost::filesystem - ) \ No newline at end of file + util + ) From e3abe08b16bf4177123fcc7bd1d294e2bef7f585 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Fri, 30 Sep 2022 17:00:44 -0400 Subject: [PATCH 04/14] fix refcount in FunctionGlobals test --- 02_Working_with_files.ipynb | 3145 +++++++++++++++++ .../test/adapters/cpython/PythonHelperTest.cc | 17 +- 2 files changed, 3156 insertions(+), 6 deletions(-) create mode 100644 02_Working_with_files.ipynb diff --git a/02_Working_with_files.ipynb b/02_Working_with_files.ipynb new file mode 100644 index 000000000..62cb7b4d6 --- /dev/null +++ b/02_Working_with_files.ipynb @@ -0,0 +1,3145 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "_M1WAAa_N3MO" + }, + "source": [ + "## 2. Working with files\n", + "\n", + "\n", + "\n", + "In the 2nd part of the Tuplex intro series, we'll take a look at how to work with CSV and text files. First, let's install Tuplex again in our notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Hged1I1rUyEf", + "outputId": "92169a2c-6703-4a00-be65-aeef1758831b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python 3.7.13\r\n" + ] + } + ], + "source": [ + "!python3 --version" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JZsjAqGVTi9M", + "outputId": "295bb25c-2bed-4448-c789-3922176fe588" + }, + "outputs": [], + "source": [ + "# # install Colab compatible upgrades to avoid dependency errors\n", + "# !pip install -q folium==0.2.1\n", + "# !pip install -q --upgrade urllib3==1.25.11\n", + "# !pip install flask-socketio flask-pymongo eventlet==0.30.0\n", + "# !pip uninstall jedi -y && pip3 install 'jedi>=0.10'\n", + "\n", + "# # install Tuplex\n", + "# #!pip install -q -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple tuplex==0.3.2rc1\n", + "# #!pip install -q tuplex\n", + "\n", + "# # !pip install -i https://test.pypi.org/simple/ tuplex==0.3.dev20220822143933006789\n", + "# #!pip install -i https://test.pypi.org/simple/ tuplex" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NOWk6l8gWK8n", + "outputId": "67a805ef-5430-4e90-85cb-9473a0dc6bd5" + }, + "outputs": [], + "source": [ + "# downloads temp tuplex file\n", + "#!gdown https://drive.google.com/uc?id=1-TxhNpVg6TW96rNvLWv_2NWUz2tdoLnN" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "IlPT6aJcVB52", + "outputId": "d8ec0e8d-1405-4124-8fcb-6d417520f8e9" + }, + "outputs": [], + "source": [ + "#!pip3 install --force-reinstall /content/tuplex-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl" + ] + 
}, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing ./wheelhouse/tuplex-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl\n", + "Requirement already satisfied: prompt-toolkit in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from tuplex==0.3.3) (3.0.31)\n", + "Collecting flask-pymongo\n", + " Downloading Flask_PyMongo-2.3.0-py2.py3-none-any.whl (12 kB)\n", + "Collecting iso8601\n", + " Using cached iso8601-1.0.2-py3-none-any.whl (9.7 kB)\n", + "Requirement already satisfied: attrs>=19.2.0 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from tuplex==0.3.3) (22.1.0)\n", + "Collecting Flask==2.0.2\n", + " Downloading Flask-2.0.2-py3-none-any.whl (95 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.2/95.2 KB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: jedi in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from tuplex==0.3.3) (0.18.1)\n", + "Collecting pluggy\n", + " Downloading pluggy-1.0.0-py2.py3-none-any.whl (13 kB)\n", + "Requirement already satisfied: six>=1.11.0 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from tuplex==0.3.3) (1.16.0)\n", + "Requirement already satisfied: psutil in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from tuplex==0.3.3) (5.9.2)\n", + "Collecting astor\n", + " Using cached astor-0.8.1-py2.py3-none-any.whl (27 kB)\n", + "Collecting flask-socketio\n", + " Downloading Flask_SocketIO-5.3.1-py3-none-any.whl (17 kB)\n", + "Collecting eventlet==0.30.0\n", + " Downloading eventlet-0.30.0-py2.py3-none-any.whl (224 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m224.1/224.1 
KB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hCollecting gunicorn\n", + " Downloading gunicorn-20.1.0-py3-none-any.whl (79 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.5/79.5 KB\u001b[0m \u001b[31m789.0 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hCollecting dill>=0.2.7.1\n", + " Using cached dill-0.3.5.1-py2.py3-none-any.whl (95 kB)\n", + "Collecting PyYAML>=3.13\n", + " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m596.3/596.3 KB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hCollecting pymongo\n", + " Downloading pymongo-4.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (471 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m471.9/471.9 KB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m:01\u001b[0m\n", + "\u001b[?25hCollecting py>=1.5.2\n", + " Using cached py-1.11.0-py2.py3-none-any.whl (98 kB)\n", + "Requirement already satisfied: pygments>=2.4.1 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from tuplex==0.3.3) (2.13.0)\n", + "Collecting Werkzeug<2.2.0\n", + " Downloading Werkzeug-2.1.2-py3-none-any.whl (224 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m224.9/224.9 KB\u001b[0m \u001b[31m867.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m1m825.4 kB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: wcwidth>=0.1.7 in 
/home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from tuplex==0.3.3) (0.2.5)\n", + "Collecting cloudpickle<2.0.0,>=0.6.1\n", + " Using cached cloudpickle-1.6.0-py3-none-any.whl (23 kB)\n", + "Collecting dnspython<2.0.0,>=1.15.0\n", + " Downloading dnspython-1.16.0-py2.py3-none-any.whl (188 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m188.4/188.4 KB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m[36m0:00:01\u001b[0mm eta \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hCollecting greenlet>=0.3\n", + " Downloading greenlet-1.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (150 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m150.7/150.7 KB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hCollecting click>=7.1.2\n", + " Downloading click-8.1.3-py3-none-any.whl (96 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m96.6/96.6 KB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting itsdangerous>=2.0\n", + " Downloading itsdangerous-2.1.2-py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: Jinja2>=3.0 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from Flask==2.0.2->tuplex==0.3.3) (3.1.2)\n", + "Collecting python-socketio>=5.0.2\n", + " Downloading python_socketio-5.7.1-py3-none-any.whl (56 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.6/56.6 KB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: setuptools>=3.0 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from gunicorn->tuplex==0.3.3) (47.1.0)\n", + "Requirement already satisfied: parso<0.9.0,>=0.8.0 in 
/home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from jedi->tuplex==0.3.3) (0.8.3)\n", + "Requirement already satisfied: importlib-metadata>=0.12 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from pluggy->tuplex==0.3.3) (4.12.0)\n", + "Requirement already satisfied: zipp>=0.5 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from importlib-metadata>=0.12->pluggy->tuplex==0.3.3) (3.8.1)\n", + "Requirement already satisfied: typing-extensions>=3.6.4 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from importlib-metadata>=0.12->pluggy->tuplex==0.3.3) (4.3.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from Jinja2>=3.0->Flask==2.0.2->tuplex==0.3.3) (2.1.1)\n", + "Collecting bidict>=0.21.0\n", + " Downloading bidict-0.22.0-py3-none-any.whl (36 kB)\n", + "Collecting python-engineio>=4.3.0\n", + " Downloading python_engineio-4.3.4-py3-none-any.whl (52 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m52.9/52.9 KB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: Werkzeug, PyYAML, python-engineio, pymongo, py, itsdangerous, iso8601, gunicorn, greenlet, dnspython, dill, cloudpickle, bidict, astor, python-socketio, pluggy, eventlet, click, Flask, flask-socketio, flask-pymongo, tuplex\n", + "Successfully installed Flask-2.0.2 PyYAML-6.0 Werkzeug-2.1.2 astor-0.8.1 bidict-0.22.0 click-8.1.3 cloudpickle-1.6.0 dill-0.3.5.1 dnspython-1.16.0 eventlet-0.30.0 flask-pymongo-2.3.0 flask-socketio-5.3.1 greenlet-1.1.3 gunicorn-20.1.0 iso8601-1.0.2 itsdangerous-2.1.2 pluggy-1.0.0 py-1.11.0 pymongo-4.2.0 python-engineio-4.3.4 python-socketio-5.7.1 tuplex-0.3.3\n", + "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n", + "You should consider upgrading via the 
'/home/leonhard/.pyenv/versions/3.7.13/bin/python3.7 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip3 install wheelhouse/tuplex-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZOobvZcVO2_H" + }, + "source": [ + "### 2.1 Basic IO - Reading CSV files\n", + "To read in a csv file, Tuplex provides an API function `csv`" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IAGEc-okO2im", + "outputId": "c9ce8493-5b2e-443f-98b1-c35258d384d6" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR:root:Failed to start or connect to Tuplex WebUI. Details: MongoDB (mongod) not found on PATH. In order to use Tuplex's WebUI, you need MongoDB installed or point the framework to a running MongoDB instance\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Welcome to\n", + "\n", + " _____ _\n", + " |_ _| _ _ __ | | _____ __\n", + " | || | | | '_ \\| |/ _ \\ \\/ /\n", + " | || |_| | |_) | | __/> <\n", + " |_| \\__,_| .__/|_|\\___/_/\\_\\ 0.3.3\n", + " |_|\n", + " \n", + "using Python 3.7.13 (default, Sep 12 2022, 22:16:36) \n", + "[GCC 11.2.0] on linux\n", + "[2022-09-12 22:44:45.614] [local ee] [info] loaded runtime library from/home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages/tuplex/libexec/tuplex_runtime.cpython-37m-x86_64-linux-gnu.so\n", + "[2022-09-12 22:44:45.614] [local ee] [info] initializing LLVM backend\n", + "[2022-09-12 22:44:45.614] [local ee] [warning] init JIT compiler also only in local mode\n", + "[2022-09-12 22:44:45.615] [LLVM] [info] compiling code for skylake\n", + "[2022-09-12 22:44:45.619] [history server] [warning] could not connect to http://localhost:5000/api/version, if you wish to disable the webui consider setting tuplex.webui=False for the 
context.\n", + "[2022-09-12 22:44:45.619] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", + "[2022-09-12 22:44:45.619] [E/1] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E1 does not exist. Attempting to create it.\n", + "[2022-09-12 22:44:45.620] [E/1] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E1\n", + "[2022-09-12 22:44:45.620] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", + "[2022-09-12 22:44:45.620] [E/2] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E2 does not exist. Attempting to create it.\n", + "[2022-09-12 22:44:45.620] [E/2] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E2\n", + "[2022-09-12 22:44:45.620] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", + "[2022-09-12 22:44:45.621] [E/3] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E3 does not exist. Attempting to create it.\n", + "[2022-09-12 22:44:45.621] [E/3] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E3\n", + "[2022-09-12 22:44:45.621] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", + "[2022-09-12 22:44:45.621] [E/4] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E4 does not exist. Attempting to create it.\n", + "[2022-09-12 22:44:45.621] [E/4] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E4\n", + "[2022-09-12 22:44:45.621] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", + "[2022-09-12 22:44:45.622] [E/5] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E5 does not exist. 
Attempting to create it.\n", + "[2022-09-12 22:44:45.622] [E/5] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E5\n", + "[2022-09-12 22:44:45.622] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", + "[2022-09-12 22:44:45.622] [E/6] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E6 does not exist. Attempting to create it.\n", + "[2022-09-12 22:44:45.622] [E/6] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E6\n", + "[2022-09-12 22:44:45.622] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", + "[2022-09-12 22:44:45.623] [E/7] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E7 does not exist. Attempting to create it.\n", + "[2022-09-12 22:44:45.623] [E/7] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E7\n", + "[2022-09-12 22:44:45.623] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", + "[2022-09-12 22:44:45.623] [E/8] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E8 does not exist. 
Attempting to create it.\n", + "[2022-09-12 22:44:45.623] [E/8] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E8\n", + "[2022-09-12 22:44:45.623] [local execution engine] [info] started local executor E/1 (1.00 GB, 32.00 MB default partition size)\n", + "[2022-09-12 22:44:45.624] [E/1] [info] starting detached process queue\n", + "[2022-09-12 22:44:45.624] [local execution engine] [info] started local executor E/2 (1.00 GB, 32.00 MB default partition size)\n", + "[2022-09-12 22:44:45.624] [E/1] [info] initialized runtime memory (4.00 MB)\n", + "[2022-09-12 22:44:45.624] [E/2] [info] starting detached process queue\n", + "[2022-09-12 22:44:45.624] [local execution engine] [info] started local executor E/3 (1.00 GB, 32.00 MB default partition size)\n", + "[2022-09-12 22:44:45.624] [E/3] [info] starting detached process queue\n", + "[2022-09-12 22:44:45.624] [E/2] [info] initialized runtime memory (4.00 MB)\n", + "[2022-09-12 22:44:45.624] [E/3] [info] initialized runtime memory (4.00 MB)\n", + "[2022-09-12 22:44:45.624] [local execution engine] [info] started local executor E/4 (1.00 GB, 32.00 MB default partition size)\n", + "[2022-09-12 22:44:45.624] [E/4] [info] starting detached process queue\n", + "[2022-09-12 22:44:45.625] [E/4] [info] initialized runtime memory (4.00 MB)\n", + "[2022-09-12 22:44:45.625] [local execution engine] [info] started local executor E/5 (1.00 GB, 32.00 MB default partition size)\n", + "[2022-09-12 22:44:45.625] [E/5] [info] starting detached process queue\n", + "[2022-09-12 22:44:45.625] [E/5] [info] initialized runtime memory (4.00 MB)\n", + "[2022-09-12 22:44:45.625] [local execution engine] [info] started local executor E/6 (1.00 GB, 32.00 MB default partition size)\n", + "[2022-09-12 22:44:45.625] [local execution engine] [info] started local executor E/7 (1.00 GB, 32.00 MB default partition size)\n", + "[2022-09-12 22:44:45.625] [E/7] [info] starting detached process queue\n", + "[2022-09-12 22:44:45.625] [local 
execution engine] [info] started local executor E/8 (1.00 GB, 32.00 MB default partition size)\n", + "[2022-09-12 22:44:45.626] [E/7] [info] initialized runtime memory (4.00 MB)\n", + "[2022-09-12 22:44:45.626] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", + "[2022-09-12 22:44:45.626] [E/6] [info] starting detached process queue\n", + "[2022-09-12 22:44:45.626] [E/6] [info] initialized runtime memory (4.00 MB)\n", + "[2022-09-12 22:44:45.626] [driver] [info] provided cache path file:///tmp/tuplex-cache-leonhard/driver does not exist. Attempting to create it.\n", + "[2022-09-12 22:44:45.626] [driver] [info] created cache directory file:///tmp/tuplex-cache-leonhard/driver\n", + "[2022-09-12 22:44:45.626] [local execution engine] [info] started driver (1.00 GB, 32.00 MB default partition size)\n", + "[2022-09-12 22:44:45.641] [E/8] [info] starting detached process queue\n", + "[2022-09-12 22:44:45.642] [E/8] [info] initialized runtime memory (4.00 MB)\n" + ] + } + ], + "source": [ + "import tuplex\n", + "\n", + "c = tuplex.Context({'tuplex.redirectToPythonLogging':False})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zdb8DfszEzC8" + }, + "source": [ + "Google Colab provides by default some sample data. We can simply load it into Tuplex using the `csv` command." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "MJz8q4Tw9Bsy" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2022-09-12 22:44:56.155] [posix filesystem] [warning] did not find any files for pattern 'sample_data/california_housing_train.csv'\n", + "[2022-09-12 22:44:56.155] [fileinputoperator] [info] found 0 files (0.00 B) to process.\n", + "[2022-09-12 22:44:56.155] [fileinputoperator] [warning] no input files found, can't infer type from given path: sample_data/california_housing_train.csv\n" + ] + } + ], + "source": [ + "ds = c.csv('sample_data/california_housing_train.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GXauZfPs9H2i", + "outputId": "1daf8a8e-9dcb-4660-eebe-bac6047d999e" + }, + "outputs": [], + "source": [ + "ds.show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XdGZgph6E8kz" + }, + "source": [ + "Without any further information, Tuplex automatically deduces types for each column. In order to check what types Tuplex deduced, we can use the `columns` and `types` properties of a Tuplex dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EsosZ2csE71x", + "outputId": "9a7c7cd3-0cd8-45eb-b11d-50afedc0828c" + }, + "outputs": [ + { + "ename": "TypeError", + "evalue": "zip argument #1 must support iteration", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_220616/1148875277.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# print out as nicely formatted dictionary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtypes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m: zip argument #1 must support iteration" + ] + } + ], + "source": [ + "columns = ds.columns\n", + "types = ds.types\n", + "\n", + "# print out as nicely formatted dictionary\n", + "dict(zip(columns, types))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ebt3udXFFctc" + }, + "source": [ + "Sometimes however, it may be desirable to assign specific types to individual columns. 
Luckily, Tuplex provides a mechanism for this as well:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3_kkRMKfFb7W", + "outputId": "305e92f3-4f04-4fe1-e4af-05d52f01dbdf" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2022-09-12 22:45:02.469] [posix filesystem] [warning] did not find any files for pattern 'sample_data/california_housing_train.csv'\n", + "[2022-09-12 22:45:02.469] [fileinputoperator] [info] found 0 files (0.00 B) to process.\n", + "[2022-09-12 22:45:02.469] [fileinputoperator] [warning] no input files found, can't infer type from given path: sample_data/california_housing_train.csv\n" + ] + } + ], + "source": [ + "c.csv('sample_data/california_housing_train.csv', type_hints={'longitude' : float, 'latitude' : str}).show(4)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TZgHfZFYdGTP" + }, + "source": [ + "Let's say we now want to create a file containing only data entries where the `housing_median_age` is larger than `50`:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "Ato7fqLjc5H1" + }, + "outputs": [], + "source": [ + "ds.filter(lambda r: r['housing_median_age'] > 50).tocsv('lt50.csv', num_parts=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PCgh3TkJd-9Y" + }, + "source": [ + "In order to speedup data output, Tuplex by default uses multiple threads to create multiple output parts." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6YNy969JdfBG", + "outputId": "8b0afcdd-d4a6-4ec4-f48c-265c76a6b15c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "head: cannot open 'lt50.part0.csv' for reading: No such file or directory\r\n" + ] + } + ], + "source": [ + "!head lt50.part0.csv" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ETBMyA2GeGVA" + }, + "source": [ + "Besides CSV files, Tuplex also has experimental support to read/write [ORC files](https://orc.apache.org/), which may be a more space-efficient solution depending on the data and workload." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "8W97qlM9eBkL" + }, + "outputs": [], + "source": [ + "ds.toorc('lt50.orc')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cSCTtDPAGwM6" + }, + "source": [ + "Similarly, the orc files can be read using the `orc` command." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ACpfbMz5GS4Q", + "outputId": "e67982b4-14cf-4d5e-e29b-e148dcd404bd" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2022-09-12 22:45:05.208] [posix filesystem] [warning] did not find any files for pattern 'lt50.part0.orc'\n", + "[2022-09-12 22:45:05.208] [fileinputoperator] [info] found 0 files (0.00 B) to process.\n", + "[2022-09-12 22:45:05.208] [fileinputoperator] [warning] no input files found, can't infer type from sample.\n" + ] + } + ], + "source": [ + "c.orc('lt50.part0.orc').show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3sJ05Og2rubO" + }, + "source": [ + "## 2.2 Working with larger files\n", + "Naturally, the benefit of Tuplex's compilation comes into play when working with larger files. 
To demonstrate this, let's assume we want to work with the 311 original data. A subset of this (1GB) can be downloaded via the following command" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GZ5ahr4zryaP", + "outputId": "8f617ae3-3f4d-48b6-c4ab-b03949794e28" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/uc?id=18e2GyoQKLnQ2_uaUcaSOsLRlIT-7tqpN\n", + "To: /home/leonhard/projects/2nd-copy/311_subset.tar.gz\n", + "100%|████████████████████████████████████████| 214M/214M [00:37<00:00, 5.77MB/s]\n" + ] + } + ], + "source": [ + "!gdown https://drive.google.com/uc?id=18e2GyoQKLnQ2_uaUcaSOsLRlIT-7tqpN && tar xf 311_subset.tar.gz" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "psV4--h7w5vK" + }, + "source": [ + "Next, let's create a new context with more memory to process the larger file. You can still reuse the old one albeit at the cost of incurring a lot of disk swapping. Therefore, we delete the old context to free up the space." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "id": "WZQeKTxF3hvH" + }, + "outputs": [], + "source": [ + "del c" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MEiheSMPFQmy", + "outputId": "b7be1922-6dd1-492d-f606-e6e4c52a057a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Description,Resolution Action Updated Date,Community Board,BBL,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Open Data Channel Type,Park Facility Name,Park Borough,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Latitude,Longitude,Location\r\n", + "19937896,03/01/2011 02:27:57 PM,03/14/2011 03:59:20 PM,DOF,Refunds and Adjustments,DOF Property - Payment Issue,Misapplied Payment,Property Address,10027,,,,,,,ADDRESS,NEW YORK,,N/A,Closed,03/22/2011 02:27:57 PM,The Department of Finance resolved this issue.,03/14/2011 03:59:20 PM,09 MANHATTAN,1019820050,MANHATTAN,,,PHONE,Unspecified,MANHATTAN,,,,,,,,,,\r\n", + "19937901,03/01/2011 10:41:13 AM,03/15/2011 04:14:19 PM,DOT,Department of Transportation,Street Sign - Dangling,Street Cleaning - ASP,Street,11232,186 25 STREET,25 STREET,3 AVENUE,4 AVENUE,,,ADDRESS,BROOKLYN,,N/A,Closed,03/15/2011 05:32:23 PM,The Department of Transportation has completed the request or corrected the condition.,03/15/2011 04:14:19 PM,07 BROOKLYN,3006540024,BROOKLYN,984640,180028,PHONE,Unspecified,BROOKLYN,,,,,,,,40.660811976282695,-73.99859430999363,\"(40.660811976282695, -73.99859430999363)\"\r\n", + "19937902,03/01/2011 09:07:45 
AM,03/15/2011 08:26:09 AM,DOT,Department of Transportation,Street Sign - Missing,Other/Unknown,Street,11358,,,,,158 STREET,NORTHERN BOULEVARD,INTERSECTION,FLUSHING,,N/A,Closed,03/15/2011 02:24:33 PM,The Department of Transportation has completed the request or corrected the condition.,03/15/2011 08:26:09 AM,07 QUEENS,,QUEENS,1037621,217498,PHONE,Unspecified,QUEENS,,,,,,,,40.763497105049986,-73.80733639290203,\"(40.763497105049986, -73.80733639290203)\"\r\n", + "19937903,03/01/2011 05:39:26 PM,04/04/2011 11:32:57 AM,DOT,Department of Transportation,Street Sign - Missing,School Crossing,Street,10014,10 SHERIDAN SQUARE,SHERIDAN SQUARE,BARROW STREET,GROVE STREET,,,ADDRESS,NEW YORK,,N/A,Closed,04/01/2011 03:43:12 PM,\"Upon inspection, the reported condition was not found, therefore no action was taken.\",04/04/2011 11:32:57 AM,02 MANHATTAN,1005920040,MANHATTAN,983719,206336,PHONE,Unspecified,MANHATTAN,,,,,,,,40.733021305197404,-74.00191597502526,\"(40.733021305197404, -74.00191597502526)\"\r\n", + "19937904,03/01/2011 11:08:14 AM,03/02/2011 07:55:37 AM,DOT,Department of Transportation,Street Sign - Missing,Stop,Street,10069,,,,,WEST 63 STREET,WEST END AVENUE,INTERSECTION,NEW YORK,,N/A,Closed,03/08/2011 11:08:14 AM,\"The condition has been inspected/investigated, see customer notes for more information.\",03/02/2011 07:55:37 AM,07 MANHATTAN,,MANHATTAN,987400,221308,PHONE,Unspecified,MANHATTAN,,,,,,,,40.77411510013836,-73.98862703263869,\"(40.77411510013836, -73.98862703263869)\"\r\n", + "19937906,03/01/2011 03:16:09 PM,03/02/2011 09:06:30 AM,DOF,Correspondence Unit,DOF Property - Request Copy,Copy of Notice of Property Value,Property Address,11105,,,,,,,ADDRESS,ASTORIA,,N/A,Closed,03/06/2011 03:16:09 PM,The Department of Finance mailed the requested item.,03/02/2011 09:06:31 AM,01 QUEENS,4009650074,QUEENS,,,PHONE,Unspecified,QUEENS,,,,,,,,,,\r\n", + "19937907,03/01/2011 01:22:59 PM,03/02/2011 09:06:28 AM,DOF,Correspondence Unit,DOF Property - Request Copy,Copy of Notice 
of Property Value,Property Address,10469,,,,,,,ADDRESS,BRONX,,N/A,Closed,03/06/2011 01:22:59 PM,The Department of Finance mailed the requested item.,03/02/2011 09:06:28 AM,12 BRONX,2046970142,BRONX,,,PHONE,Unspecified,BRONX,,,,,,,,,,\r\n", + "19937908,03/01/2011 12:01:58 PM,03/02/2011 09:05:26 AM,DOF,Correspondence Unit,DOF Property - Request Copy,Copy of Notice of Property Value,Property Address,10305,,,,,,,ADDRESS,STATEN ISLAND,,N/A,Closed,03/06/2011 12:01:58 PM,The Department of Finance mailed the requested item.,03/02/2011 09:05:26 AM,02 STATEN ISLAND,5032350004,STATEN ISLAND,,,PHONE,Unspecified,STATEN ISLAND,,,,,,,,,,\r\n", + "19937909,03/01/2011 02:35:46 PM,03/02/2011 09:06:31 AM,DOF,Correspondence Unit,DOF Property - Request Copy,Copy of Notice of Property Value,Property Address,11221,,,,,,,ADDRESS,BROOKLYN,,N/A,Closed,03/06/2011 02:35:46 PM,The Department of Finance mailed the requested item.,03/02/2011 09:06:31 AM,04 BROOKLYN,3033660059,BROOKLYN,,,PHONE,Unspecified,BROOKLYN,,,,,,,,,,\r\n" + ] + } + ], + "source": [ + "!head 311_subset.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gO0jOErMxBYq", + "outputId": "d46afe03-86e9-491c-8563-47225ebf9e42" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR:root:Failed to start or connect to Tuplex WebUI. Details: MongoDB (mongod) not found on PATH. 
In order to use Tuplex's WebUI, you need MongoDB installed or point the framework to a running MongoDB instance\n", + "INFO:local ee:loaded runtime library from/home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages/tuplex/libexec/tuplex_runtime.cpython-37m-x86_64-linux-gnu.so\n", + "INFO:local ee:initializing LLVM backend\n", + "WARNING:local ee:init JIT compiler also only in local mode\n", + "INFO:LLVM:compiling code for skylake\n", + "WARNING:history server:could not connect to http://localhost:5000/api/version, if you wish to disable the webui consider setting tuplex.webui=False for the context.\n", + "INFO:memory:allocated bitmap managed memory region (2.00 GB, 32.00 MB block size)\n", + "INFO:local execution engine:started local executor E/1 (2.00 GB, 32.00 MB default partition size)\n" + ] + } + ], + "source": [ + "c = tuplex.Context({'tuplex.redirectToPythonLogging':True, 'tuplex.executorCount':1, 'tuplex.executorMemory':'2G', 'tuplex.driverMemory':'2G'})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QdOhBdtYkyPn" + }, + "source": [ + "Again, we can use Tuplex's autodetection feature to load the file and assign meaningful default types." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3T56PhtlvGaL", + "outputId": "e83fc6ee-f7e4-4321-ea10-3f5f5ce20b7f" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:E/1:starting detached process queue\n", + "INFO:E/1:initialized runtime memory (4.00 MB)\n", + "INFO:fileinputoperator:found 1 file (999.08 MB) to process.\n", + "INFO:global:sampled file:///home/leonhard/projects/2nd-copy/311_subset.csv on 256.00 KB\n" + ] + } + ], + "source": [ + "ds = c.csv('311_subset.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sMIkneXZvZZY", + "outputId": "3f9d4c08-1ef9-40cd-8f6a-eba35e51315f" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Unique Key': typing.Union[int, NoneType],\n", + " 'Created Date': typing.Union[str, NoneType],\n", + " 'Closed Date': typing.Union[str, NoneType],\n", + " 'Agency': typing.Union[str, NoneType],\n", + " 'Agency Name': typing.Union[str, NoneType],\n", + " 'Complaint Type': typing.Union[str, NoneType],\n", + " 'Descriptor': typing.Union[str, NoneType],\n", + " 'Location Type': typing.Union[str, NoneType],\n", + " 'Incident Zip': typing.Union[int, NoneType],\n", + " 'Incident Address': typing.Union[str, NoneType],\n", + " 'Street Name': typing.Union[str, NoneType],\n", + " 'Cross Street 1': typing.Union[str, NoneType],\n", + " 'Cross Street 2': typing.Union[str, NoneType],\n", + " 'Intersection Street 1': typing.Union[str, NoneType],\n", + " 'Intersection Street 2': typing.Union[str, NoneType],\n", + " 'Address Type': typing.Union[str, NoneType],\n", + " 'City': typing.Union[str, NoneType],\n", + " 'Landmark': typing.Union[str, NoneType],\n", + " 'Facility Type': typing.Union[str, NoneType],\n", + " 'Status': typing.Union[str, NoneType],\n", + " 'Due Date': typing.Union[str, NoneType],\n", + " 
'Resolution Description': typing.Union[str, NoneType],\n", + " 'Resolution Action Updated Date': typing.Union[str, NoneType],\n", + " 'Community Board': typing.Union[str, NoneType],\n", + " 'BBL': typing.Union[int, NoneType],\n", + " 'Borough': typing.Union[str, NoneType],\n", + " 'X Coordinate (State Plane)': typing.Union[int, NoneType],\n", + " 'Y Coordinate (State Plane)': typing.Union[int, NoneType],\n", + " 'Open Data Channel Type': typing.Union[str, NoneType],\n", + " 'Park Facility Name': typing.Union[str, NoneType],\n", + " 'Park Borough': typing.Union[str, NoneType],\n", + " 'Vehicle Type': typing.Union[str, NoneType],\n", + " 'Taxi Company Borough': typing.Union[str, NoneType],\n", + " 'Taxi Pick Up Location': typing.Union[str, NoneType],\n", + " 'Bridge Highway Name': typing.Union[str, NoneType],\n", + " 'Bridge Highway Direction': typing.Union[str, NoneType],\n", + " 'Road Ramp': typing.Union[str, NoneType],\n", + " 'Bridge Highway Segment': typing.Union[str, NoneType],\n", + " 'Latitude': typing.Union[float, NoneType],\n", + " 'Longitude': typing.Union[float, NoneType],\n", + " 'Location': typing.Union[str, NoneType]}" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dict(zip(ds.columns, ds.types))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "81tBWIvZtW0x" + }, + "source": [ + "Executing a simple query on the input data creates a logical plan under the hood, which then gets optimized into a physical plan together with auto-generated efficient code that gets lowered ultimately to native code optimized for the machine it is executed on." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KsSHOrfIvhIO", + "outputId": "cb874465-2a30-48c1-aa0f-e92145be361b" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:logical planner:logical optimization took 0.005175ms\n", + "INFO:codegen:generating pipeline for (Option[i64]) -> (Option[i64]) (1 operator pipelined)\n", + "INFO:codegen:generating lambda function for ((Option[i64])) -> Option[i64]\n", + "WARNING:history server:Could not register job, is history server running? To disable this warning, set webui=False in the context configuration.\n", + "INFO:global:Optimization via LLVM passes took 0.012285 ms\n", + "INFO:global:starting code compilation\n", + "INFO:global:first compile done\n", + "INFO:global:functor Stage_0 retrieved from llvm\n", + "INFO:global:retrieving init/release stage functors\n", + "INFO:global:Compiled code paths for stage 0 in 0.01 ms\n", + "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.0185374s\n", + "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000269s (5 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000376s (0 normal rows, 0 exceptions)\n", + "INFO:driver:[Task Finished] Transform to mem in 0.000557s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000182s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000126s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000113s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000264s (0 normal rows, 0 exceptions)\n", + "INFO:driver:[Task Finished] Transform to mem in 0.000148s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000149s (0 normal rows, 0 
exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000127s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000226s (0 normal rows, 0 exceptions)\n", + "INFO:driver:[Task Finished] Transform to mem in 0.000143s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000192s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000166s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000160s (0 normal rows, 0 exceptions)\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 load&transform tasks in 0.0246091s\n", + "INFO:global:[Transform Stage] Stage 0 total wall clock time: 0.00319552s, 19 input rows, time to process 1 row via fast path: 0.168185ms\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 1.8249e-05s\n", + "INFO:global:[Transform Stage] Stage 0 took 0.0432127s\n", + "INFO:global:Query Execution took 0.0675056s. (planning: 0.0238491s, execution: 0.0436565s)\n", + "INFO:global:Collecting result of 5 rows took 0.000089 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+\n", + "| Unique Key |\n", + "+------------+\n", + "| 19937896 |\n", + "+------------+\n", + "| 19937901 |\n", + "+------------+\n", + "| 19937902 |\n", + "+------------+\n", + "| 19937903 |\n", + "+------------+\n", + "| 19937904 |\n", + "+------------+\n" + ] + } + ], + "source": [ + "ds.selectColumns(['Unique Key']).show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uPqLcSjJt80V" + }, + "source": [ + "As for every operation, we can retrieve help using Python's built-in documentation feature." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kgnGE2OYntU4", + "outputId": "93b19289-51c2-49ca-9d50-659af8e2c667" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on method selectColumns in module tuplex.dataset:\n", + "\n", + "selectColumns(columns) method of tuplex.dataset.DataSet instance\n", + " selects a subset of columns as defined through columns which is a list or a single column\n", + " \n", + " Args:\n", + " columns: list of strings or integers. A string should reference a column name, whereas as an integer refers to an index. Indices may be negative according to python rules. Order in list determines output order\n", + " \n", + " Returns:\n", + " tuplex.dataset.DataSet: A Tuplex Dataset object that allows further ETL operations\n", + "\n" + ] + } + ], + "source": [ + "help(ds.selectColumns)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "foF-WYEnuKbm" + }, + "source": [ + "I.e., when looking up the semantics of the `selectColumns` operation, it's also possible to use integers instead of strings to select columns for more flexibility." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "G07YcJeGuU7K", + "outputId": "ce51b323-45c4-4664-d142-2658266270b2" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:logical planner:logical optimization took 0.003723ms\n", + "INFO:codegen:generating pipeline for (Option[i64],Option[str]) -> (Option[i64],Option[str]) (1 operator pipelined)\n", + "INFO:codegen:generating lambda function for ((Option[i64],Option[str])) -> (Option[i64],Option[str])\n", + "WARNING:history server:Could not register job, is history server running? 
To disable this warning, set webui=False in the context configuration.\n", + "INFO:global:Optimization via LLVM passes took 0.012829 ms\n", + "INFO:global:starting code compilation\n", + "INFO:global:first compile done\n", + "INFO:global:functor Stage_0 retrieved from llvm\n", + "INFO:global:retrieving init/release stage functors\n", + "INFO:global:Compiled code paths for stage 0 in 0.00 ms\n", + "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.0181983s\n", + "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000188s (3 normal rows, 0 exceptions)\n", + "INFO:driver:[Task Finished] Transform to mem in 0.000281s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000489s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000271s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000112s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000106s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000103s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000105s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000107s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000105s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000103s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000104s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000106s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000104s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000104s (0 normal rows, 0 exceptions)\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 
load&transform tasks in 0.0231235s\n", + "INFO:global:[Transform Stage] Stage 0 total wall clock time: 0.00238825s, 17 input rows, time to process 1 row via fast path: 0.140486ms\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 1.3223e-05s\n", + "INFO:global:[Transform Stage] Stage 0 took 0.0413734s\n", + "INFO:global:Query Execution took 0.0624538s. (planning: 0.0207632s, execution: 0.0416907s)\n", + "INFO:global:Collecting result of 3 rows took 0.000076 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+--------------------------+\n", + "| Unique Key | Created Date |\n", + "+------------+--------------------------+\n", + "| 19937896 | '03/01/2011 02:27:57 PM' |\n", + "+------------+--------------------------+\n", + "| 19937901 | '03/01/2011 10:41:13 AM' |\n", + "+------------+--------------------------+\n", + "| 19937902 | '03/01/2011 09:07:45 AM' |\n", + "+------------+--------------------------+\n" + ] + } + ], + "source": [ + "ds.selectColumns([0, 1]).show(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "O0K0POANzMGJ" + }, + "source": [ + "Let's say we want to use a slightly more complicated pipeline now. As an initial step, let's first investigate what kind of complaint types there are. To find the corresponding column, we can use the meta-data associated with a dataset and then design a first, exploratory query." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_LXRLILHzQd5", + "outputId": "445160d3-6514-4c51-a9da-82aac86997f1" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unique Key | Created Date | Closed Date | Agency | Agency Name\n", + "Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address\n", + "Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2\n", + "Address Type | City | Landmark | Facility Type | Status\n", + "Due Date | Resolution Description | Resolution Action Updated Date | Community Board | BBL\n", + "Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Open Data Channel Type | Park Facility Name\n", + "Park Borough | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name\n", + "Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Latitude | Longitude\n", + "Location\n" + ] + } + ], + "source": [ + "def print_table(arr, break_after=5):\n", + " for i in range(len(arr) // break_after +1):\n", + " print(' | '.join(arr[i * break_after:(i +1)* break_after]))\n", + "\n", + "print_table(ds.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aW2AjU9r0Jqt", + "outputId": "e049e45c-0cd8-467f-b6ef-16938e53bfda" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:logical planner:logical optimization took 0.003931ms\n", + "INFO:codegen:generating pipeline for (Option[str]) -> (Option[str]) (2 operators pipelined)\n", + "INFO:codegen:generating lambda function for ((Option[str])) -> Option[str]\n", + "WARNING:history server:Could not register job, is history server running? 
To disable this warning, set webui=False in the context configuration.\n", + "INFO:global:provided option to explicitly merge bad rows in order back, however rows will be hashed. Disabling option. To silence this warning, set\n", + "tuplex.optimizer.mergeExceptionsInOrder=false\n", + "INFO:global:Optimization via LLVM passes took 0.010179 ms\n", + "INFO:global:starting code compilation\n", + "INFO:global:first compile done\n", + "INFO:global:functor Stage_0 retrieved from llvm\n", + "INFO:global:retrieving init/release stage functors\n", + "INFO:global:Compiled code paths for stage 0 in 0.00 ms\n", + "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.0143798s\n", + "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.227395s (0 normal rows, 0 exceptions, 129 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.221090s (0 normal rows, 0 exceptions, 137 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.216516s (0 normal rows, 0 exceptions, 128 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.226671s (0 normal rows, 0 exceptions, 136 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.209629s (0 normal rows, 0 exceptions, 200 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.224845s (0 normal rows, 0 exceptions, 145 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.217637s (0 normal rows, 0 exceptions, 142 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.219661s (0 normal rows, 0 exceptions, 142 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.226369s (0 normal rows, 0 exceptions, 143 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.216218s (0 normal rows, 0 exceptions, 138 
buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.213966s (0 normal rows, 0 exceptions, 140 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.217093s (0 normal rows, 0 exceptions, 135 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.245751s (0 normal rows, 0 exceptions, 123 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.267397s (0 normal rows, 0 exceptions, 117 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.362738s (0 normal rows, 0 exceptions, 126 buckets)\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 load&transform tasks in 1.93791s\n", + "INFO:global:[Transform Stage] Stage 0 total wall clock time: 3.51298s, 1907219 input rows, time to process 1 row via fast path: 0.00184194ms\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 0.00117526s\n", + "INFO:global:[Transform Stage] Stage 0 took 1.95351s\n", + "INFO:global:[Transform Stage] skipped stage 1 because there is nothing todo here.\n", + "INFO:global:Query Execution took 1.99631s. 
(planning: 0.0233447s, execution: 1.97296s)\n", + "INFO:python:Data transfer back to Python took 0.000431 seconds\n" + ] + } + ], + "source": [ + "complaint_types = ds.selectColumns(['Complaint Type']).unique().collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JaN8v0Zu0hpT", + "outputId": "276d3df0-9b65-486b-d7ab-6f1246064cb1" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Mosquitoes', 'DOF Parking - Payment Issue', 'DOF Property - Update Account', 'Street Condition', 'Trans Fat', 'Plumbing', 'Benefit Card Replacement', 'DOF Parking - Address Update', 'Non-Emergency Police Matter', 'Harboring Bees/Wasps', 'Home Delivered Meal - Missed Delivery', 'HPD Literature Request', 'Health', 'Beach/Pool/Sauna Complaint', 'Unsanitary Animal Facility', 'Ferry Complaint', 'Illegal Parking', 'Drug Activity', 'DRIE', 'Dead/Dying Tree', 'Overflowing Litter Baskets', 'Unleashed Dog', 'BEST/Site Safety', 'Vending', 'Sidewalk Condition', 'Highway Sign - Damaged', 'Bridge Condition', 'Public Payphone Complaint', 'Overgrown Tree/Branches', 'Broken Parking Meter', 'Animal-Abuse', 'Taxi Complaint', 'Green Taxi Complaint', 'NONCONST', 'Abandoned Vehicle', 'Noise - Commercial', 'Noise - Helicopter', 'New Tree Request', 'Noise', 'Illegal Fireworks', 'X-Ray Machine/Equipment', 'Discipline and Suspension', 'Animal in a Park', 'Transportation Provider Complaint', 'Tattooing', 'Hazardous Materials', 'Homeless Street Condition', 'WATER LEAK', 'HEAP Assistance', 'Lifeguard', 'Bus Stop Shelter Placement', 'Tanning', 'Bottled Water', 'GENERAL', \"Alzheimer's Care\", 'DHS Advantage - Third Party', 'Construction Lead Dust', 'Highway Sign - Missing', 'Sweeping/Missed-Inadequate', 'DPR Internal', 'Curb Condition', 'Noise - House of Worship', 'Window Guard', 'Sanitation Condition', 'PAINT/PLASTER', 'Radioactive Material', 'Summer Camp', 'For Hire Vehicle 
Report', 'Panhandling', 'Legal Services Provider Complaint', 'Unsanitary Animal Pvt Property', 'Bike Rack Condition', 'Weatherization', 'Home Delivered Meal Complaint', 'Emergency Response Team (ERT)', 'Illegal Animal Kept as Pet', 'DOF Property - Request Copy', 'Animal Facility - No Permit', 'Utility Program', 'Squeegee', 'DEP Street Condition', 'Home Care Provider Complaint', 'Investigations and Discipline (IAD)', 'SCRIE', 'Electrical', 'Urinating in Public', 'PAINT - PLASTER', 'Homeless Person Assistance', 'Water Conservation', 'Noise - Residential', 'Street Sign - Missing', 'City Vehicle Placard Complaint', 'OUTSIDE BUILDING', 'FLOORING/STAIRS', 'Portable Toilet', 'Housing Options', 'Building Marshals office', 'DOF Parking - Request Copy', 'Posting Advertisement', 'Parent Leadership', 'Sewer', 'Construction', 'DOOR/WINDOW', 'Parking Card', 'Recycling Enforcement', 'LinkNYC', 'Mobile Food Vendor', 'SAFETY', 'Taxpayer Advocate Inquiry', 'Special Projects Inspection Team (SPIT)', 'DHS Income Savings Requirement', 'Food Establishment', 'Drinking', 'ELECTRIC', 'Bereavement Support Group', 'DOF Parking - Tax Exemption', 'Derelict Vehicles', 'Water System', 'DHS Advantage -Landlord/Broker', 'Pet Shop', 'For Hire Vehicle Complaint', 'Boilers', 'Building Condition', 'Traffic Signal Condition', 'Street Sign - Dangling', 'Other Enforcement', 'Highway Condition', 'UNSANITARY CONDITION', 'Ferry Inquiry', 'HEAT/HOT WATER', 'Graffiti', 'Blocked Driveway', 'Bus Stop Shelter Complaint', 'DOF Parking - Request Status', 'School Maintenance', 'ATF', 'Case Management Agency Complaint', 'Home Repair', 'Rodent', 'Lost Property', 'OEM Literature Request', 'Industrial Waste', 'Public Toilet', 'Tunnel Condition', 'ELEVATOR', 'Violation of Park Rules', 'Taxi Compliment', 'Indoor Air Quality', 'Damaged Tree', 'Noise Survey', 'Cranes and Derricks', 'Derelict Vehicle', 'Safety', 'Elevator', 'DOF Property - Property Value', 'DOF Property - Reduction Issue', 'Illegal Animal Sold', 'Highway 
Sign - Dangling', 'Disorderly Youth', 'PLUMBING', 'Calorie Labeling', 'Housing - Low Income Senior', 'Illegal Tree Damage', 'Found Property', 'Municipal Parking Facility', 'Missed Collection (All Materials)', 'DOF Parking - DMV Clearance', 'Elder Abuse', 'Food Poisoning', 'Broken Muni Meter', 'Mold', 'APPLIANCE', 'NORC Complaint', 'Registration and Transfers', 'General Construction/Plumbing', 'Special Natural Area District (SNAD)', 'Construction Safety Enforcement', 'Indoor Sewage', 'Building/Use', 'DHS Advantage - Tenant', 'Bike/Roller/Skate Chronic', 'Noise - Street/Sidewalk', 'Teaching/Learning/Instruction', 'Borough Office', 'Drinking Water', 'Noise - Park', 'DCA / DOH New License Application Request', 'Non-Residential Heat', 'Day Care', 'Unsanitary Pigeon Condition', 'Air Quality', 'Noise - Vehicle', 'Dead Tree', 'Dirty Conditions', 'Consumer Complaint', 'Homeless Encampment', 'DOF Property - Payment Issue', 'Executive Inspections', 'Water Quality', 'Smoking', 'Sustainability Enforcement', 'DOF Property - City Rebate', 'Lead', 'Asbestos', 'Animal Abuse', 'Street Light Condition', 'Root/Sewer/Sidewalk Condition', 'Traffic', 'Street Sign - Damaged', 'DOF Property - Owner Issue', 'Advocate-Personal Exemptions', 'Senior Center Complaint', 'Vacant Lot', 'DOF Property - RPIE Issue', 'Taxi Report', 'Standing Water', 'Derelict Bicycle', 'Poison Ivy', 'Scaffold Safety', 'Maintenance or Facility', 'Ferry Permit', 'Snow']\n" + ] + } + ], + "source": [ + "print(complaint_types)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rV8aG_rV0lAE" + }, + "source": [ + "Looking at the data, we see that there are some complaints regarding mosquitoes. Likely, because it gets quite hot and humid in summer in New York City! Can the data back this up?\n", + "\n", + "To find out, let's plot the number of mosquito complaints per month for the last year. 
A helpful function for aggregating the results is `aggregateByKey`:" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "l-5hyAme0xga", + "outputId": "1b916d6d-72b5-4962-9408-22aa343df1e5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on method aggregateByKey in module tuplex.dataset:\n", + "\n", + "aggregateByKey(combine, aggregate, initial_value, key_columns) method of tuplex.dataset.DataSet instance\n", + " An experimental aggregateByKey function similar to aggregate. There are several scenarios that do not work with this function yet and its performance hasn't been properly\n", + " optimized either. Data is grouped by the supplied key_columns. Then, for each group a new aggregate is initialized using the initial_value, which can be thought of as a neutral value.\n", + " The aggregate function is then called for each element and the current aggregate structure. It is guaranteed that the combine function is called at least once per group by applying the initial_value to the aggregate.\n", + " Args:\n", + " combine: a UDF to combine two aggregates (results of the aggregate function or the initial_value). E.g., cobmine = lambda agg1, agg2: agg1 + agg2. The initial value should be the neutral element.\n", + " aggregate: a UDF which produces a result by combining a value with the aggregate initialized by initial_value. E.g., aggreagte = lambda agg, value: agg + value sums up values.\n", + " initial_value: a neutral initial value.\n", + " key_columns: the columns to group the aggregate by, a sequence of a mix of strings or integers. 
If specified as a single string or number, aggregation is over a single column.\n", + " Returns:\n", + " Dataset\n", + "\n" + ] + } + ], + "source": [ + "help(ds.aggregateByKey)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aYeMEv-81nQT" + }, + "source": [ + "Next, let's use a UDF to extract the month and year of the complaint and limit the search to specific complaint types so Tuplex automatically processes fewer rows." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gvPtyBDE1m1_", + "outputId": "bb111e74-4122-44de-cc38-681504fded6e" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:logical planner:logical optimization took 0.004637ms\n", + "INFO:codegen:generating pipeline for (Option[str]) -> (Option[str]) (1 operator pipelined)\n", + "INFO:codegen:generating lambda function for ((Option[str])) -> Option[str]\n", + "WARNING:history server:Could not register job, is history server running?
To disable this warning, set webui=False in the context configuration.\n", + "INFO:global:Optimization via LLVM passes took 0.010873 ms\n", + "INFO:global:starting code compilation\n", + "INFO:global:first compile done\n", + "INFO:global:functor Stage_0 retrieved from llvm\n", + "INFO:global:retrieving init/release stage functors\n", + "INFO:global:Compiled code paths for stage 0 in 0.00 ms\n", + "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.0160895s\n", + "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000198s (5 normal rows, 0 exceptions)\n", + "INFO:driver:[Task Finished] Transform to mem in 0.000220s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000408s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000380s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000185s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000112s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000107s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000107s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000149s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000450s (0 normal rows, 0 exceptions)\n", + "INFO:driver:[Task Finished] Transform to mem in 0.000231s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000239s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000111s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000187s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.000107s (0 normal rows, 0 exceptions)\n", + "INFO:global:[Transform Stage] Stage 0 completed 
15 load&transform tasks in 0.0214662s\n", + "INFO:global:[Transform Stage] Stage 0 total wall clock time: 0.0031914s, 19 input rows, time to process 1 row via fast path: 0.167968ms\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 1.9245e-05s\n", + "INFO:global:[Transform Stage] Stage 0 took 0.0376218s\n", + "INFO:global:Query Execution took 0.0492011s. (planning: 0.0112488s, execution: 0.0379523s)\n", + "INFO:global:Collecting result of 5 rows took 0.000101 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------------+\n", + "| Created Date |\n", + "+--------------------------+\n", + "| '03/01/2011 02:27:57 PM' |\n", + "+--------------------------+\n", + "| '03/01/2011 10:41:13 AM' |\n", + "+--------------------------+\n", + "| '03/01/2011 09:07:45 AM' |\n", + "+--------------------------+\n", + "| '03/01/2011 05:39:26 PM' |\n", + "+--------------------------+\n", + "| '03/01/2011 11:08:14 AM' |\n", + "+--------------------------+\n" + ] + } + ], + "source": [ + "ds.selectColumns(['Created Date']).show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "A3sguZZ30xiq", + "outputId": "0774a0ae-478b-4aba-9fc9-e3e6d39da590" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:logical planner:logical optimization took 0.015383ms\n", + "INFO:codegen:generating pipeline for (Option[str],Option[str]) -> (i64,i64,Option[str]) (5 operators pipelined)\n", + "INFO:codegen:generating lambda function for ((Option[str],Option[str])) -> boolean\n", + "INFO:codegen:generating function extract_month for ((Option[str],Option[str])) -> i64\n", + "INFO:codegen:generating function extract_year for ((Option[str],Option[str],i64)) -> i64\n", + "INFO:codegen:generating lambda function for ((Option[str],Option[str],i64,i64)) -> boolean\n", + "INFO:codegen:generating lambda function for 
((Option[str],Option[str],i64,i64)) -> (i64,i64,Option[str])\n", + "WARNING:history server:Could not register job, is history server running? To disable this warning, set webui=False in the context configuration.\n", + "INFO:global:Optimization via LLVM passes took 0.033716 ms\n", + "INFO:global:starting code compilation\n", + "INFO:global:first compile done\n", + "INFO:global:functor Stage_0 retrieved from llvm\n", + "INFO:global:retrieving init/release stage functors\n", + "INFO:global:Compiled code paths for stage 0 in 0.01 ms\n", + "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.0496439s\n", + "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.219055s (0 normal rows, 0 exceptions)\n", + "INFO:driver:[Task Finished] Transform to mem in 0.221205s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.217267s (0 normal rows, 0 exceptions)\n", + "INFO:driver:[Task Finished] Transform to mem in 0.231919s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.206888s (4 normal rows, 0 exceptions)\n", + "INFO:driver:[Task Finished] Transform to mem in 0.211544s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.213482s (0 normal rows, 0 exceptions)\n", + "INFO:driver:[Task Finished] Transform to mem in 0.210397s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.232479s (0 normal rows, 0 exceptions)\n", + "INFO:driver:[Task Finished] Transform to mem in 0.236717s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.207968s (0 normal rows, 0 exceptions)\n", + "INFO:driver:[Task Finished] Transform to mem in 0.214165s (0 normal rows, 0 exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.221885s (0 normal rows, 0 exceptions)\n", + "INFO:driver:[Task Finished] Transform to mem in 0.211296s (0 normal rows, 0 
exceptions)\n", + "INFO:E/1:[Task Finished] Transform to mem in 0.359222s (0 normal rows, 0 exceptions)\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 load&transform tasks in 1.87937s\n", + "INFO:global:[Transform Stage] Stage 0 total wall clock time: 3.41549s, 1907219 input rows, time to process 1 row via fast path: 0.00179082ms\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 2.5592e-05s\n", + "INFO:global:[Transform Stage] Stage 0 took 1.92908s\n", + "INFO:global:Query Execution took 1.965s. (planning: 0.0355522s, execution: 1.92945s)\n", + "INFO:global:Collecting result of 4 rows took 0.000079 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+------+----------------+\n", + "| Month | Year | Complaint Type |\n", + "+-------+------+----------------+\n", + "| 12 | 2019 | 'Mosquitoes' |\n", + "+-------+------+----------------+\n", + "| 12 | 2019 | 'Mosquitoes' |\n", + "+-------+------+----------------+\n", + "| 12 | 2019 | 'Mosquitoes' |\n", + "+-------+------+----------------+\n", + "| 7 | 2019 | 'Mosquitoes' |\n", + "+-------+------+----------------+\n" + ] + } + ], + "source": [ + "year_to_investigate = 2019\n", + "\n", + "def extract_month(row):\n", + " date = row['Created Date']\n", + " date = date[:date.find(' ')]\n", + " return int(date.split('/')[0])\n", + "\n", + "def extract_year(row):\n", + " date = row['Created Date']\n", + " date = date[:date.find(' ')]\n", + " return int(date.split('/')[-1])\n", + "\n", + "ds2 = ds.withColumn('Month', extract_month) \\\n", + " .withColumn('Year', extract_year) \\\n", + " .filter(lambda row: 'Mosquito' in row['Complaint Type']) \\\n", + " .filter(lambda row: row['Year'] == year_to_investigate) \\\n", + " .selectColumns(['Month', 'Year', 'Complaint Type'])\n", + "\n", + "\n", + "ds2.show(5)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wENeFNWEejM8" + }, + "source": [ + "We can now use the aggregateByKey function to 
count the number of mosquito complaints per month in 2019." + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "joNXIjhSeh0i", + "outputId": "415a3fb4-0490-4594-a0f8-f33da61213aa" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:logical planner:logical optimization took 0.018371ms\n", + "INFO:codegen:generating pipeline for (Option[str],Option[str]) -> (i64,i64) (6 operators pipelined)\n", + "INFO:codegen:generating lambda function for ((Option[str],Option[str])) -> boolean\n", + "INFO:codegen:generating function extract_month for ((Option[str],Option[str])) -> i64\n", + "INFO:codegen:generating function extract_year for ((Option[str],Option[str],i64)) -> i64\n", + "INFO:codegen:generating lambda function for ((Option[str],Option[str],i64,i64)) -> boolean\n", + "INFO:codegen:generating lambda function for ((Option[str],Option[str],i64,i64)) -> (i64,i64,Option[str])\n", + "INFO:codegen:generating function combine_udf for (i64,i64) -> i64\n", + "INFO:codegen:generating function aggregate_udf for (i64,(i64,i64,Option[str])) -> i64\n", + "WARNING:history server:Could not register job, is history server running? To disable this warning, set webui=False in the context configuration.\n", + "INFO:global:provided option to explicitly merge bad rows in order back, however rows will be hashed. Disabling option. 
To silence this warning, set\n", + "tuplex.optimizer.mergeExceptionsInOrder=false\n", + "INFO:global:Optimization via LLVM passes took 0.037240 ms\n", + "INFO:global:starting code compilation\n", + "INFO:global:first compile done\n", + "INFO:global:functor Stage_0 retrieved from llvm\n", + "INFO:global:retrieving init/release stage functors\n", + "INFO:global:Compiled code paths for stage 0 in 0.02 ms\n", + "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.0570446s\n", + "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.227490s (0 normal rows, 0 exceptions, 0 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.220710s (0 normal rows, 0 exceptions, 0 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.216524s (0 normal rows, 0 exceptions, 0 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.214208s (0 normal rows, 0 exceptions, 0 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.211271s (0 normal rows, 0 exceptions, 2 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.233693s (0 normal rows, 0 exceptions, 0 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.207874s (0 normal rows, 0 exceptions, 0 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.211661s (0 normal rows, 0 exceptions, 0 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.213264s (0 normal rows, 0 exceptions, 0 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.234989s (0 normal rows, 0 exceptions, 0 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.206323s (0 normal rows, 0 exceptions, 0 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.227720s (0 normal 
rows, 0 exceptions, 0 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.235625s (0 normal rows, 0 exceptions, 0 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.220421s (0 normal rows, 0 exceptions, 0 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.343307s (0 normal rows, 0 exceptions, 0 buckets)\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 load&transform tasks in 1.86291s\n", + "INFO:global:[Transform Stage] Stage 0 total wall clock time: 3.42508s, 1907219 input rows, time to process 1 row via fast path: 0.00179585ms\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 0.00101284s\n", + "INFO:global:[Transform Stage] Stage 0 took 1.921s\n", + "INFO:global:[Transform Stage] skipped stage 1 because there is nothing todo here.\n", + "INFO:global:Query Execution took 1.96409s. (planning: 0.0426338s, execution: 1.92146s)\n", + "INFO:global:Collecting result of 2 rows took 0.000038 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+---+\n", + "| Month | |\n", + "+-------+---+\n", + "| 12 | 3 |\n", + "+-------+---+\n", + "| 7 | 1 |\n", + "+-------+---+\n" + ] + } + ], + "source": [ + "def combine_udf(a, b):\n", + " return a + b\n", + "\n", + "def aggregate_udf(agg, row):\n", + " return agg + 1\n", + "\n", + "ds2.aggregateByKey(combine_udf, aggregate_udf, 0, [\"Month\"]).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kY_4W26xhmYZ" + }, + "source": [ + "Yet, it seems that mosquito complaints are actually not that common. In total there are 4 complaints for the whole year, of which 3 are in December. 
Thus, with such little support, we can't draw any meaningful conclusions about mosquitoes in NYC from the 311 dataset.\n", + "\n", + "Let's step back and check which kind of complaint is actually the most common:" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RSV4XLHgh_gC", + "outputId": "62a20ea1-bdab-4758-9a55-1251f8cff21e" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:logical planner:logical optimization took 0.002201ms\n", + "INFO:codegen:generating pipeline for (Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[i64],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[f64],Option[f64],Option[str]) -> (Option[str],i64) (1 operator pipelined)\n", + "INFO:codegen:generating function combine_udf for (i64,i64) -> i64\n", + "INFO:codegen:generating function aggregate_udf for (i64,(Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[i64],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[f64],Option[f64],Option[str])) -> i64\n", + "WARNING:history server:Could not register job, is history server running?
To disable this warning, set webui=False in the context configuration.\n", + "INFO:global:provided option to explicitly merge bad rows in order back, however rows will be hashed. Disabling option. To silence this warning, set\n", + "tuplex.optimizer.mergeExceptionsInOrder=false\n", + "INFO:global:Optimization via LLVM passes took 0.091600 ms\n", + "INFO:global:starting code compilation\n", + "INFO:global:first compile done\n", + "INFO:global:functor Stage_0 retrieved from llvm\n", + "INFO:global:retrieving init/release stage functors\n", + "INFO:global:Compiled code paths for stage 0 in 0.03 ms\n", + "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.127125s\n", + "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.284140s (0 normal rows, 61 exceptions, 129 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.276317s (0 normal rows, 31 exceptions, 137 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.272109s (0 normal rows, 32 exceptions, 136 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.272250s (0 normal rows, 62 exceptions, 128 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.266658s (0 normal rows, 33 exceptions, 200 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.271441s (0 normal rows, 46 exceptions, 145 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.271498s (0 normal rows, 60 exceptions, 142 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.273922s (0 normal rows, 48 exceptions, 142 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.282784s (0 normal rows, 41 exceptions, 143 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.282431s (0 normal rows, 1 exception, 138 
buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.282806s (0 normal rows, 5 exceptions, 140 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.275421s (0 normal rows, 15 exceptions, 135 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.274954s (0 normal rows, 4 exceptions, 123 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.279680s (0 normal rows, 37 exceptions, 117 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.433613s (0 normal rows, 34 exceptions, 126 buckets)\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 load&transform tasks in 2.3697s\n", + "INFO:global:[Transform Stage] Stage 0 total wall clock time: 4.30002s, 1907219 input rows, time to process 1 row via fast path: 0.0022546ms\n", + "INFO:global:Exception details: \n", + "+------------+-------------------------------------+-------+\n", + "| OperatorID | Exception | Count |\n", + "+------------+-------------------------------------+-------+\n", + "| 100135 | tuplex.internal.BadParseStringInput | 510 |\n", + "+------------+-------------------------------------+-------+\n", + "INFO:local ee:created combined normal-case result in 0.001234s\n", + "INFO:local ee:compiled pure python pipeline in 0.001353s\n", + "INFO:local ee:creating hybrid intermediates took 0.000003s\n", + "INFO:local ee:Created 15 resolve tasks in 0.000494s\n", + "INFO:local ee:15/15 tasks require executing the slow path.\n", + "INFO:E/1:[Task Finished] Resolve in 0.032112s\n", + "INFO:E/1:[Task Finished] Resolve in 0.012562s\n", + "INFO:E/1:[Task Finished] Resolve in 0.033372s\n", + "INFO:driver:[Task Finished] Resolve in 0.072717s\n", + "INFO:E/1:[Task Finished] Resolve in 0.031012s\n", + "INFO:driver:[Task Finished] Resolve in 0.032600s\n", + "INFO:driver:[Task Finished] Resolve in 0.044743s\n", + "INFO:E/1:[Task Finished] Resolve in 0.065935s\n", + "INFO:E/1:[Task 
Finished] Resolve in 0.009902s\n", + "INFO:driver:[Task Finished] Resolve in 0.030727s\n", + "INFO:E/1:[Task Finished] Resolve in 0.013373s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Housing - Low Income Senior\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Housing - Low Income Senior\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Sidewalk Condition\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Broken Parking Meter\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Broken Muni Meter\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=SCRIE\n", + "setdefault w. key=Housing Options\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Housing Options\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Food Establishment\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Sidewalk Condition\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Housing Options\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Curb Condition\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Street Condition\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=SCRIE\n", + "setdefault w. key=SCRIE\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Housing - Low Income Senior\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=SCRIE\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Housing - Low Income Senior\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=SCRIE\n", + "setdefault w. 
key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Housing Options\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Damaged Tree\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Sidewalk Condition\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Housing Options\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Broken Muni Meter\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Alzheimer's Care\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Sidewalk Condition\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Alzheimer's Care\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=SCRIE\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Housing - Low Income Senior\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:driver:[Task Finished] Resolve in 0.011660s\n", + "INFO:E/1:[Task Finished] Resolve in 0.015524s\n", + "INFO:driver:[Task Finished] Resolve in 0.025153s\n", + "INFO:E/1:[Task Finished] Resolve in 0.030859s\n", + "INFO:local ee:slow path resolved 510/510 exceptions in 0.248894s\n", + "INFO:local ee:slow path for Stage 0: total wall clock time: 0.462249s, time to process 1 row via slow path: 0.906371ms\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 resolve tasks in 0.248915s\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 0.00174519s\n", + "INFO:global:[Transform Stage] Stage 0 took 2.74754s\n", + "INFO:global:[Transform Stage] skipped stage 1 because there is nothing todo here.\n", + "INFO:global:Query Execution took 2.7727s. (planning: 0.0225033s, execution: 2.75019s)\n", + "INFO:global:Collecting result of 222 rows took 0.001815 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. 
key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Housing Options\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Housing - Low Income Senior\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Food Establishment\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=SCRIE\n", + "setdefault w. key=SCRIE\n", + "setdefault w. 
key=Consumer Complaint\n", + "+---------------------------------------------+--------+\n", + "| Complaint Type | |\n", + "+---------------------------------------------+--------+\n", + "| 'Mosquitoes' | 4 |\n", + "+---------------------------------------------+--------+\n", + "| 'DOF Parking - Payment Issue' | 19487 |\n", + "+---------------------------------------------+--------+\n", + "| 'DOF Property - Update Account' | 65 |\n", + "+---------------------------------------------+--------+\n", + "| 'Street Condition' | 95585 |\n", + "+---------------------------------------------+--------+\n", + "| 'Trans Fat' | 6 |\n", + "+---------------------------------------------+--------+\n", + "| 'Plumbing' | 85 |\n", + "+---------------------------------------------+--------+\n", + "| 'Benefit Card Replacement' | 41 |\n", + "+---------------------------------------------+--------+\n", + "| 'DOF Parking - Address Update' | 1 |\n", + "+---------------------------------------------+--------+\n", + "| 'Non-Emergency Police Matter' | 13897 |\n", + "+---------------------------------------------+--------+\n", + "| 'Harboring Bees/Wasps' | 502 |\n", + "+---------------------------------------------+--------+\n", + "| 'Home Delivered Meal - Missed Delivery' | 265 |\n", + "+---------------------------------------------+--------+\n", + "| 'HPD Literature Request' | 3844 |\n", + "+---------------------------------------------+--------+\n", + "| 'Health' | 2 |\n", + "+---------------------------------------------+--------+\n", + "| 'Beach/Pool/Sauna Complaint' | 517 |\n", + "+---------------------------------------------+--------+\n", + "| 'Unsanitary Animal Facility' | 154 |\n", + "+---------------------------------------------+--------+\n", + "| 'Ferry Complaint' | 391 |\n", + "+---------------------------------------------+--------+\n", + "| 'Illegal Parking' | 117805 |\n", + "+---------------------------------------------+--------+\n", + "| 'Drug Activity' | 70 |\n", + 
"+---------------------------------------------+--------+\n", + "| 'DRIE' | 4 |\n", + "+---------------------------------------------+--------+\n", + "| 'Dead/Dying Tree' | 132 |\n", + "+---------------------------------------------+--------+\n", + "| 'Overflowing Litter Baskets' | 7 |\n", + "+---------------------------------------------+--------+\n", + "| 'Unleashed Dog' | 1969 |\n", + "+---------------------------------------------+--------+\n", + "| 'BEST/Site Safety' | 8 |\n", + "+---------------------------------------------+--------+\n", + "| 'Vending' | 11235 |\n", + "+---------------------------------------------+--------+\n", + "| 'Sidewalk Condition' | 56500 |\n", + "+---------------------------------------------+--------+\n", + "| 'Highway Sign - Damaged' | 187 |\n", + "+---------------------------------------------+--------+\n", + "| 'Bridge Condition' | 1415 |\n", + "+---------------------------------------------+--------+\n", + "| 'Public Payphone Complaint' | 2527 |\n", + "+---------------------------------------------+--------+\n", + "| 'Overgrown Tree/Branches' | 36842 |\n", + "+---------------------------------------------+--------+\n", + "| 'Broken Parking Meter' | 7377 |\n", + "+---------------------------------------------+--------+\n", + "| 'Animal-Abuse' | 184 |\n", + "+---------------------------------------------+--------+\n", + "| 'Taxi Complaint' | 45876 |\n", + "+---------------------------------------------+--------+\n", + "| 'Green Taxi Complaint' | 9 |\n", + "+---------------------------------------------+--------+\n", + "| 'NONCONST' | 1 |\n", + "+---------------------------------------------+--------+\n", + "| 'Abandoned Vehicle' | 884 |\n", + "+---------------------------------------------+--------+\n", + "| 'Noise - Commercial' | 71607 |\n", + "+---------------------------------------------+--------+\n", + "| 'Noise - Helicopter' | 2049 |\n", + "+---------------------------------------------+--------+\n", + "| 'New Tree Request' 
| 42969 |\n", + "+---------------------------------------------+--------+\n", + "| 'Noise' | 1114 |\n", + "+---------------------------------------------+--------+\n", + "| 'Illegal Fireworks' | 770 |\n", + "+---------------------------------------------+--------+\n", + "| 'X-Ray Machine/Equipment' | 43 |\n", + "+---------------------------------------------+--------+\n", + "| 'Discipline and Suspension' | 4 |\n", + "+---------------------------------------------+--------+\n", + "| 'Animal in a Park' | 3683 |\n", + "+---------------------------------------------+--------+\n", + "| 'Transportation Provider Complaint' | 49 |\n", + "+---------------------------------------------+--------+\n", + "| 'Tattooing' | 263 |\n", + "+---------------------------------------------+--------+\n", + "| 'Hazardous Materials' | 59 |\n", + "+---------------------------------------------+--------+\n", + "| 'Homeless Street Condition' | 36 |\n", + "+---------------------------------------------+--------+\n", + "| 'WATER LEAK' | 265 |\n", + "+---------------------------------------------+--------+\n", + "| 'HEAP Assistance' | 545 |\n", + "+---------------------------------------------+--------+\n", + "| 'Lifeguard' | 31 |\n", + "+---------------------------------------------+--------+\n", + "| 'Bus Stop Shelter Placement' | 749 |\n", + "+---------------------------------------------+--------+\n", + "| 'Tanning' | 2 |\n", + "+---------------------------------------------+--------+\n", + "| 'Bottled Water' | 37 |\n", + "+---------------------------------------------+--------+\n", + "| 'GENERAL' | 255 |\n", + "+---------------------------------------------+--------+\n", + "| 'Alzheimer's Care' | 727 |\n", + "+---------------------------------------------+--------+\n", + "| 'DHS Advantage - Third Party' | 315 |\n", + "+---------------------------------------------+--------+\n", + "| 'Construction Lead Dust' | 19 |\n", + "+---------------------------------------------+--------+\n", + "| 
'Highway Sign - Missing' | 122 |\n", + "+---------------------------------------------+--------+\n", + "| 'Sweeping/Missed-Inadequate' | 5 |\n", + "+---------------------------------------------+--------+\n", + "| 'DPR Internal' | 702 |\n", + "+---------------------------------------------+--------+\n", + "| 'Curb Condition' | 7988 |\n", + "+---------------------------------------------+--------+\n", + "| 'Noise - House of Worship' | 3140 |\n", + "+---------------------------------------------+--------+\n", + "| 'Window Guard' | 410 |\n", + "+---------------------------------------------+--------+\n", + "| 'Sanitation Condition' | 189 |\n", + "+---------------------------------------------+--------+\n", + "| 'PAINT/PLASTER' | 259 |\n", + "+---------------------------------------------+--------+\n", + "| 'Radioactive Material' | 32 |\n", + "+---------------------------------------------+--------+\n", + "| 'Summer Camp' | 218 |\n", + "+---------------------------------------------+--------+\n", + "| 'For Hire Vehicle Report' | 17 |\n", + "+---------------------------------------------+--------+\n", + "| 'Panhandling' | 595 |\n", + "+---------------------------------------------+--------+\n", + "| 'Legal Services Provider Complaint' | 61 |\n", + "+---------------------------------------------+--------+\n", + "| 'Unsanitary Animal Pvt Property' | 2076 |\n", + "+---------------------------------------------+--------+\n", + "| 'Bike Rack Condition' | 381 |\n", + "+---------------------------------------------+--------+\n", + "| 'Weatherization' | 488 |\n", + "+---------------------------------------------+--------+\n", + "| 'Home Delivered Meal Complaint' | 146 |\n", + "+---------------------------------------------+--------+\n", + "| 'Emergency Response Team (ERT)' | 89 |\n", + "+---------------------------------------------+--------+\n", + "| 'Illegal Animal Kept as Pet' | 548 |\n", + "+---------------------------------------------+--------+\n", + "| 'DOF Property - 
Request Copy' | 12693 |\n", + "+---------------------------------------------+--------+\n", + "| 'Animal Facility - No Permit' | 121 |\n", + "+---------------------------------------------+--------+\n", + "| 'Utility Program' | 587 |\n", + "+---------------------------------------------+--------+\n", + "| 'Squeegee' | 16 |\n", + "+---------------------------------------------+--------+\n", + "| 'DEP Street Condition' | 1 |\n", + "+---------------------------------------------+--------+\n", + "| 'Home Care Provider Complaint' | 56 |\n", + "+---------------------------------------------+--------+\n", + "| 'Investigations and Discipline (IAD)' | 3 |\n", + "+---------------------------------------------+--------+\n", + "| 'SCRIE' | 30933 |\n", + "+---------------------------------------------+--------+\n", + "| 'Electrical' | 24 |\n", + "+---------------------------------------------+--------+\n", + "| 'Urinating in Public' | 1154 |\n", + "+---------------------------------------------+--------+\n", + "| 'PAINT - PLASTER' | 1 |\n", + "+---------------------------------------------+--------+\n", + "| 'Homeless Person Assistance' | 354 |\n", + "+---------------------------------------------+--------+\n", + "| 'Water Conservation' | 35 |\n", + "+---------------------------------------------+--------+\n", + "| 'Noise - Residential' | 404057 |\n", + "+---------------------------------------------+--------+\n", + "| 'Street Sign - Missing' | 12170 |\n", + "+---------------------------------------------+--------+\n", + "| 'City Vehicle Placard Complaint' | 13 |\n", + "+---------------------------------------------+--------+\n", + "| 'OUTSIDE BUILDING' | 3 |\n", + "+---------------------------------------------+--------+\n", + "| 'FLOORING/STAIRS' | 88 |\n", + "+---------------------------------------------+--------+\n", + "| 'Portable Toilet' | 4 |\n", + "+---------------------------------------------+--------+\n", + "| 'Housing Options' | 3744 |\n", + 
"+---------------------------------------------+--------+\n", + "| 'Building Marshals office' | 5 |\n", + "+---------------------------------------------+--------+\n", + "| 'DOF Parking - Request Copy' | 2411 |\n", + "+---------------------------------------------+--------+\n", + "| 'Posting Advertisement' | 448 |\n", + "+---------------------------------------------+--------+\n", + "| 'Parent Leadership' | 1 |\n", + "+---------------------------------------------+--------+\n", + "| 'Sewer' | 848 |\n", + "+---------------------------------------------+--------+\n", + "| 'Construction' | 14876 |\n", + "+---------------------------------------------+--------+\n", + "| 'DOOR/WINDOW' | 244 |\n", + "+---------------------------------------------+--------+\n", + "| 'Parking Card' | 113 |\n", + "+---------------------------------------------+--------+\n", + "| 'Recycling Enforcement' | 4 |\n", + "+---------------------------------------------+--------+\n", + "| 'LinkNYC' | 4 |\n", + "+---------------------------------------------+--------+\n", + "| 'Mobile Food Vendor' | 29 |\n", + "+---------------------------------------------+--------+\n", + "| 'SAFETY' | 80 |\n", + "+---------------------------------------------+--------+\n", + "| 'Taxpayer Advocate Inquiry' | 1 |\n", + "+---------------------------------------------+--------+\n", + "| 'Special Projects Inspection Team (SPIT)' | 78 |\n", + "+---------------------------------------------+--------+\n", + "| 'DHS Income Savings Requirement' | 91 |\n", + "+---------------------------------------------+--------+\n", + "| 'Food Establishment' | 19238 |\n", + "+---------------------------------------------+--------+\n", + "| 'Drinking' | 3735 |\n", + "+---------------------------------------------+--------+\n", + "| 'ELECTRIC' | 220 |\n", + "+---------------------------------------------+--------+\n", + "| 'Bereavement Support Group' | 74 |\n", + "+---------------------------------------------+--------+\n", + "| 'DOF Parking 
- Tax Exemption' | 1110 |\n", + "+---------------------------------------------+--------+\n", + "| 'Derelict Vehicles' | 779 |\n", + "+---------------------------------------------+--------+\n", + "| 'Water System' | 1260 |\n", + "+---------------------------------------------+--------+\n", + "| 'DHS Advantage -Landlord/Broker' | 11752 |\n", + "+---------------------------------------------+--------+\n", + "| 'Pet Shop' | 7 |\n", + "+---------------------------------------------+--------+\n", + "| 'For Hire Vehicle Complaint' | 7135 |\n", + "+---------------------------------------------+--------+\n", + "| 'Boilers' | 46 |\n", + "+---------------------------------------------+--------+\n", + "| 'Building Condition' | 11 |\n", + "+---------------------------------------------+--------+\n", + "| 'Traffic Signal Condition' | 670 |\n", + "+---------------------------------------------+--------+\n", + "| 'Street Sign - Dangling' | 5851 |\n", + "+---------------------------------------------+--------+\n", + "| 'Other Enforcement' | 15 |\n", + "+---------------------------------------------+--------+\n", + "| 'Highway Condition' | 8130 |\n", + "+---------------------------------------------+--------+\n", + "| 'UNSANITARY CONDITION' | 670 |\n", + "+---------------------------------------------+--------+\n", + "| 'Ferry Inquiry' | 1011 |\n", + "+---------------------------------------------+--------+\n", + "| 'HEAT/HOT WATER' | 7952 |\n", + "+---------------------------------------------+--------+\n", + "| 'Graffiti' | 1025 |\n", + "+---------------------------------------------+--------+\n", + "| 'Blocked Driveway' | 158577 |\n", + "+---------------------------------------------+--------+\n", + "| 'Bus Stop Shelter Complaint' | 3369 |\n", + "+---------------------------------------------+--------+\n", + "| 'DOF Parking - Request Status' | 3430 |\n", + "+---------------------------------------------+--------+\n", + "| 'School Maintenance' | 3245 |\n", + 
"+---------------------------------------------+--------+\n", + "| 'ATF' | 2 |\n", + "+---------------------------------------------+--------+\n", + "| 'Case Management Agency Complaint' | 61 |\n", + "+---------------------------------------------+--------+\n", + "| 'Home Repair' | 1659 |\n", + "+---------------------------------------------+--------+\n", + "| 'Rodent' | 475 |\n", + "+---------------------------------------------+--------+\n", + "| 'Lost Property' | 324 |\n", + "+---------------------------------------------+--------+\n", + "| 'OEM Literature Request' | 1103 |\n", + "+---------------------------------------------+--------+\n", + "| 'Industrial Waste' | 23 |\n", + "+---------------------------------------------+--------+\n", + "| 'Public Toilet' | 133 |\n", + "+---------------------------------------------+--------+\n", + "| 'Tunnel Condition' | 12 |\n", + "+---------------------------------------------+--------+\n", + "| 'ELEVATOR' | 11 |\n", + "+---------------------------------------------+--------+\n", + "| 'Violation of Park Rules' | 4185 |\n", + "+---------------------------------------------+--------+\n", + "| 'Taxi Compliment' | 1026 |\n", + "+---------------------------------------------+--------+\n", + "| 'Indoor Air Quality' | 11672 |\n", + "+---------------------------------------------+--------+\n", + "| 'Damaged Tree' | 92930 |\n", + "+---------------------------------------------+--------+\n", + "| 'Noise Survey' | 250 |\n", + "+---------------------------------------------+--------+\n", + "| 'Cranes and Derricks' | 4 |\n", + "+---------------------------------------------+--------+\n", + "| 'Derelict Vehicle' | 30603 |\n", + "+---------------------------------------------+--------+\n", + "| 'Safety' | 1 |\n", + "+---------------------------------------------+--------+\n", + "| 'Elevator' | 371 |\n", + "+---------------------------------------------+--------+\n", + "| 'DOF Property - Property Value' | 110 |\n", + 
"+---------------------------------------------+--------+\n", + "| 'DOF Property - Reduction Issue' | 21365 |\n", + "+---------------------------------------------+--------+\n", + "| 'Illegal Animal Sold' | 72 |\n", + "+---------------------------------------------+--------+\n", + "| 'Highway Sign - Dangling' | 49 |\n", + "+---------------------------------------------+--------+\n", + "| 'Disorderly Youth' | 2078 |\n", + "+---------------------------------------------+--------+\n", + "| 'PLUMBING' | 511 |\n", + "+---------------------------------------------+--------+\n", + "| 'Calorie Labeling' | 17 |\n", + "+---------------------------------------------+--------+\n", + "| 'Housing - Low Income Senior' | 11013 |\n", + "+---------------------------------------------+--------+\n", + "| 'Illegal Tree Damage' | 5255 |\n", + "+---------------------------------------------+--------+\n", + "| 'Found Property' | 219 |\n", + "+---------------------------------------------+--------+\n", + "| 'Municipal Parking Facility' | 165 |\n", + "+---------------------------------------------+--------+\n", + "| 'Missed Collection (All Materials)' | 1351 |\n", + "+---------------------------------------------+--------+\n", + "| 'DOF Parking - DMV Clearance' | 506 |\n", + "+---------------------------------------------+--------+\n", + "| 'Elder Abuse' | 1521 |\n", + "+---------------------------------------------+--------+\n", + "| 'Food Poisoning' | 7287 |\n", + "+---------------------------------------------+--------+\n", + "| 'Broken Muni Meter' | 71519 |\n", + "+---------------------------------------------+--------+\n", + "| 'Mold' | 898 |\n", + "+---------------------------------------------+--------+\n", + "| 'APPLIANCE' | 147 |\n", + "+---------------------------------------------+--------+\n", + "| 'NORC Complaint' | 3 |\n", + "+---------------------------------------------+--------+\n", + "| 'Registration and Transfers' | 7 |\n", + 
"+---------------------------------------------+--------+\n", + "| 'General Construction/Plumbing' | 987 |\n", + "+---------------------------------------------+--------+\n", + "| 'Special Natural Area District (SNAD)' | 2 |\n", + "+---------------------------------------------+--------+\n", + "| 'Construction Safety Enforcement' | 51 |\n", + "+---------------------------------------------+--------+\n", + "| 'Indoor Sewage' | 2475 |\n", + "+---------------------------------------------+--------+\n", + "| 'Building/Use' | 520 |\n", + "+---------------------------------------------+--------+\n", + "| 'DHS Advantage - Tenant' | 12835 |\n", + "+---------------------------------------------+--------+\n", + "| 'Bike/Roller/Skate Chronic' | 1304 |\n", + "+---------------------------------------------+--------+\n", + "| 'Noise - Street/Sidewalk' | 72362 |\n", + "+---------------------------------------------+--------+\n", + "| 'Teaching/Learning/Instruction' | 7 |\n", + "+---------------------------------------------+--------+\n", + "| 'Borough Office' | 21 |\n", + "+---------------------------------------------+--------+\n", + "| 'Drinking Water' | 270 |\n", + "+---------------------------------------------+--------+\n", + "| 'Noise - Park' | 9283 |\n", + "+---------------------------------------------+--------+\n", + "| 'DCA / DOH New License Application Request' | 2301 |\n", + "+---------------------------------------------+--------+\n", + "| 'Non-Residential Heat' | 861 |\n", + "+---------------------------------------------+--------+\n", + "| 'Day Care' | 9 |\n", + "+---------------------------------------------+--------+\n", + "| 'Unsanitary Pigeon Condition' | 537 |\n", + "+---------------------------------------------+--------+\n", + "| 'Air Quality' | 131 |\n", + "+---------------------------------------------+--------+\n", + "| 'Noise - Vehicle' | 41089 |\n", + "+---------------------------------------------+--------+\n", + "| 'Dead Tree' | 26331 |\n", + 
"+---------------------------------------------+--------+\n", + "| 'Dirty Conditions' | 579 |\n", + "+---------------------------------------------+--------+\n", + "| 'Consumer Complaint' | 50527 |\n", + "+---------------------------------------------+--------+\n", + "| 'Homeless Encampment' | 6881 |\n", + "+---------------------------------------------+--------+\n", + "| 'DOF Property - Payment Issue' | 9600 |\n", + "+---------------------------------------------+--------+\n", + "| 'Executive Inspections' | 6 |\n", + "+---------------------------------------------+--------+\n", + "| 'Water Quality' | 107 |\n", + "+---------------------------------------------+--------+\n", + "| 'Smoking' | 1690 |\n", + "+---------------------------------------------+--------+\n", + "| 'Sustainability Enforcement' | 13 |\n", + "+---------------------------------------------+--------+\n", + "| 'DOF Property - City Rebate' | 335 |\n", + "+---------------------------------------------+--------+\n", + "| 'Lead' | 4243 |\n", + "+---------------------------------------------+--------+\n", + "| 'Asbestos' | 1419 |\n", + "+---------------------------------------------+--------+\n", + "| 'Animal Abuse' | 4536 |\n", + "+---------------------------------------------+--------+\n", + "| 'Street Light Condition' | 1090 |\n", + "+---------------------------------------------+--------+\n", + "| 'Root/Sewer/Sidewalk Condition' | 20992 |\n", + "+---------------------------------------------+--------+\n", + "| 'Traffic' | 8982 |\n", + "+---------------------------------------------+--------+\n", + "| 'Street Sign - Damaged' | 22554 |\n", + "+---------------------------------------------+--------+\n", + "| 'DOF Property - Owner Issue' | 20120 |\n", + "+---------------------------------------------+--------+\n", + "| 'Advocate-Personal Exemptions' | 4 |\n", + "+---------------------------------------------+--------+\n", + "| 'Senior Center Complaint' | 790 |\n", + 
"+---------------------------------------------+--------+\n", + "| 'Vacant Lot' | 8 |\n", + "+---------------------------------------------+--------+\n", + "| 'DOF Property - RPIE Issue' | 507 |\n", + "+---------------------------------------------+--------+\n", + "| 'Taxi Report' | 37 |\n", + "+---------------------------------------------+--------+\n", + "| 'Standing Water' | 2427 |\n", + "+---------------------------------------------+--------+\n", + "| 'Derelict Bicycle' | 51 |\n", + "+---------------------------------------------+--------+\n", + "| 'Poison Ivy' | 360 |\n", + "+---------------------------------------------+--------+\n", + "| 'Scaffold Safety' | 3 |\n", + "+---------------------------------------------+--------+\n", + "| 'Maintenance or Facility' | 23440 |\n", + "+---------------------------------------------+--------+\n", + "| 'Ferry Permit' | 51 |\n", + "+---------------------------------------------+--------+\n", + "| 'Snow' | 16 |\n", + "+---------------------------------------------+--------+\n" + ] + } + ], + "source": [ + "ds.aggregateByKey(combine_udf, aggregate_udf, 0, [\"Complaint Type\"]).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CnkMPDVOXZmo" + }, + "source": [ + "To see what the most common complaint is, let's sort the output:" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "id": "u7JG8-TpXYsS" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:logical planner:logical optimization took 0.003317ms\n", + "INFO:codegen:generating pipeline for 
(Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[i64],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[f64],Option[f64],Option[str]) -> (Option[str],i64) (1 operator pipelined)\n", + "INFO:codegen:generating function combine_udf for (i64,i64) -> i64\n", + "INFO:codegen:generating function aggregate_udf for (i64,(Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[i64],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[f64],Option[f64],Option[str])) -> i64\n", + "WARNING:history server:Could not register job, is history server running? To disable this warning, set webui=False in the context configuration.\n", + "INFO:global:provided option to explicitly merge bad rows in order back, however rows will be hashed. Disabling option. 
To silence this warning, set\n", + "tuplex.optimizer.mergeExceptionsInOrder=false\n", + "INFO:global:Optimization via LLVM passes took 0.099010 ms\n", + "INFO:global:starting code compilation\n", + "INFO:global:first compile done\n", + "INFO:global:functor Stage_0 retrieved from llvm\n", + "INFO:global:retrieving init/release stage functors\n", + "INFO:global:Compiled code paths for stage 0 in 0.03 ms\n", + "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.136291s\n", + "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.346857s (0 normal rows, 61 exceptions, 129 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.350948s (0 normal rows, 31 exceptions, 137 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.269277s (0 normal rows, 32 exceptions, 136 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.268737s (0 normal rows, 62 exceptions, 128 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.267930s (0 normal rows, 46 exceptions, 145 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.292654s (0 normal rows, 33 exceptions, 200 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.274232s (0 normal rows, 60 exceptions, 142 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.295006s (0 normal rows, 48 exceptions, 142 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.269720s (0 normal rows, 41 exceptions, 143 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.279908s (0 normal rows, 1 exception, 138 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.267421s (0 normal rows, 5 exceptions, 140 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash 
table in 0.306931s (0 normal rows, 15 exceptions, 135 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.266008s (0 normal rows, 4 exceptions, 123 buckets)\n", + "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.280537s (0 normal rows, 37 exceptions, 117 buckets)\n", + "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.438606s (0 normal rows, 34 exceptions, 126 buckets)\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 load&transform tasks in 2.42753s\n", + "INFO:global:[Transform Stage] Stage 0 total wall clock time: 4.47477s, 1907219 input rows, time to process 1 row via fast path: 0.00234623ms\n", + "INFO:global:Exception details: \n", + "+------------+-------------------------------------+-------+\n", + "| OperatorID | Exception | Count |\n", + "+------------+-------------------------------------+-------+\n", + "| 100135 | tuplex.internal.BadParseStringInput | 510 |\n", + "+------------+-------------------------------------+-------+\n", + "INFO:local ee:created combined normal-case result in 0.001441s\n", + "INFO:local ee:compiled pure python pipeline in 0.001211s\n", + "INFO:local ee:creating hybrid intermediates took 0.000004s\n", + "INFO:local ee:Created 15 resolve tasks in 0.000370s\n", + "INFO:local ee:15/15 tasks require executing the slow path.\n", + "INFO:driver:[Task Finished] Resolve in 0.018784s\n", + "INFO:driver:[Task Finished] Resolve in 0.020323s\n", + "INFO:E/1:[Task Finished] Resolve in 0.054611s\n", + "INFO:E/1:[Task Finished] Resolve in 0.025628s\n", + "INFO:E/1:[Task Finished] Resolve in 0.024868s\n", + "INFO:driver:[Task Finished] Resolve in 0.063070s\n", + "INFO:E/1:[Task Finished] Resolve in 0.049932s\n", + "INFO:E/1:[Task Finished] Resolve in 0.013301s\n", + "INFO:driver:[Task Finished] Resolve in 0.051306s\n", + "INFO:E/1:[Task Finished] Resolve in 0.000610s\n", + "INFO:E/1:[Task Finished] Resolve in 0.006864s\n", + "INFO:E/1:[Task Finished] Resolve in 0.003635s\n", 
+ "INFO:driver:[Task Finished] Resolve in 0.021057s\n", + "INFO:E/1:[Task Finished] Resolve in 0.028195s\n", + "INFO:driver:[Task Finished] Resolve in 0.019220s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Housing Options\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Housing - Low Income Senior\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Sidewalk Condition\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Broken Parking Meter\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Housing - Low Income Senior\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Housing Options\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Curb Condition\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Street Condition\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Broken Muni Meter\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=SCRIE\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=SCRIE\n", + "setdefault w. key=SCRIE\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Housing Options\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Food Establishment\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Sidewalk Condition\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Housing Options\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Damaged Tree\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Sidewalk Condition\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Housing - Low Income Senior\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=SCRIE\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Housing - Low Income Senior\n", + "setdefault w. key=Housing Options\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=SCRIE\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Sidewalk Condition\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Broken Muni Meter\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=For Hire Vehicle Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Alzheimer's Care\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=SCRIE\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Housing - Low Income Senior\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Alzheimer's Care\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Housing - Low Income Senior\n", + "setdefault w. key=Housing Options\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Food Establishment\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. 
key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Taxi Complaint\n", + "setdefault w. key=Consumer Complaint\n", + "setdefault w. key=SCRIE\n", + "setdefault w. key=SCRIE\n", + "setdefault w. key=Consumer Complaint\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:local ee:slow path resolved 510/510 exceptions in 0.223588s\n", + "INFO:local ee:slow path for Stage 0: total wall clock time: 0.401403s, time to process 1 row via slow path: 0.787065ms\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 resolve tasks in 0.223618s\n", + "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 0.00194741s\n", + "INFO:global:[Transform Stage] Stage 0 took 2.78945s\n", + "INFO:global:[Transform Stage] skipped stage 1 because there is nothing todo here.\n", + "INFO:global:Query Execution took 2.81643s. (planning: 0.0238355s, execution: 2.7926s)\n", + "INFO:python:Data transfer back to Python took 0.001298 seconds\n" + ] + }, + { + "data": { + "text/plain": [ + "[('Mosquitoes', 4),\n", + " ('DOF Parking - Payment Issue', 19487),\n", + " ('DOF Property - Update Account', 65),\n", + " ('Street Condition', 95585),\n", + " ('Trans Fat', 6)]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = ds.aggregateByKey(combine_udf, aggregate_udf, 0, [\"Complaint Type\"]).collect()\n", + "\n", + "sorted(data, key=lambda x: x[1])\n", + "\n", + "data[:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pbHtmBoaYqQD" + }, + "source": [ + "As we can see, ?? is the most common complaint." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7_iruiYiiWmd" + }, + "source": [ + "(c) 2017 - 2022 Tuplex team" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UyneYgH5XwQz" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/tuplex/test/adapters/cpython/PythonHelperTest.cc b/tuplex/test/adapters/cpython/PythonHelperTest.cc index db070c9bc..5c071ee12 100644 --- a/tuplex/test/adapters/cpython/PythonHelperTest.cc +++ b/tuplex/test/adapters/cpython/PythonHelperTest.cc @@ -391,18 +391,24 @@ TEST_F(PythonHelperTest, FunctionGlobals) { PyObject* pyFunc = python::runAndGet(code, "func"); + // convert globals object to lookup dictionary + unordered_map> globals; + auto pyGlobals = PyFunction_GetGlobals(pyFunc); + + // fetch globals, locals cout<> globals; - auto pyGlobals = PyFunction_GetGlobals(pyFunc); // iterate over dictionary PyObject *key = nullptr, *val = nullptr; Py_ssize_t pos = 0; // must be initialized to 0 to start iteration, however internal iterator variable. Don't use semantically. while(PyDict_Next(pyGlobals, &pos, &key, &val)) { + // b. 
key/value will be used twice, inc refcount by one + Py_XINCREF(key); + Py_XINCREF(val); auto curKeyType = mapPythonClassToTuplexType(key, false); auto curValType = mapPythonClassToTuplexType(val, false); assert(curKeyType == python::Type::STRING); @@ -416,11 +422,10 @@ TEST_F(PythonHelperTest, FunctionGlobals) { for(auto item : globals) cout<(item.second).desc()<<"]: "<(item.second)< Date: Fri, 30 Sep 2022 19:43:43 -0400 Subject: [PATCH 05/14] checking --- .../adapters/cpython/include/PythonHelpers.h | 11 + tuplex/adapters/cpython/src/PythonGIL.cc | 98 +++- tuplex/core/src/ee/local/LocalBackend.cc | 10 +- tuplex/core/src/physical/ResolveTask.cc | 531 +++++++++--------- tuplex/test/core/AssertAndRaise.cc | 13 +- 5 files changed, 382 insertions(+), 281 deletions(-) diff --git a/tuplex/adapters/cpython/include/PythonHelpers.h b/tuplex/adapters/cpython/include/PythonHelpers.h index b3abf40a9..a3246f2cf 100644 --- a/tuplex/adapters/cpython/include/PythonHelpers.h +++ b/tuplex/adapters/cpython/include/PythonHelpers.h @@ -354,6 +354,17 @@ namespace python { */ extern void unlockGIL(); +#ifndef NDEBUG + inline void checkPythonIntegrity() { + python::lockGIL(); + if(PyErr_Occurred()) { + std::cerr<<"internal python error"< #include +#include + namespace python { // GIL details: @@ -27,37 +29,87 @@ namespace python { ss.flush(); auto thread_id = ss.str(); int64_t id = -1; +#ifndef LINUX sscanf(thread_id.c_str(), "%lld", &id); +#else + sscanf(thread_id.c_str(), "%ld", &id); +#endif return id; } // GIL management here static std::atomic_bool gil(false); // true if a thread holds the gil, false else static std::mutex gilMutex; // access to all the properties below + PyGILState_STATE gstate; // for non-main thread lock + + // cf. https://pythonextensionpatterns.readthedocs.io/en/latest/thread_safety.html#f1 + static PyThread_type_lock gil_lock(nullptr); + + static void acquire_lock() { + // lazy init lock -> called on first entry. 
+ if(!gil_lock) { + gil_lock = PyThread_allocate_lock(); + if(!gil_lock) { + std::cerr<<"failed to initialize lock"< convert to uint64_t and use this for thread safe access static std::atomic_int64_t gilID(-1); // id of thread who holds gil static std::atomic_int64_t interpreterID(-1); // thread which holds the interpreter static std::atomic_bool interpreterInitialized(false); // checks whether interpreter is initialized or not + std::thread::id gil_main_thread_id; // vars for python management static std::atomic gilState(nullptr); void lockGIL() { - gilMutex.lock(); - assert(gilState); - PyEval_RestoreThread(gilState); // acquires GIL! + gilMutex.lock(); // <-- acquire the managing lock. No other thread can lock the gil! => what if another thread tries to unlock? -> security concern... + + // what is the current thread id? is it the main thread? => then lock the gil via restore thread etc. + // if not, need to use GILState_Ensure + if(std::this_thread::get_id() == gil_main_thread_id) { + if(!gilState) + gilState = PyGILState_GetThisThreadState(); + assert(gilState); + PyEval_RestoreThread(gilState); // acquires GIL! + } else { + assert(interpreterInitialized); + gstate = PyGILState_Ensure(); + } + assert(PyGILState_Check()); gil = true; + gilState = nullptr; gilID = thisThreadID(); } void unlockGIL() { - gilMutex.unlock(); + // is it the main thread? and does it hold the manipulation lock? + if(std::this_thread::get_id() == gil_main_thread_id) { + gilState = PyEval_SaveThread(); + } else { + assert(interpreterInitialized); + PyGILState_Release(gstate); + gstate = PyGILState_UNLOCKED; + } gil = false; - gilState = PyEval_SaveThread(); gilID = thisThreadID(); + gilMutex.unlock(); } bool holdsGIL() { @@ -65,14 +117,17 @@ namespace python { } void acquireGIL() { - gilMutex.lock(); - PyEval_AcquireLock(); - PyEval_AcquireThread(gilState); // acquires GIL! 
- gil = true; - gilID = thisThreadID(); +// gilMutex.lock(); +// // PyEval_AcquireLock(); +// PyEval_AcquireThread(gilState); // acquires GIL! +// gil = true; +// gilID = thisThreadID(); + std::cerr<<"acquire GIL is deprecated"<resolveFunctor || !tstage->purePythonCode().empty()) { using namespace std; @@ -1284,14 +1287,16 @@ namespace tuplex { auto input_intermediates = tstage->initData(); // lazy init hybrids - if(!input_intermediates.hybrids) { + if(!input_intermediates.hybrids && input_intermediates.numArgs > 0) { auto num_predecessors = tstage->predecessors().size(); input_intermediates.hybrids = new PyObject*[num_predecessors]; // @TODO: free these intermediates. Where is this done? for(int i = 0; i < num_predecessors; ++i) input_intermediates.hybrids[i] = nullptr; + } else { + assert(input_intermediates.hybrids == nullptr); + input_intermediates.hybrids = nullptr; } - python::lockGIL(); // construct intermediates from predecessors @@ -1321,7 +1326,6 @@ namespace tuplex { python::unlockGIL(); // check whether hybrids exist. If not, create them quickly! - assert(input_intermediates.hybrids); logger().info("creating hybrid intermediates took " + std::to_string(timer.time()) + "s"); timer.reset(); diff --git a/tuplex/core/src/physical/ResolveTask.cc b/tuplex/core/src/physical/ResolveTask.cc index 0e03a0bcf..656196a71 100644 --- a/tuplex/core/src/physical/ResolveTask.cc +++ b/tuplex/core/src/physical/ResolveTask.cc @@ -457,270 +457,287 @@ namespace tuplex { // fallback 2: interpreter path // --> only go there if a non-true exception was recorded. Else, it will be dealt with above if(resCode == -1 && _interpreterFunctor) { + assert(!python::holdsGIL()); + // acquire GIL python::lockGIL(); - PyCallable_Check(_interpreterFunctor); - - // holds the pythonized data - PyObject* tuple = nullptr; - - bool parse_cells = false; - - // there are different data reps for certain error codes. - // => decode the correct object from memory & then feed it into the pipeline... 
- if(ecCode == ecToI64(ExceptionCode::BADPARSE_STRING_INPUT)) { - // it's a string! - tuple = tupleFromParseException(ebuf, eSize); - parse_cells = true; // need to parse cells in python mode. - } else if(ecCode == ecToI64(ExceptionCode::NORMALCASEVIOLATION)) { - // changed, why are these names so random here? makes no sense... - auto row = Row::fromMemory(exceptionsInputSchema(), ebuf, eSize); - - tuple = python::rowToPython(row, true); - parse_cells = false; - // called below... - } else if (ecCode == ecToI64(ExceptionCode::PYTHON_PARALLELIZE)) { - auto pyObj = python::deserializePickledObject(python::getMainModule(), (char *) ebuf, eSize); - tuple = pyObj; - parse_cells = false; - } else { - // normal case, i.e. an exception occurred somewhere. - // --> this means if pipeline is using string as input, we should convert - auto row = Row::fromMemory(exceptionsInputSchema(), ebuf, eSize); - - // cell source automatically takes input, i.e. no need to convert. simply get tuple from row object - tuple = python::rowToPython(row, true); - -#ifndef NDEBUG - if(PyTuple_Check(tuple)) { - // make sure tuple is valid... - for(unsigned i = 0; i < PyTuple_Size(tuple); ++i) { - auto elemObj = PyTuple_GET_ITEM(tuple, i); - assert(elemObj); - } - } -#endif - parse_cells = false; - } - // compute - // @TODO: we need to encode the hashmaps as these hybrid objects! - // ==> for more efficiency we prob should store one per executor! - // the same goes for any hashmap... - - assert(tuple); -#ifndef NDEBUG - if(!tuple) { - owner()->error("bad decode, using () as dummy..."); - tuple = PyTuple_New(0); // empty tuple. + // check there is no errror! + if(PyErr_Occurred()) { + std::cerr<<"python error!"< 1) { - // nothing todo... 
- } else { - auto tmp_tuple = PyTuple_New(1); - PyTuple_SET_ITEM(tmp_tuple, 0, tuple); - tuple = tmp_tuple; - } - -#ifndef NDEBUG - // // to print python object - // Py_XINCREF(tuple); - // PyObject_Print(tuple, stdout, 0); - // std::cout< note: unify handling this with the other cases... - assert(_htable->hybrid_hm); - Py_XINCREF(_htable->hybrid_hm); - PyTuple_SET_ITEM(args, num_python_args - 1, _htable->hybrid_hm); - } - - auto kwargs = PyDict_New(); PyDict_SetItemString(kwargs, "parse_cells", python::boolean(parse_cells)); - auto pcr = python::callFunctionEx(_interpreterFunctor, args, kwargs); - - if(pcr.exceptionCode != ExceptionCode::SUCCESS) { - // this should not happen, bad internal error. codegen'ed python should capture everything. - owner()->error("bad internal python error: " + pcr.exceptionMessage); - python::unlockGIL(); - return; - } else { - // all good, row is fine. exception occurred? - assert(pcr.res); - - // type check: save to regular rows OR save to python row collection - if(!pcr.res) { - owner()->error("bad internal python error, NULL object returned"); - } else { - -#ifndef NDEBUG - // // uncomment to print res obj - // Py_XINCREF(pcr.res); - // PyObject_Print(pcr.res, stdout, 0); - // std::cout<(cptr), strlen(cptr), BUF_FORMAT_NORMAL_OUTPUT); // don't write '\0'! - } else { - - // there are three options where to store the result now - - // 1. fits targetOutputSchema (i.e. row becomes normalcase row) - bool outputAsNormalRow = python::Type::UNKNOWN != unifyTypes(rowType, _targetOutputSchema.getRowType(), _allowNumericTypeUnification) - && canUpcastToRowType(rowType, _targetOutputSchema.getRowType()); - // 2. fits generalCaseOutputSchema (i.e. row becomes generalcase row) - bool outputAsGeneralRow = python::Type::UNKNOWN != unifyTypes(rowType, - commonCaseOutputSchema().getRowType(), _allowNumericTypeUnification) - && canUpcastToRowType(rowType, commonCaseOutputSchema().getRowType()); - - // 3. doesn't fit, store as python object. 
=> we should use block storage for this as well. Then data can be shared. - - // can upcast? => note that the && is necessary because of cases where outputSchema is - // i64 but the given row type f64. We can cast up i64 to f64 but not the other way round. - if(outputAsNormalRow) { - Row resRow = python::pythonToRow(rowObj).upcastedRow(_targetOutputSchema.getRowType()); - assert(resRow.getRowType() == _targetOutputSchema.getRowType()); - - // write to buffer & perform callback - auto buf_size = 2 * resRow.serializedLength(); - uint8_t *buf = new uint8_t[buf_size]; - memset(buf, 0, buf_size); - auto serialized_length = resRow.serializeToMemory(buf, buf_size); - // call row func! - // --> merge row distinguishes between those two cases. Distinction has to be done there - // because of compiled functor who calls mergeRow in the write function... - mergeRow(buf, serialized_length, BUF_FORMAT_NORMAL_OUTPUT); - delete [] buf; - } else if(outputAsGeneralRow) { - Row resRow = python::pythonToRow(rowObj).upcastedRow(commonCaseOutputSchema().getRowType()); - assert(resRow.getRowType() == commonCaseOutputSchema().getRowType()); - - // write to buffer & perform callback - auto buf_size = 2 * resRow.serializedLength(); - uint8_t *buf = new uint8_t[buf_size]; - memset(buf, 0, buf_size); - auto serialized_length = resRow.serializeToMemory(buf, buf_size); - // call row func! - // --> merge row distinguishes between those two cases. Distinction has to be done there - // because of compiled functor who calls mergeRow in the write function... 
- mergeRow(buf, serialized_length, BUF_FORMAT_GENERAL_OUTPUT); - delete [] buf; - } else { - // Unwrap single element tuples before writing them to the fallback sink - if(PyTuple_Check(rowObj) && PyTuple_Size(rowObj) == 1) { - writePythonObjectToFallbackSink(PyTuple_GetItem(rowObj, 0)); - } else { - writePythonObjectToFallbackSink(rowObj); - } - } - // Py_XDECREF(rowObj); - } - } - -#ifndef NDEBUG - if(PyErr_Occurred()) { - // print out the otber objects... - std::cout<<__FILE__<<":"<<__LINE__<<" python error not cleared properly!"< decode the correct object from memory & then feed it into the pipeline... +// if(ecCode == ecToI64(ExceptionCode::BADPARSE_STRING_INPUT)) { +// // it's a string! +// tuple = tupleFromParseException(ebuf, eSize); +// parse_cells = true; // need to parse cells in python mode. +// } else if(ecCode == ecToI64(ExceptionCode::NORMALCASEVIOLATION)) { +// // changed, why are these names so random here? makes no sense... +// auto row = Row::fromMemory(exceptionsInputSchema(), ebuf, eSize); +// +// tuple = python::rowToPython(row, true); +// parse_cells = false; +// // called below... +// } else if (ecCode == ecToI64(ExceptionCode::PYTHON_PARALLELIZE)) { +// auto pyObj = python::deserializePickledObject(python::getMainModule(), (char *) ebuf, eSize); +// tuple = pyObj; +// parse_cells = false; +// } else { +// // normal case, i.e. an exception occurred somewhere. +// // --> this means if pipeline is using string as input, we should convert +// auto row = Row::fromMemory(exceptionsInputSchema(), ebuf, eSize); +// +// // cell source automatically takes input, i.e. no need to convert. simply get tuple from row object +// tuple = python::rowToPython(row, true); +// +//#ifndef NDEBUG +// if(PyTuple_Check(tuple)) { +// // make sure tuple is valid... 
+// for(unsigned i = 0; i < PyTuple_Size(tuple); ++i) { +// auto elemObj = PyTuple_GET_ITEM(tuple, i); +// assert(elemObj); +// } +// } +//#endif +// parse_cells = false; +// } +// +// // compute +// // @TODO: we need to encode the hashmaps as these hybrid objects! +// // ==> for more efficiency we prob should store one per executor! +// // the same goes for any hashmap... +// +// assert(tuple); +//#ifndef NDEBUG +// if(!tuple) { +// owner()->error("bad decode, using () as dummy..."); +// tuple = PyTuple_New(0); // empty tuple. +// } +//#endif +// +// // note: current python pipeline always expects a tuple arg. hence pack current element. +// if(PyTuple_Check(tuple) && PyTuple_Size(tuple) > 1) { +// // nothing todo... +// } else { +// auto tmp_tuple = PyTuple_New(1); +// PyTuple_SET_ITEM(tmp_tuple, 0, tuple); +// tuple = tmp_tuple; +// } +// +//#ifndef NDEBUG +// // // to print python object +// // Py_XINCREF(tuple); +// // PyObject_Print(tuple, stdout, 0); +// // std::cout< note: unify handling this with the other cases... +// assert(_htable->hybrid_hm); +// Py_XINCREF(_htable->hybrid_hm); +// PyTuple_SET_ITEM(args, num_python_args - 1, _htable->hybrid_hm); +// } +// +// auto kwargs = PyDict_New(); +// auto py_parse_cells = python::boolean(parse_cells); +// PyDict_SetItemString(kwargs, "parse_cells", py_parse_cells); +// auto pcr = python::callFunctionEx(_interpreterFunctor, args, kwargs); +// +// if(pcr.exceptionCode != ExceptionCode::SUCCESS) { +// // this should not happen, bad internal error. codegen'ed python should capture everything. +// owner()->error("bad internal python error: " + pcr.exceptionMessage); +// python::unlockGIL(); +// return; +// } else { +// // all good, row is fine. exception occurred? 
+// assert(pcr.res); +// +// // type check: save to regular rows OR save to python row collection +// if(!pcr.res) { +// owner()->error("bad internal python error, NULL object returned"); +// } else { +// +//#ifndef NDEBUG +// // // uncomment to print res obj +// // Py_XINCREF(pcr.res); +// // PyObject_Print(pcr.res, stdout, 0); +// // std::cout<(cptr), strlen(cptr), BUF_FORMAT_NORMAL_OUTPUT); // don't write '\0'! +// } else { +// +// // there are three options where to store the result now +// +// // 1. fits targetOutputSchema (i.e. row becomes normalcase row) +// bool outputAsNormalRow = python::Type::UNKNOWN != unifyTypes(rowType, _targetOutputSchema.getRowType(), _allowNumericTypeUnification) +// && canUpcastToRowType(rowType, _targetOutputSchema.getRowType()); +// // 2. fits generalCaseOutputSchema (i.e. row becomes generalcase row) +// bool outputAsGeneralRow = python::Type::UNKNOWN != unifyTypes(rowType, +// commonCaseOutputSchema().getRowType(), _allowNumericTypeUnification) +// && canUpcastToRowType(rowType, commonCaseOutputSchema().getRowType()); +// +// // 3. doesn't fit, store as python object. => we should use block storage for this as well. Then data can be shared. +// +// // can upcast? => note that the && is necessary because of cases where outputSchema is +// // i64 but the given row type f64. We can cast up i64 to f64 but not the other way round. +// if(outputAsNormalRow) { +// Row resRow = python::pythonToRow(rowObj).upcastedRow(_targetOutputSchema.getRowType()); +// assert(resRow.getRowType() == _targetOutputSchema.getRowType()); +// +// // write to buffer & perform callback +// auto buf_size = 2 * resRow.serializedLength(); +// uint8_t *buf = new uint8_t[buf_size]; +// memset(buf, 0, buf_size); +// auto serialized_length = resRow.serializeToMemory(buf, buf_size); +// // call row func! +// // --> merge row distinguishes between those two cases. 
Distinction has to be done there +// // because of compiled functor who calls mergeRow in the write function... +// mergeRow(buf, serialized_length, BUF_FORMAT_NORMAL_OUTPUT); +// delete [] buf; +// } else if(outputAsGeneralRow) { +// Row resRow = python::pythonToRow(rowObj).upcastedRow(commonCaseOutputSchema().getRowType()); +// assert(resRow.getRowType() == commonCaseOutputSchema().getRowType()); +// +// // write to buffer & perform callback +// auto buf_size = 2 * resRow.serializedLength(); +// uint8_t *buf = new uint8_t[buf_size]; +// memset(buf, 0, buf_size); +// auto serialized_length = resRow.serializeToMemory(buf, buf_size); +// // call row func! +// // --> merge row distinguishes between those two cases. Distinction has to be done there +// // because of compiled functor who calls mergeRow in the write function... +// mergeRow(buf, serialized_length, BUF_FORMAT_GENERAL_OUTPUT); +// delete [] buf; +// } else { +// // Unwrap single element tuples before writing them to the fallback sink +// if(PyTuple_Check(rowObj) && PyTuple_Size(rowObj) == 1) { +// writePythonObjectToFallbackSink(PyTuple_GetItem(rowObj, 0)); +// } else { +// writePythonObjectToFallbackSink(rowObj); +// } +// } +// // Py_XDECREF(rowObj); +// } +// } +// +//#ifndef NDEBUG +// if(PyErr_Occurred()) { +// // print out the otber objects... +// std::cout<<__FILE__<<":"<<__LINE__<<" python error not cleared properly!"< works. multithreaded fails??? 
Context c(opt); auto code = "def f(x):\n" @@ -36,12 +37,12 @@ TEST_F(AssertAndRaiseTest, Assert) { // with resolver auto v1 = ds.resolve(ExceptionCode::ASSERTIONERROR, UDF("lambda x: (x-1) * (x-1)")).collectAsVector(); - ASSERT_EQ(v1.size(), 5); - EXPECT_EQ(v1[0].getInt(0), 1); // -> 1 * 1 = 1 - EXPECT_EQ(v1[1].getInt(0), 1); // -> (2-1) * (2-1) = 1 - EXPECT_EQ(v1[2].getInt(0), 9); // -> 3 * 3 = 9 - EXPECT_EQ(v1[3].getInt(0), 9); // -> (4 - 1) * (4 -1 ) = 9 - EXPECT_EQ(v1[4].getInt(0), 25); // -> 5 * 5 = 25 +// ASSERT_EQ(v1.size(), 5); +// EXPECT_EQ(v1[0].getInt(0), 1); // -> 1 * 1 = 1 +// EXPECT_EQ(v1[1].getInt(0), 1); // -> (2-1) * (2-1) = 1 +// EXPECT_EQ(v1[2].getInt(0), 9); // -> 3 * 3 = 9 +// EXPECT_EQ(v1[3].getInt(0), 9); // -> (4 - 1) * (4 -1 ) = 9 +// EXPECT_EQ(v1[4].getInt(0), 25); // -> 5 * 5 = 25 } TEST_F(AssertAndRaiseTest, Raise) { From 111d5db9551dda8a0fbe014e98bcc3aaf764731a Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Fri, 30 Sep 2022 19:50:37 -0400 Subject: [PATCH 06/14] cmake fix --- tuplex/CMakeLists.txt | 13 +- tuplex/core/src/physical/ResolveTask.cc | 540 ++++++++++++------------ tuplex/test/core/AssertAndRaise.cc | 12 +- 3 files changed, 279 insertions(+), 286 deletions(-) diff --git a/tuplex/CMakeLists.txt b/tuplex/CMakeLists.txt index 44249fe8b..9fd0a9ef4 100755 --- a/tuplex/CMakeLists.txt +++ b/tuplex/CMakeLists.txt @@ -567,12 +567,13 @@ function(FindPython3Exe NAMES VERSION EXECUTABLE) # check version (must match VERSION) execute_process(COMMAND "${TEMP_EXE}" -c "import platform;print(platform.python_version())" RESULT_VARIABLE _result OUTPUT_VARIABLE TEMP_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) # check if version matches - - compare_version_strings(${VERSION} ${TEMP_VERSION} _result) - if(result EQUAL 0) - message(STATUS "Found ${TEMP_EXE} with version ${TEMP_VERSION} matching desired version ${VERSION}") - set(${EXECUTABLE} ${TEMP_EXE} PARENT_SCOPE) # write out - endif() + if(VERSION AND TEMP_VERSION) + 
compare_version_strings(${VERSION} ${TEMP_VERSION} _result) + if(result EQUAL 0) + message(STATUS "Found ${TEMP_EXE} with version ${TEMP_VERSION} matching desired version ${VERSION}") + set(${EXECUTABLE} ${TEMP_EXE} PARENT_SCOPE) # write out + endif() + endif() endif() endfunction() diff --git a/tuplex/core/src/physical/ResolveTask.cc b/tuplex/core/src/physical/ResolveTask.cc index 656196a71..fa9ab3312 100644 --- a/tuplex/core/src/physical/ResolveTask.cc +++ b/tuplex/core/src/physical/ResolveTask.cc @@ -459,285 +459,277 @@ namespace tuplex { if(resCode == -1 && _interpreterFunctor) { assert(!python::holdsGIL()); - // acquire GIL python::lockGIL(); - // check there is no errror! - if(PyErr_Occurred()) { - std::cerr<<"python error!"< decode the correct object from memory & then feed it into the pipeline... + if(ecCode == ecToI64(ExceptionCode::BADPARSE_STRING_INPUT)) { + // it's a string! + tuple = tupleFromParseException(ebuf, eSize); + parse_cells = true; // need to parse cells in python mode. + } else if(ecCode == ecToI64(ExceptionCode::NORMALCASEVIOLATION)) { + // changed, why are these names so random here? makes no sense... + auto row = Row::fromMemory(exceptionsInputSchema(), ebuf, eSize); + + tuple = python::rowToPython(row, true); + parse_cells = false; + // called below... + } else if (ecCode == ecToI64(ExceptionCode::PYTHON_PARALLELIZE)) { + auto pyObj = python::deserializePickledObject(python::getMainModule(), (char *) ebuf, eSize); + tuple = pyObj; + parse_cells = false; + } else { + // normal case, i.e. an exception occurred somewhere. + // --> this means if pipeline is using string as input, we should convert + auto row = Row::fromMemory(exceptionsInputSchema(), ebuf, eSize); + + // cell source automatically takes input, i.e. no need to convert. simply get tuple from row object + tuple = python::rowToPython(row, true); + +#ifndef NDEBUG + if(PyTuple_Check(tuple)) { + // make sure tuple is valid... 
+ for(unsigned i = 0; i < PyTuple_Size(tuple); ++i) { + auto elemObj = PyTuple_GET_ITEM(tuple, i); + assert(elemObj); + } + } +#endif + parse_cells = false; + } + + // compute + // @TODO: we need to encode the hashmaps as these hybrid objects! + // ==> for more efficiency we prob should store one per executor! + // the same goes for any hashmap... + + assert(tuple); +#ifndef NDEBUG + if(!tuple) { + owner()->error("bad decode, using () as dummy..."); + tuple = PyTuple_New(0); // empty tuple. + } +#endif + + // note: current python pipeline always expects a tuple arg. hence pack current element. + if(PyTuple_Check(tuple) && PyTuple_Size(tuple) > 1) { + // nothing todo... + } else { + auto tmp_tuple = PyTuple_New(1); + PyTuple_SET_ITEM(tmp_tuple, 0, tuple); + tuple = tmp_tuple; + } + +#ifndef NDEBUG + // // to print python object + // Py_XINCREF(tuple); + // PyObject_Print(tuple, stdout, 0); + // std::cout< note: unify handling this with the other cases... + assert(_htable->hybrid_hm); + Py_XINCREF(_htable->hybrid_hm); + PyTuple_SET_ITEM(args, num_python_args - 1, _htable->hybrid_hm); + } + auto kwargs = PyDict_New(); + auto py_parse_cells = python::boolean(parse_cells); + PyDict_SetItemString(kwargs, "parse_cells", py_parse_cells); + auto pcr = python::callFunctionEx(_interpreterFunctor, args, kwargs); -// -// // catch any exceptions here -// try { -// PyCallable_Check(_interpreterFunctor); -// -// // holds the pythonized data -// PyObject* tuple = nullptr; -// bool parse_cells = false; -// -// // there are different data reps for certain error codes. -// // => decode the correct object from memory & then feed it into the pipeline... -// if(ecCode == ecToI64(ExceptionCode::BADPARSE_STRING_INPUT)) { -// // it's a string! -// tuple = tupleFromParseException(ebuf, eSize); -// parse_cells = true; // need to parse cells in python mode. -// } else if(ecCode == ecToI64(ExceptionCode::NORMALCASEVIOLATION)) { -// // changed, why are these names so random here? 
makes no sense... -// auto row = Row::fromMemory(exceptionsInputSchema(), ebuf, eSize); -// -// tuple = python::rowToPython(row, true); -// parse_cells = false; -// // called below... -// } else if (ecCode == ecToI64(ExceptionCode::PYTHON_PARALLELIZE)) { -// auto pyObj = python::deserializePickledObject(python::getMainModule(), (char *) ebuf, eSize); -// tuple = pyObj; -// parse_cells = false; -// } else { -// // normal case, i.e. an exception occurred somewhere. -// // --> this means if pipeline is using string as input, we should convert -// auto row = Row::fromMemory(exceptionsInputSchema(), ebuf, eSize); -// -// // cell source automatically takes input, i.e. no need to convert. simply get tuple from row object -// tuple = python::rowToPython(row, true); -// -//#ifndef NDEBUG -// if(PyTuple_Check(tuple)) { -// // make sure tuple is valid... -// for(unsigned i = 0; i < PyTuple_Size(tuple); ++i) { -// auto elemObj = PyTuple_GET_ITEM(tuple, i); -// assert(elemObj); -// } -// } -//#endif -// parse_cells = false; -// } -// -// // compute -// // @TODO: we need to encode the hashmaps as these hybrid objects! -// // ==> for more efficiency we prob should store one per executor! -// // the same goes for any hashmap... -// -// assert(tuple); -//#ifndef NDEBUG -// if(!tuple) { -// owner()->error("bad decode, using () as dummy..."); -// tuple = PyTuple_New(0); // empty tuple. -// } -//#endif -// -// // note: current python pipeline always expects a tuple arg. hence pack current element. -// if(PyTuple_Check(tuple) && PyTuple_Size(tuple) > 1) { -// // nothing todo... -// } else { -// auto tmp_tuple = PyTuple_New(1); -// PyTuple_SET_ITEM(tmp_tuple, 0, tuple); -// tuple = tmp_tuple; -// } -// -//#ifndef NDEBUG -// // // to print python object -// // Py_XINCREF(tuple); -// // PyObject_Print(tuple, stdout, 0); -// // std::cout< note: unify handling this with the other cases... 
-// assert(_htable->hybrid_hm); -// Py_XINCREF(_htable->hybrid_hm); -// PyTuple_SET_ITEM(args, num_python_args - 1, _htable->hybrid_hm); -// } -// -// auto kwargs = PyDict_New(); -// auto py_parse_cells = python::boolean(parse_cells); -// PyDict_SetItemString(kwargs, "parse_cells", py_parse_cells); -// auto pcr = python::callFunctionEx(_interpreterFunctor, args, kwargs); -// -// if(pcr.exceptionCode != ExceptionCode::SUCCESS) { -// // this should not happen, bad internal error. codegen'ed python should capture everything. -// owner()->error("bad internal python error: " + pcr.exceptionMessage); -// python::unlockGIL(); -// return; -// } else { -// // all good, row is fine. exception occurred? -// assert(pcr.res); -// -// // type check: save to regular rows OR save to python row collection -// if(!pcr.res) { -// owner()->error("bad internal python error, NULL object returned"); -// } else { -// -//#ifndef NDEBUG -// // // uncomment to print res obj -// // Py_XINCREF(pcr.res); -// // PyObject_Print(pcr.res, stdout, 0); -// // std::cout<(cptr), strlen(cptr), BUF_FORMAT_NORMAL_OUTPUT); // don't write '\0'! -// } else { -// -// // there are three options where to store the result now -// -// // 1. fits targetOutputSchema (i.e. row becomes normalcase row) -// bool outputAsNormalRow = python::Type::UNKNOWN != unifyTypes(rowType, _targetOutputSchema.getRowType(), _allowNumericTypeUnification) -// && canUpcastToRowType(rowType, _targetOutputSchema.getRowType()); -// // 2. fits generalCaseOutputSchema (i.e. row becomes generalcase row) -// bool outputAsGeneralRow = python::Type::UNKNOWN != unifyTypes(rowType, -// commonCaseOutputSchema().getRowType(), _allowNumericTypeUnification) -// && canUpcastToRowType(rowType, commonCaseOutputSchema().getRowType()); -// -// // 3. doesn't fit, store as python object. => we should use block storage for this as well. Then data can be shared. -// -// // can upcast? 
=> note that the && is necessary because of cases where outputSchema is -// // i64 but the given row type f64. We can cast up i64 to f64 but not the other way round. -// if(outputAsNormalRow) { -// Row resRow = python::pythonToRow(rowObj).upcastedRow(_targetOutputSchema.getRowType()); -// assert(resRow.getRowType() == _targetOutputSchema.getRowType()); -// -// // write to buffer & perform callback -// auto buf_size = 2 * resRow.serializedLength(); -// uint8_t *buf = new uint8_t[buf_size]; -// memset(buf, 0, buf_size); -// auto serialized_length = resRow.serializeToMemory(buf, buf_size); -// // call row func! -// // --> merge row distinguishes between those two cases. Distinction has to be done there -// // because of compiled functor who calls mergeRow in the write function... -// mergeRow(buf, serialized_length, BUF_FORMAT_NORMAL_OUTPUT); -// delete [] buf; -// } else if(outputAsGeneralRow) { -// Row resRow = python::pythonToRow(rowObj).upcastedRow(commonCaseOutputSchema().getRowType()); -// assert(resRow.getRowType() == commonCaseOutputSchema().getRowType()); -// -// // write to buffer & perform callback -// auto buf_size = 2 * resRow.serializedLength(); -// uint8_t *buf = new uint8_t[buf_size]; -// memset(buf, 0, buf_size); -// auto serialized_length = resRow.serializeToMemory(buf, buf_size); -// // call row func! -// // --> merge row distinguishes between those two cases. Distinction has to be done there -// // because of compiled functor who calls mergeRow in the write function... 
-// mergeRow(buf, serialized_length, BUF_FORMAT_GENERAL_OUTPUT); -// delete [] buf; -// } else { -// // Unwrap single element tuples before writing them to the fallback sink -// if(PyTuple_Check(rowObj) && PyTuple_Size(rowObj) == 1) { -// writePythonObjectToFallbackSink(PyTuple_GetItem(rowObj, 0)); -// } else { -// writePythonObjectToFallbackSink(rowObj); -// } -// } -// // Py_XDECREF(rowObj); -// } -// } -// -//#ifndef NDEBUG -// if(PyErr_Occurred()) { -// // print out the otber objects... -// std::cout<<__FILE__<<":"<<__LINE__<<" python error not cleared properly!"<error("bad internal python error: " + pcr.exceptionMessage); + python::unlockGIL(); + return; + } else { + // all good, row is fine. exception occurred? + assert(pcr.res); + + // type check: save to regular rows OR save to python row collection + if(!pcr.res) { + owner()->error("bad internal python error, NULL object returned"); + } else { + +#ifndef NDEBUG + // // uncomment to print res obj + // Py_XINCREF(pcr.res); + // PyObject_Print(pcr.res, stdout, 0); + // std::cout<(cptr), strlen(cptr), BUF_FORMAT_NORMAL_OUTPUT); // don't write '\0'! + } else { + + // there are three options where to store the result now + + // 1. fits targetOutputSchema (i.e. row becomes normalcase row) + bool outputAsNormalRow = python::Type::UNKNOWN != unifyTypes(rowType, _targetOutputSchema.getRowType(), _allowNumericTypeUnification) + && canUpcastToRowType(rowType, _targetOutputSchema.getRowType()); + // 2. fits generalCaseOutputSchema (i.e. row becomes generalcase row) + bool outputAsGeneralRow = python::Type::UNKNOWN != unifyTypes(rowType, + commonCaseOutputSchema().getRowType(), _allowNumericTypeUnification) + && canUpcastToRowType(rowType, commonCaseOutputSchema().getRowType()); + + // 3. doesn't fit, store as python object. => we should use block storage for this as well. Then data can be shared. + + // can upcast? 
=> note that the && is necessary because of cases where outputSchema is + // i64 but the given row type f64. We can cast up i64 to f64 but not the other way round. + if(outputAsNormalRow) { + Row resRow = python::pythonToRow(rowObj).upcastedRow(_targetOutputSchema.getRowType()); + assert(resRow.getRowType() == _targetOutputSchema.getRowType()); + + // write to buffer & perform callback + auto buf_size = 2 * resRow.serializedLength(); + uint8_t *buf = new uint8_t[buf_size]; + memset(buf, 0, buf_size); + auto serialized_length = resRow.serializeToMemory(buf, buf_size); + // call row func! + // --> merge row distinguishes between those two cases. Distinction has to be done there + // because of compiled functor who calls mergeRow in the write function... + mergeRow(buf, serialized_length, BUF_FORMAT_NORMAL_OUTPUT); + delete [] buf; + } else if(outputAsGeneralRow) { + Row resRow = python::pythonToRow(rowObj).upcastedRow(commonCaseOutputSchema().getRowType()); + assert(resRow.getRowType() == commonCaseOutputSchema().getRowType()); + + // write to buffer & perform callback + auto buf_size = 2 * resRow.serializedLength(); + uint8_t *buf = new uint8_t[buf_size]; + memset(buf, 0, buf_size); + auto serialized_length = resRow.serializeToMemory(buf, buf_size); + // call row func! + // --> merge row distinguishes between those two cases. Distinction has to be done there + // because of compiled functor who calls mergeRow in the write function... + mergeRow(buf, serialized_length, BUF_FORMAT_GENERAL_OUTPUT); + delete [] buf; + } else { + // Unwrap single element tuples before writing them to the fallback sink + if(PyTuple_Check(rowObj) && PyTuple_Size(rowObj) == 1) { + writePythonObjectToFallbackSink(PyTuple_GetItem(rowObj, 0)); + } else { + writePythonObjectToFallbackSink(rowObj); + } + } + // Py_XDECREF(rowObj); + } + } + +#ifndef NDEBUG + if(PyErr_Occurred()) { + // print out the otber objects... 
+ std::cout<<__FILE__<<":"<<__LINE__<<" python error not cleared properly!"< 1 * 1 = 1 -// EXPECT_EQ(v1[1].getInt(0), 1); // -> (2-1) * (2-1) = 1 -// EXPECT_EQ(v1[2].getInt(0), 9); // -> 3 * 3 = 9 -// EXPECT_EQ(v1[3].getInt(0), 9); // -> (4 - 1) * (4 -1 ) = 9 -// EXPECT_EQ(v1[4].getInt(0), 25); // -> 5 * 5 = 25 + ASSERT_EQ(v1.size(), 5); + EXPECT_EQ(v1[0].getInt(0), 1); // -> 1 * 1 = 1 + EXPECT_EQ(v1[1].getInt(0), 1); // -> (2-1) * (2-1) = 1 + EXPECT_EQ(v1[2].getInt(0), 9); // -> 3 * 3 = 9 + EXPECT_EQ(v1[3].getInt(0), 9); // -> (4 - 1) * (4 -1 ) = 9 + EXPECT_EQ(v1[4].getInt(0), 25); // -> 5 * 5 = 25 } TEST_F(AssertAndRaiseTest, Raise) { From 5c7c7a9918a64b0fe73815d08e4da4107166bf4d Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Sat, 1 Oct 2022 20:36:20 -0400 Subject: [PATCH 07/14] ref count issues --- tuplex/core/src/TraceVisitor.cc | 6 ++++-- tuplex/test/core/PythonRowTest.cc | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tuplex/core/src/TraceVisitor.cc b/tuplex/core/src/TraceVisitor.cc index 29d6f0b5d..9e2ad891a 100644 --- a/tuplex/core/src/TraceVisitor.cc +++ b/tuplex/core/src/TraceVisitor.cc @@ -120,6 +120,7 @@ namespace tuplex { auto sym = PyDict_GetItemString(mainDict, node->_name.c_str()); if(sym) { + Py_XINCREF(sym); addTraceResult(node, TraceItem(sym, node->_name)); } else { @@ -128,9 +129,10 @@ namespace tuplex { auto builtinDict = PyModule_GetDict(builtins); assert(builtinDict); sym = PyDict_GetItemString(builtinDict, node->_name.c_str()); - if(sym) + if(sym) { + Py_XINCREF(sym); addTraceResult(node, TraceItem(sym, node->_name)); - else { + } else { PyErr_SetString(PyExc_NameError, ("could not find identifier " + node->_name).c_str()); // i.e., could early exit function... 
diff --git a/tuplex/test/core/PythonRowTest.cc b/tuplex/test/core/PythonRowTest.cc index a1e9e0f6d..ad83402da 100644 --- a/tuplex/test/core/PythonRowTest.cc +++ b/tuplex/test/core/PythonRowTest.cc @@ -350,7 +350,9 @@ TEST(PythonFunc, Cloupickle) { PyInterpreterGuard g; std::stringstream err_stream; bool rc = python::cloudpickleCompatibility(&err_stream); - + if(!rc) { + std::cerr< Date: Sat, 1 Oct 2022 20:43:46 -0400 Subject: [PATCH 08/14] another gil fix --- tuplex/adapters/cpython/src/PythonGIL.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tuplex/adapters/cpython/src/PythonGIL.cc b/tuplex/adapters/cpython/src/PythonGIL.cc index 15d1bbd9f..1199edda7 100644 --- a/tuplex/adapters/cpython/src/PythonGIL.cc +++ b/tuplex/adapters/cpython/src/PythonGIL.cc @@ -74,6 +74,7 @@ namespace python { static std::atomic_int64_t interpreterID(-1); // thread which holds the interpreter static std::atomic_bool interpreterInitialized(false); // checks whether interpreter is initialized or not std::thread::id gil_main_thread_id; + std::thread::id gil_id; // id of the thread holding the gil right now. 
// vars for python management static std::atomic gilState(nullptr); @@ -93,6 +94,7 @@ namespace python { gstate = PyGILState_Ensure(); } assert(PyGILState_Check()); + gil_id = std::this_thread::get_id(); gil = true; gilState = nullptr; gilID = thisThreadID(); @@ -107,13 +109,14 @@ namespace python { PyGILState_Release(gstate); gstate = PyGILState_UNLOCKED; } + gil_id = std::thread::id(); gil = false; gilID = thisThreadID(); gilMutex.unlock(); } bool holdsGIL() { - return gil; + return gil && std::this_thread::get_id() == gil_id; } void acquireGIL() { From 6127d01cbe1a84ff868bafa6a040463a673cf1e7 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Sat, 1 Oct 2022 20:53:32 -0400 Subject: [PATCH 09/14] disable test --- tuplex/test/wrappers/WrapperTest.cc | 138 ++++++++++++++-------------- 1 file changed, 69 insertions(+), 69 deletions(-) diff --git a/tuplex/test/wrappers/WrapperTest.cc b/tuplex/test/wrappers/WrapperTest.cc index fc4fda77f..fea33dd16 100644 --- a/tuplex/test/wrappers/WrapperTest.cc +++ b/tuplex/test/wrappers/WrapperTest.cc @@ -569,75 +569,75 @@ TEST_F(WrapperTest, extractPriceExample) { } } -TEST(Python, FastTupleConstruction) { - using namespace tuplex; - - python::initInterpreter(); - std::cout<<"\n-------------"< worth it?? prob not really... 
- - int N = 1001000; - - Timer timer; - auto listObj = PyList_New(N); - for(int i = 0; i < N; ++i) { - auto tupleObj = PyTuple_New(4); - PyTuple_SET_ITEM(tupleObj, 0, python::PyString_FromString("hello world")); - PyTuple_SET_ITEM(tupleObj, 1, python::PyString_FromString("hello whkljdkhjorld")); - PyTuple_SET_ITEM(tupleObj, 2, python::PyString_FromString("dkfjopdjfophjhello world")); - PyTuple_SET_ITEM(tupleObj, 3, PyLong_FromLongLong(12345)); - PyList_SET_ITEM(listObj, i, tupleObj); - } - std::cout<<"w/o opt took: "<ob_item[0] = python::PyString_FromString("hello world"); - tp->ob_item[1] = python::PyString_FromString("hello whkljdkhjorld"); - tp->ob_item[2] = python::PyString_FromString("dkfjopdjfophjhello world"); - tp->ob_item[3] = PyLong_FromLongLong(12345); - - PyList_SET_ITEM(listObj, i, (PyObject*)tp); - } - std::cout<<"tuple construction optimized, took: "<ob_item = (PyObject**)PyMem_Calloc(N, sizeof(PyObject*)); - // TODO: mem check... - Py_SIZE(lo) = N; - lo->allocated = N; - PyObject_GC_Track(lo); - for(int i = 0; i < N; ++i) { - - // directly call python functions without all the crap - auto tp = (PyTupleObject*)PyObject_GC_NewVar(PyTupleObject, &PyTuple_Type, tupleSize); - PyObject_GC_Track(tp); - if(!tp) // out of mem.. - break; - - tp->ob_item[0] = python::PyString_FromString("hello world"); - tp->ob_item[1] = python::PyString_FromString("hello whkljdkhjorld"); - tp->ob_item[2] = python::PyString_FromString("dkfjopdjfophjhello world"); - tp->ob_item[3] = PyLong_FromLongLong(12345); - - lo->ob_item[i] = (PyObject*)tp; - } - std::cout<<"tuple + list construction optimized, took: "< worth it?? prob not really... 
+// +// int N = 1001000; +// +// Timer timer; +// auto listObj = PyList_New(N); +// for(int i = 0; i < N; ++i) { +// auto tupleObj = PyTuple_New(4); +// PyTuple_SET_ITEM(tupleObj, 0, python::PyString_FromString("hello world")); +// PyTuple_SET_ITEM(tupleObj, 1, python::PyString_FromString("hello whkljdkhjorld")); +// PyTuple_SET_ITEM(tupleObj, 2, python::PyString_FromString("dkfjopdjfophjhello world")); +// PyTuple_SET_ITEM(tupleObj, 3, PyLong_FromLongLong(12345)); +// PyList_SET_ITEM(listObj, i, tupleObj); +// } +// std::cout<<"w/o opt took: "<ob_item[0] = python::PyString_FromString("hello world"); +// tp->ob_item[1] = python::PyString_FromString("hello whkljdkhjorld"); +// tp->ob_item[2] = python::PyString_FromString("dkfjopdjfophjhello world"); +// tp->ob_item[3] = PyLong_FromLongLong(12345); +// +// PyList_SET_ITEM(listObj, i, (PyObject*)tp); +// } +// std::cout<<"tuple construction optimized, took: "<ob_item = (PyObject**)PyMem_Calloc(N, sizeof(PyObject*)); +// // TODO: mem check... +// Py_SIZE(lo) = N; +// lo->allocated = N; +// PyObject_GC_Track(lo); +// for(int i = 0; i < N; ++i) { +// +// // directly call python functions without all the crap +// auto tp = (PyTupleObject*)PyObject_GC_NewVar(PyTupleObject, &PyTuple_Type, tupleSize); +// PyObject_GC_Track(tp); +// if(!tp) // out of mem.. 
+// break; +// +// tp->ob_item[0] = python::PyString_FromString("hello world"); +// tp->ob_item[1] = python::PyString_FromString("hello whkljdkhjorld"); +// tp->ob_item[2] = python::PyString_FromString("dkfjopdjfophjhello world"); +// tp->ob_item[3] = PyLong_FromLongLong(12345); +// +// lo->ob_item[i] = (PyObject*)tp; +// } +// std::cout<<"tuple + list construction optimized, took: "< Date: Sat, 1 Oct 2022 21:09:51 -0400 Subject: [PATCH 10/14] proper thread init and removing an obscure, faulty test --- tuplex/adapters/cpython/src/PythonGIL.cc | 1 + tuplex/adapters/cpython/src/PythonHelpers.cc | 2 ++ tuplex/test/core/SerializationTypeTest.cc | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tuplex/adapters/cpython/src/PythonGIL.cc b/tuplex/adapters/cpython/src/PythonGIL.cc index 1199edda7..497a0943a 100644 --- a/tuplex/adapters/cpython/src/PythonGIL.cc +++ b/tuplex/adapters/cpython/src/PythonGIL.cc @@ -167,6 +167,7 @@ namespace python { gil = true; gilID = interpreterID = thisThreadID(); } + gil_id = std::this_thread::get_id(); gilMutex.lock(); interpreterInitialized = true; } diff --git a/tuplex/adapters/cpython/src/PythonHelpers.cc b/tuplex/adapters/cpython/src/PythonHelpers.cc index aa87c22bd..eb5f8ebff 100644 --- a/tuplex/adapters/cpython/src/PythonHelpers.cc +++ b/tuplex/adapters/cpython/src/PythonHelpers.cc @@ -1363,6 +1363,8 @@ namespace python { // mapping type to internal types, unknown as default python::Type mapPythonClassToTuplexType(PyObject *o, bool autoUpcast) { + assert(o); + if(Py_None == o) return python::Type::NULLVALUE; diff --git a/tuplex/test/core/SerializationTypeTest.cc b/tuplex/test/core/SerializationTypeTest.cc index f1fb1ace2..f2255ab94 100644 --- a/tuplex/test/core/SerializationTypeTest.cc +++ b/tuplex/test/core/SerializationTypeTest.cc @@ -29,6 +29,7 @@ class SerializationTypeTest : public ::testing::Test { namespace python { void testRowSerializationType(const std::string& PyLiteral, const std::string& expectedType, 
bool autoUpcast) { auto PyObj = python::runAndGet("obj = " + PyLiteral, "obj"); + ASSERT_TRUE(PyObj); auto rowType = python::mapPythonClassToTuplexType(PyObj, autoUpcast); auto row = python::pythonToRow(PyObj, rowType, false); EXPECT_EQ(row.getRowType().desc(), expectedType); @@ -45,6 +46,6 @@ TEST_F(SerializationTypeTest, ListSerializationTest) { TEST_F(SerializationTypeTest, OptionTupleSerializationTest) { python::testRowSerializationType("([(100, -10000000000), None, (5, 2147483647)])", "([Option[(i64,i64)]])", false); python::testRowSerializationType("([('string', None, False), (None, (1, [1, 2]), None)])", "([(Option[str],Option[(i64,[i64])],Option[boolean])])", false); - python::testRowSerializationType("[(1, (1, 2)) ,(2, None)])]", "([(Option[str],Option[(i64,[i64])],Option[boolean])])", false); + // no idea what the original here is supposed to be: python::testRowSerializationType("[(1, (1, 2)) ,(2, None)])]", "([(Option[str],Option[(i64,[i64])],Option[boolean])])", false); python::testRowSerializationType("('qwert', [ ((False, 2), True, 'ab'), None, (None, False, None), ((None, None), True, 'efghijk')])", "(str,[Option[(Option[(Option[boolean],Option[i64])],boolean,Option[str])]])", false); } \ No newline at end of file From d7c7d5fdaed1f565c2b76f0da4539a0afda275b6 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Sat, 1 Oct 2022 21:14:22 -0400 Subject: [PATCH 11/14] cleanup --- 02_Working_with_files.ipynb | 3145 ----------------------------------- 1 file changed, 3145 deletions(-) delete mode 100644 02_Working_with_files.ipynb diff --git a/02_Working_with_files.ipynb b/02_Working_with_files.ipynb deleted file mode 100644 index 62cb7b4d6..000000000 --- a/02_Working_with_files.ipynb +++ /dev/null @@ -1,3145 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "_M1WAAa_N3MO" - }, - "source": [ - "## 2. 
Working with files\n", - "\n", - "\n", - "\n", - "In the 2nd part of the Tuplex intro series, we'll take a look at how to work with CSV and text files. First, let's install Tuplex again in our notebook." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Hged1I1rUyEf", - "outputId": "92169a2c-6703-4a00-be65-aeef1758831b" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Python 3.7.13\r\n" - ] - } - ], - "source": [ - "!python3 --version" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "JZsjAqGVTi9M", - "outputId": "295bb25c-2bed-4448-c789-3922176fe588" - }, - "outputs": [], - "source": [ - "# # install Colab compatible upgrades to avoid dependency errors\n", - "# !pip install -q folium==0.2.1\n", - "# !pip install -q --upgrade urllib3==1.25.11\n", - "# !pip install flask-socketio flask-pymongo eventlet==0.30.0\n", - "# !pip uninstall jedi -y && pip3 install 'jedi>=0.10'\n", - "\n", - "# # install Tuplex\n", - "# #!pip install -q -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple tuplex==0.3.2rc1\n", - "# #!pip install -q tuplex\n", - "\n", - "# # !pip install -i https://test.pypi.org/simple/ tuplex==0.3.dev20220822143933006789\n", - "# #!pip install -i https://test.pypi.org/simple/ tuplex" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "NOWk6l8gWK8n", - "outputId": "67a805ef-5430-4e90-85cb-9473a0dc6bd5" - }, - "outputs": [], - "source": [ - "# downloads temp tuplex file\n", - "#!gdown https://drive.google.com/uc?id=1-TxhNpVg6TW96rNvLWv_2NWUz2tdoLnN" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "IlPT6aJcVB52", - 
"outputId": "d8ec0e8d-1405-4124-8fcb-6d417520f8e9" - }, - "outputs": [], - "source": [ - "#!pip3 install --force-reinstall /content/tuplex-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing ./wheelhouse/tuplex-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl\n", - "Requirement already satisfied: prompt-toolkit in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from tuplex==0.3.3) (3.0.31)\n", - "Collecting flask-pymongo\n", - " Downloading Flask_PyMongo-2.3.0-py2.py3-none-any.whl (12 kB)\n", - "Collecting iso8601\n", - " Using cached iso8601-1.0.2-py3-none-any.whl (9.7 kB)\n", - "Requirement already satisfied: attrs>=19.2.0 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from tuplex==0.3.3) (22.1.0)\n", - "Collecting Flask==2.0.2\n", - " Downloading Flask-2.0.2-py3-none-any.whl (95 kB)\n", - "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.2/95.2 KB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: jedi in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from tuplex==0.3.3) (0.18.1)\n", - "Collecting pluggy\n", - " Downloading pluggy-1.0.0-py2.py3-none-any.whl (13 kB)\n", - "Requirement already satisfied: six>=1.11.0 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from tuplex==0.3.3) (1.16.0)\n", - "Requirement already satisfied: psutil in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from tuplex==0.3.3) (5.9.2)\n", - "Collecting astor\n", - " Using cached astor-0.8.1-py2.py3-none-any.whl (27 kB)\n", - "Collecting flask-socketio\n", - " Downloading Flask_SocketIO-5.3.1-py3-none-any.whl (17 kB)\n", - 
"Collecting eventlet==0.30.0\n", - " Downloading eventlet-0.30.0-py2.py3-none-any.whl (224 kB)\n", - "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m224.1/224.1 KB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", - "\u001b[?25hCollecting gunicorn\n", - " Downloading gunicorn-20.1.0-py3-none-any.whl (79 kB)\n", - "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.5/79.5 KB\u001b[0m \u001b[31m789.0 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", - "\u001b[?25hCollecting dill>=0.2.7.1\n", - " Using cached dill-0.3.5.1-py2.py3-none-any.whl (95 kB)\n", - "Collecting PyYAML>=3.13\n", - " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", - "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m596.3/596.3 KB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", - "\u001b[?25hCollecting pymongo\n", - " Downloading pymongo-4.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (471 kB)\n", - "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m471.9/471.9 KB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m:01\u001b[0m\n", - "\u001b[?25hCollecting py>=1.5.2\n", - " Using cached py-1.11.0-py2.py3-none-any.whl (98 kB)\n", - "Requirement already satisfied: pygments>=2.4.1 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from tuplex==0.3.3) (2.13.0)\n", - "Collecting Werkzeug<2.2.0\n", - " Downloading Werkzeug-2.1.2-py3-none-any.whl (224 kB)\n", - "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m224.9/224.9 
KB\u001b[0m \u001b[31m867.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m1m825.4 kB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: wcwidth>=0.1.7 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from tuplex==0.3.3) (0.2.5)\n", - "Collecting cloudpickle<2.0.0,>=0.6.1\n", - " Using cached cloudpickle-1.6.0-py3-none-any.whl (23 kB)\n", - "Collecting dnspython<2.0.0,>=1.15.0\n", - " Downloading dnspython-1.16.0-py2.py3-none-any.whl (188 kB)\n", - "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m188.4/188.4 KB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m[36m0:00:01\u001b[0mm eta \u001b[36m0:00:01\u001b[0m\n", - "\u001b[?25hCollecting greenlet>=0.3\n", - " Downloading greenlet-1.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (150 kB)\n", - "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m150.7/150.7 KB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", - "\u001b[?25hCollecting click>=7.1.2\n", - " Downloading click-8.1.3-py3-none-any.whl (96 kB)\n", - "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m96.6/96.6 KB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting itsdangerous>=2.0\n", - " Downloading itsdangerous-2.1.2-py3-none-any.whl (15 kB)\n", - "Requirement already satisfied: Jinja2>=3.0 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from Flask==2.0.2->tuplex==0.3.3) (3.1.2)\n", - "Collecting python-socketio>=5.0.2\n", - " Downloading python_socketio-5.7.1-py3-none-any.whl (56 kB)\n", - "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.6/56.6 KB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already 
satisfied: setuptools>=3.0 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from gunicorn->tuplex==0.3.3) (47.1.0)\n", - "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from jedi->tuplex==0.3.3) (0.8.3)\n", - "Requirement already satisfied: importlib-metadata>=0.12 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from pluggy->tuplex==0.3.3) (4.12.0)\n", - "Requirement already satisfied: zipp>=0.5 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from importlib-metadata>=0.12->pluggy->tuplex==0.3.3) (3.8.1)\n", - "Requirement already satisfied: typing-extensions>=3.6.4 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from importlib-metadata>=0.12->pluggy->tuplex==0.3.3) (4.3.0)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages (from Jinja2>=3.0->Flask==2.0.2->tuplex==0.3.3) (2.1.1)\n", - "Collecting bidict>=0.21.0\n", - " Downloading bidict-0.22.0-py3-none-any.whl (36 kB)\n", - "Collecting python-engineio>=4.3.0\n", - " Downloading python_engineio-4.3.4-py3-none-any.whl (52 kB)\n", - "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m52.9/52.9 KB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: Werkzeug, PyYAML, python-engineio, pymongo, py, itsdangerous, iso8601, gunicorn, greenlet, dnspython, dill, cloudpickle, bidict, astor, python-socketio, pluggy, eventlet, click, Flask, flask-socketio, flask-pymongo, tuplex\n", - "Successfully installed Flask-2.0.2 PyYAML-6.0 Werkzeug-2.1.2 astor-0.8.1 bidict-0.22.0 click-8.1.3 cloudpickle-1.6.0 dill-0.3.5.1 dnspython-1.16.0 eventlet-0.30.0 flask-pymongo-2.3.0 flask-socketio-5.3.1 greenlet-1.1.3 gunicorn-20.1.0 iso8601-1.0.2 itsdangerous-2.1.2 pluggy-1.0.0 py-1.11.0 pymongo-4.2.0 
python-engineio-4.3.4 python-socketio-5.7.1 tuplex-0.3.3\n", - "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n", - "You should consider upgrading via the '/home/leonhard/.pyenv/versions/3.7.13/bin/python3.7 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip3 install wheelhouse/tuplex-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZOobvZcVO2_H" - }, - "source": [ - "### 2.1 Basic IO - Reading CSV files\n", - "To read in a csv file, Tuplex provides an API function `csv`" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "IAGEc-okO2im", - "outputId": "c9ce8493-5b2e-443f-98b1-c35258d384d6" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ERROR:root:Failed to start or connect to Tuplex WebUI. Details: MongoDB (mongod) not found on PATH. 
In order to use Tuplex's WebUI, you need MongoDB installed or point the framework to a running MongoDB instance\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Welcome to\n", - "\n", - " _____ _\n", - " |_ _| _ _ __ | | _____ __\n", - " | || | | | '_ \\| |/ _ \\ \\/ /\n", - " | || |_| | |_) | | __/> <\n", - " |_| \\__,_| .__/|_|\\___/_/\\_\\ 0.3.3\n", - " |_|\n", - " \n", - "using Python 3.7.13 (default, Sep 12 2022, 22:16:36) \n", - "[GCC 11.2.0] on linux\n", - "[2022-09-12 22:44:45.614] [local ee] [info] loaded runtime library from/home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages/tuplex/libexec/tuplex_runtime.cpython-37m-x86_64-linux-gnu.so\n", - "[2022-09-12 22:44:45.614] [local ee] [info] initializing LLVM backend\n", - "[2022-09-12 22:44:45.614] [local ee] [warning] init JIT compiler also only in local mode\n", - "[2022-09-12 22:44:45.615] [LLVM] [info] compiling code for skylake\n", - "[2022-09-12 22:44:45.619] [history server] [warning] could not connect to http://localhost:5000/api/version, if you wish to disable the webui consider setting tuplex.webui=False for the context.\n", - "[2022-09-12 22:44:45.619] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", - "[2022-09-12 22:44:45.619] [E/1] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E1 does not exist. Attempting to create it.\n", - "[2022-09-12 22:44:45.620] [E/1] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E1\n", - "[2022-09-12 22:44:45.620] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", - "[2022-09-12 22:44:45.620] [E/2] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E2 does not exist. 
Attempting to create it.\n", - "[2022-09-12 22:44:45.620] [E/2] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E2\n", - "[2022-09-12 22:44:45.620] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", - "[2022-09-12 22:44:45.621] [E/3] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E3 does not exist. Attempting to create it.\n", - "[2022-09-12 22:44:45.621] [E/3] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E3\n", - "[2022-09-12 22:44:45.621] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", - "[2022-09-12 22:44:45.621] [E/4] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E4 does not exist. Attempting to create it.\n", - "[2022-09-12 22:44:45.621] [E/4] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E4\n", - "[2022-09-12 22:44:45.621] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", - "[2022-09-12 22:44:45.622] [E/5] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E5 does not exist. Attempting to create it.\n", - "[2022-09-12 22:44:45.622] [E/5] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E5\n", - "[2022-09-12 22:44:45.622] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", - "[2022-09-12 22:44:45.622] [E/6] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E6 does not exist. Attempting to create it.\n", - "[2022-09-12 22:44:45.622] [E/6] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E6\n", - "[2022-09-12 22:44:45.622] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", - "[2022-09-12 22:44:45.623] [E/7] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E7 does not exist. 
Attempting to create it.\n", - "[2022-09-12 22:44:45.623] [E/7] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E7\n", - "[2022-09-12 22:44:45.623] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", - "[2022-09-12 22:44:45.623] [E/8] [info] provided cache path file:///tmp/tuplex-cache-leonhard/E8 does not exist. Attempting to create it.\n", - "[2022-09-12 22:44:45.623] [E/8] [info] created cache directory file:///tmp/tuplex-cache-leonhard/E8\n", - "[2022-09-12 22:44:45.623] [local execution engine] [info] started local executor E/1 (1.00 GB, 32.00 MB default partition size)\n", - "[2022-09-12 22:44:45.624] [E/1] [info] starting detached process queue\n", - "[2022-09-12 22:44:45.624] [local execution engine] [info] started local executor E/2 (1.00 GB, 32.00 MB default partition size)\n", - "[2022-09-12 22:44:45.624] [E/1] [info] initialized runtime memory (4.00 MB)\n", - "[2022-09-12 22:44:45.624] [E/2] [info] starting detached process queue\n", - "[2022-09-12 22:44:45.624] [local execution engine] [info] started local executor E/3 (1.00 GB, 32.00 MB default partition size)\n", - "[2022-09-12 22:44:45.624] [E/3] [info] starting detached process queue\n", - "[2022-09-12 22:44:45.624] [E/2] [info] initialized runtime memory (4.00 MB)\n", - "[2022-09-12 22:44:45.624] [E/3] [info] initialized runtime memory (4.00 MB)\n", - "[2022-09-12 22:44:45.624] [local execution engine] [info] started local executor E/4 (1.00 GB, 32.00 MB default partition size)\n", - "[2022-09-12 22:44:45.624] [E/4] [info] starting detached process queue\n", - "[2022-09-12 22:44:45.625] [E/4] [info] initialized runtime memory (4.00 MB)\n", - "[2022-09-12 22:44:45.625] [local execution engine] [info] started local executor E/5 (1.00 GB, 32.00 MB default partition size)\n", - "[2022-09-12 22:44:45.625] [E/5] [info] starting detached process queue\n", - "[2022-09-12 22:44:45.625] [E/5] [info] initialized runtime memory (4.00 MB)\n", - 
"[2022-09-12 22:44:45.625] [local execution engine] [info] started local executor E/6 (1.00 GB, 32.00 MB default partition size)\n", - "[2022-09-12 22:44:45.625] [local execution engine] [info] started local executor E/7 (1.00 GB, 32.00 MB default partition size)\n", - "[2022-09-12 22:44:45.625] [E/7] [info] starting detached process queue\n", - "[2022-09-12 22:44:45.625] [local execution engine] [info] started local executor E/8 (1.00 GB, 32.00 MB default partition size)\n", - "[2022-09-12 22:44:45.626] [E/7] [info] initialized runtime memory (4.00 MB)\n", - "[2022-09-12 22:44:45.626] [memory] [info] allocated bitmap managed memory region (1.00 GB, 32.00 MB block size)\n", - "[2022-09-12 22:44:45.626] [E/6] [info] starting detached process queue\n", - "[2022-09-12 22:44:45.626] [E/6] [info] initialized runtime memory (4.00 MB)\n", - "[2022-09-12 22:44:45.626] [driver] [info] provided cache path file:///tmp/tuplex-cache-leonhard/driver does not exist. Attempting to create it.\n", - "[2022-09-12 22:44:45.626] [driver] [info] created cache directory file:///tmp/tuplex-cache-leonhard/driver\n", - "[2022-09-12 22:44:45.626] [local execution engine] [info] started driver (1.00 GB, 32.00 MB default partition size)\n", - "[2022-09-12 22:44:45.641] [E/8] [info] starting detached process queue\n", - "[2022-09-12 22:44:45.642] [E/8] [info] initialized runtime memory (4.00 MB)\n" - ] - } - ], - "source": [ - "import tuplex\n", - "\n", - "c = tuplex.Context({'tuplex.redirectToPythonLogging':False})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zdb8DfszEzC8" - }, - "source": [ - "Google Colab provides by default some sample data. We can simply load it into Tuplex using the `csv` command." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "MJz8q4Tw9Bsy" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2022-09-12 22:44:56.155] [posix filesystem] [warning] did not find any files for pattern 'sample_data/california_housing_train.csv'\n", - "[2022-09-12 22:44:56.155] [fileinputoperator] [info] found 0 files (0.00 B) to process.\n", - "[2022-09-12 22:44:56.155] [fileinputoperator] [warning] no input files found, can't infer type from given path: sample_data/california_housing_train.csv\n" - ] - } - ], - "source": [ - "ds = c.csv('sample_data/california_housing_train.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GXauZfPs9H2i", - "outputId": "1daf8a8e-9dcb-4660-eebe-bac6047d999e" - }, - "outputs": [], - "source": [ - "ds.show(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XdGZgph6E8kz" - }, - "source": [ - "Without any further information, Tuplex automatically deduces types for each column. In order to check what types Tuplex deduced, we can use the `columns` and `types` properties of a Tuplex dataset." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "EsosZ2csE71x", - "outputId": "9a7c7cd3-0cd8-45eb-b11d-50afedc0828c" - }, - "outputs": [ - { - "ename": "TypeError", - "evalue": "zip argument #1 must support iteration", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_220616/1148875277.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# print out as nicely formatted dictionary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtypes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m: zip argument #1 must support iteration" - ] - } - ], - "source": [ - "columns = ds.columns\n", - "types = ds.types\n", - "\n", - "# print out as nicely formatted dictionary\n", - "dict(zip(columns, types))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ebt3udXFFctc" - }, - "source": [ - "Sometimes however, it may be desirable to assign specific types to individual columns. 
Luckily, Tuplex provides a mechanism for this as well:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3_kkRMKfFb7W", - "outputId": "305e92f3-4f04-4fe1-e4af-05d52f01dbdf" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2022-09-12 22:45:02.469] [posix filesystem] [warning] did not find any files for pattern 'sample_data/california_housing_train.csv'\n", - "[2022-09-12 22:45:02.469] [fileinputoperator] [info] found 0 files (0.00 B) to process.\n", - "[2022-09-12 22:45:02.469] [fileinputoperator] [warning] no input files found, can't infer type from given path: sample_data/california_housing_train.csv\n" - ] - } - ], - "source": [ - "c.csv('sample_data/california_housing_train.csv', type_hints={'longitude' : float, 'latitude' : str}).show(4)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TZgHfZFYdGTP" - }, - "source": [ - "Let's say we now want to create a file containing only data entries where the `housing_median_age` is larger than `50`:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "Ato7fqLjc5H1" - }, - "outputs": [], - "source": [ - "ds.filter(lambda r: r['housing_median_age'] > 50).tocsv('lt50.csv', num_parts=0)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PCgh3TkJd-9Y" - }, - "source": [ - "In order to speedup data output, Tuplex by default uses multiple threads to create multiple output parts." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6YNy969JdfBG", - "outputId": "8b0afcdd-d4a6-4ec4-f48c-265c76a6b15c" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "head: cannot open 'lt50.part0.csv' for reading: No such file or directory\r\n" - ] - } - ], - "source": [ - "!head lt50.part0.csv" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ETBMyA2GeGVA" - }, - "source": [ - "Besides CSV files, Tuplex also has experimental support to read/write [ORC files](https://https://orc.apache.org/), which may be a more space efficient solution depending on the data and workload." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "8W97qlM9eBkL" - }, - "outputs": [], - "source": [ - "ds.toorc('lt50.orc')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cSCTtDPAGwM6" - }, - "source": [ - "Similarly, the orc files can be read using the `orc` command." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ACpfbMz5GS4Q", - "outputId": "e67982b4-14cf-4d5e-e29b-e148dcd404bd" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2022-09-12 22:45:05.208] [posix filesystem] [warning] did not find any files for pattern 'lt50.part0.orc'\n", - "[2022-09-12 22:45:05.208] [fileinputoperator] [info] found 0 files (0.00 B) to process.\n", - "[2022-09-12 22:45:05.208] [fileinputoperator] [warning] no input files found, can't infer type from sample.\n" - ] - } - ], - "source": [ - "c.orc('lt50.part0.orc').show(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3sJ05Og2rubO" - }, - "source": [ - "## 2.2 Working with larger files\n", - "Naturally, the benefit of Tuplex's compilation comes into play when working with larger files. 
To demonstrate this, let's assume we want to work with the 311 original data. A subset of this (1GB) can be downloaded via the following command" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GZ5ahr4zryaP", - "outputId": "8f617ae3-3f4d-48b6-c4ab-b03949794e28" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading...\n", - "From: https://drive.google.com/uc?id=18e2GyoQKLnQ2_uaUcaSOsLRlIT-7tqpN\n", - "To: /home/leonhard/projects/2nd-copy/311_subset.tar.gz\n", - "100%|████████████████████████████████████████| 214M/214M [00:37<00:00, 5.77MB/s]\n" - ] - } - ], - "source": [ - "!gdown https://drive.google.com/uc?id=18e2GyoQKLnQ2_uaUcaSOsLRlIT-7tqpN && tar xf 311_subset.tar.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "psV4--h7w5vK" - }, - "source": [ - "Next, let's create a new context with more memory to process the larger file. You can still reuse the old one albeit at the cost of incurring a lot of disk swapping. Therefore, we delete the old context to free up the space." 
- ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "id": "WZQeKTxF3hvH" - }, - "outputs": [], - "source": [ - "del c" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "MEiheSMPFQmy", - "outputId": "b7be1922-6dd1-492d-f606-e6e4c52a057a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Description,Resolution Action Updated Date,Community Board,BBL,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Open Data Channel Type,Park Facility Name,Park Borough,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Latitude,Longitude,Location\r\n", - "19937896,03/01/2011 02:27:57 PM,03/14/2011 03:59:20 PM,DOF,Refunds and Adjustments,DOF Property - Payment Issue,Misapplied Payment,Property Address,10027,,,,,,,ADDRESS,NEW YORK,,N/A,Closed,03/22/2011 02:27:57 PM,The Department of Finance resolved this issue.,03/14/2011 03:59:20 PM,09 MANHATTAN,1019820050,MANHATTAN,,,PHONE,Unspecified,MANHATTAN,,,,,,,,,,\r\n", - "19937901,03/01/2011 10:41:13 AM,03/15/2011 04:14:19 PM,DOT,Department of Transportation,Street Sign - Dangling,Street Cleaning - ASP,Street,11232,186 25 STREET,25 STREET,3 AVENUE,4 AVENUE,,,ADDRESS,BROOKLYN,,N/A,Closed,03/15/2011 05:32:23 PM,The Department of Transportation has completed the request or corrected the condition.,03/15/2011 04:14:19 PM,07 BROOKLYN,3006540024,BROOKLYN,984640,180028,PHONE,Unspecified,BROOKLYN,,,,,,,,40.660811976282695,-73.99859430999363,\"(40.660811976282695, -73.99859430999363)\"\r\n", - "19937902,03/01/2011 09:07:45 
AM,03/15/2011 08:26:09 AM,DOT,Department of Transportation,Street Sign - Missing,Other/Unknown,Street,11358,,,,,158 STREET,NORTHERN BOULEVARD,INTERSECTION,FLUSHING,,N/A,Closed,03/15/2011 02:24:33 PM,The Department of Transportation has completed the request or corrected the condition.,03/15/2011 08:26:09 AM,07 QUEENS,,QUEENS,1037621,217498,PHONE,Unspecified,QUEENS,,,,,,,,40.763497105049986,-73.80733639290203,\"(40.763497105049986, -73.80733639290203)\"\r\n", - "19937903,03/01/2011 05:39:26 PM,04/04/2011 11:32:57 AM,DOT,Department of Transportation,Street Sign - Missing,School Crossing,Street,10014,10 SHERIDAN SQUARE,SHERIDAN SQUARE,BARROW STREET,GROVE STREET,,,ADDRESS,NEW YORK,,N/A,Closed,04/01/2011 03:43:12 PM,\"Upon inspection, the reported condition was not found, therefore no action was taken.\",04/04/2011 11:32:57 AM,02 MANHATTAN,1005920040,MANHATTAN,983719,206336,PHONE,Unspecified,MANHATTAN,,,,,,,,40.733021305197404,-74.00191597502526,\"(40.733021305197404, -74.00191597502526)\"\r\n", - "19937904,03/01/2011 11:08:14 AM,03/02/2011 07:55:37 AM,DOT,Department of Transportation,Street Sign - Missing,Stop,Street,10069,,,,,WEST 63 STREET,WEST END AVENUE,INTERSECTION,NEW YORK,,N/A,Closed,03/08/2011 11:08:14 AM,\"The condition has been inspected/investigated, see customer notes for more information.\",03/02/2011 07:55:37 AM,07 MANHATTAN,,MANHATTAN,987400,221308,PHONE,Unspecified,MANHATTAN,,,,,,,,40.77411510013836,-73.98862703263869,\"(40.77411510013836, -73.98862703263869)\"\r\n", - "19937906,03/01/2011 03:16:09 PM,03/02/2011 09:06:30 AM,DOF,Correspondence Unit,DOF Property - Request Copy,Copy of Notice of Property Value,Property Address,11105,,,,,,,ADDRESS,ASTORIA,,N/A,Closed,03/06/2011 03:16:09 PM,The Department of Finance mailed the requested item.,03/02/2011 09:06:31 AM,01 QUEENS,4009650074,QUEENS,,,PHONE,Unspecified,QUEENS,,,,,,,,,,\r\n", - "19937907,03/01/2011 01:22:59 PM,03/02/2011 09:06:28 AM,DOF,Correspondence Unit,DOF Property - Request Copy,Copy of Notice 
of Property Value,Property Address,10469,,,,,,,ADDRESS,BRONX,,N/A,Closed,03/06/2011 01:22:59 PM,The Department of Finance mailed the requested item.,03/02/2011 09:06:28 AM,12 BRONX,2046970142,BRONX,,,PHONE,Unspecified,BRONX,,,,,,,,,,\r\n", - "19937908,03/01/2011 12:01:58 PM,03/02/2011 09:05:26 AM,DOF,Correspondence Unit,DOF Property - Request Copy,Copy of Notice of Property Value,Property Address,10305,,,,,,,ADDRESS,STATEN ISLAND,,N/A,Closed,03/06/2011 12:01:58 PM,The Department of Finance mailed the requested item.,03/02/2011 09:05:26 AM,02 STATEN ISLAND,5032350004,STATEN ISLAND,,,PHONE,Unspecified,STATEN ISLAND,,,,,,,,,,\r\n", - "19937909,03/01/2011 02:35:46 PM,03/02/2011 09:06:31 AM,DOF,Correspondence Unit,DOF Property - Request Copy,Copy of Notice of Property Value,Property Address,11221,,,,,,,ADDRESS,BROOKLYN,,N/A,Closed,03/06/2011 02:35:46 PM,The Department of Finance mailed the requested item.,03/02/2011 09:06:31 AM,04 BROOKLYN,3033660059,BROOKLYN,,,PHONE,Unspecified,BROOKLYN,,,,,,,,,,\r\n" - ] - } - ], - "source": [ - "!head 311_subset.csv" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "gO0jOErMxBYq", - "outputId": "d46afe03-86e9-491c-8563-47225ebf9e42" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ERROR:root:Failed to start or connect to Tuplex WebUI. Details: MongoDB (mongod) not found on PATH. 
In order to use Tuplex's WebUI, you need MongoDB installed or point the framework to a running MongoDB instance\n", - "INFO:local ee:loaded runtime library from/home/leonhard/.pyenv/versions/3.7.13/lib/python3.7/site-packages/tuplex/libexec/tuplex_runtime.cpython-37m-x86_64-linux-gnu.so\n", - "INFO:local ee:initializing LLVM backend\n", - "WARNING:local ee:init JIT compiler also only in local mode\n", - "INFO:LLVM:compiling code for skylake\n", - "WARNING:history server:could not connect to http://localhost:5000/api/version, if you wish to disable the webui consider setting tuplex.webui=False for the context.\n", - "INFO:memory:allocated bitmap managed memory region (2.00 GB, 32.00 MB block size)\n", - "INFO:local execution engine:started local executor E/1 (2.00 GB, 32.00 MB default partition size)\n" - ] - } - ], - "source": [ - "c = tuplex.Context({'tuplex.redirectToPythonLogging':True, 'tuplex.executorCount':1, 'tuplex.executorMemory':'2G', 'tuplex.driverMemory':'2G'})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QdOhBdtYkyPn" - }, - "source": [ - "Again, we can use Tuplex's autodetection feature to load the file and assign meaningful default types." 
- ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3T56PhtlvGaL", - "outputId": "e83fc6ee-f7e4-4321-ea10-3f5f5ce20b7f" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:E/1:starting detached process queue\n", - "INFO:E/1:initialized runtime memory (4.00 MB)\n", - "INFO:fileinputoperator:found 1 file (999.08 MB) to process.\n", - "INFO:global:sampled file:///home/leonhard/projects/2nd-copy/311_subset.csv on 256.00 KB\n" - ] - } - ], - "source": [ - "ds = c.csv('311_subset.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "sMIkneXZvZZY", - "outputId": "3f9d4c08-1ef9-40cd-8f6a-eba35e51315f" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'Unique Key': typing.Union[int, NoneType],\n", - " 'Created Date': typing.Union[str, NoneType],\n", - " 'Closed Date': typing.Union[str, NoneType],\n", - " 'Agency': typing.Union[str, NoneType],\n", - " 'Agency Name': typing.Union[str, NoneType],\n", - " 'Complaint Type': typing.Union[str, NoneType],\n", - " 'Descriptor': typing.Union[str, NoneType],\n", - " 'Location Type': typing.Union[str, NoneType],\n", - " 'Incident Zip': typing.Union[int, NoneType],\n", - " 'Incident Address': typing.Union[str, NoneType],\n", - " 'Street Name': typing.Union[str, NoneType],\n", - " 'Cross Street 1': typing.Union[str, NoneType],\n", - " 'Cross Street 2': typing.Union[str, NoneType],\n", - " 'Intersection Street 1': typing.Union[str, NoneType],\n", - " 'Intersection Street 2': typing.Union[str, NoneType],\n", - " 'Address Type': typing.Union[str, NoneType],\n", - " 'City': typing.Union[str, NoneType],\n", - " 'Landmark': typing.Union[str, NoneType],\n", - " 'Facility Type': typing.Union[str, NoneType],\n", - " 'Status': typing.Union[str, NoneType],\n", - " 'Due Date': typing.Union[str, NoneType],\n", - " 
'Resolution Description': typing.Union[str, NoneType],\n", - " 'Resolution Action Updated Date': typing.Union[str, NoneType],\n", - " 'Community Board': typing.Union[str, NoneType],\n", - " 'BBL': typing.Union[int, NoneType],\n", - " 'Borough': typing.Union[str, NoneType],\n", - " 'X Coordinate (State Plane)': typing.Union[int, NoneType],\n", - " 'Y Coordinate (State Plane)': typing.Union[int, NoneType],\n", - " 'Open Data Channel Type': typing.Union[str, NoneType],\n", - " 'Park Facility Name': typing.Union[str, NoneType],\n", - " 'Park Borough': typing.Union[str, NoneType],\n", - " 'Vehicle Type': typing.Union[str, NoneType],\n", - " 'Taxi Company Borough': typing.Union[str, NoneType],\n", - " 'Taxi Pick Up Location': typing.Union[str, NoneType],\n", - " 'Bridge Highway Name': typing.Union[str, NoneType],\n", - " 'Bridge Highway Direction': typing.Union[str, NoneType],\n", - " 'Road Ramp': typing.Union[str, NoneType],\n", - " 'Bridge Highway Segment': typing.Union[str, NoneType],\n", - " 'Latitude': typing.Union[float, NoneType],\n", - " 'Longitude': typing.Union[float, NoneType],\n", - " 'Location': typing.Union[str, NoneType]}" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dict(zip(ds.columns, ds.types))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "81tBWIvZtW0x" - }, - "source": [ - "Executing a simple query on the input data creates a logical plan under the hood, which then gets optimized into a physical plan together with auto-generated efficient code that gets lowered ultimately to native code optimized for the machine it is executed on." 
- ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "KsSHOrfIvhIO", - "outputId": "cb874465-2a30-48c1-aa0f-e92145be361b" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:logical planner:logical optimization took 0.005175ms\n", - "INFO:codegen:generating pipeline for (Option[i64]) -> (Option[i64]) (1 operator pipelined)\n", - "INFO:codegen:generating lambda function for ((Option[i64])) -> Option[i64]\n", - "WARNING:history server:Could not register job, is history server running? To disable this warning, set webui=False in the context configuration.\n", - "INFO:global:Optimization via LLVM passes took 0.012285 ms\n", - "INFO:global:starting code compilation\n", - "INFO:global:first compile done\n", - "INFO:global:functor Stage_0 retrieved from llvm\n", - "INFO:global:retrieving init/release stage functors\n", - "INFO:global:Compiled code paths for stage 0 in 0.01 ms\n", - "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.0185374s\n", - "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000269s (5 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000376s (0 normal rows, 0 exceptions)\n", - "INFO:driver:[Task Finished] Transform to mem in 0.000557s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000182s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000126s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000113s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000264s (0 normal rows, 0 exceptions)\n", - "INFO:driver:[Task Finished] Transform to mem in 0.000148s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000149s (0 normal rows, 0 
exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000127s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000226s (0 normal rows, 0 exceptions)\n", - "INFO:driver:[Task Finished] Transform to mem in 0.000143s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000192s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000166s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000160s (0 normal rows, 0 exceptions)\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 load&transform tasks in 0.0246091s\n", - "INFO:global:[Transform Stage] Stage 0 total wall clock time: 0.00319552s, 19 input rows, time to process 1 row via fast path: 0.168185ms\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 1.8249e-05s\n", - "INFO:global:[Transform Stage] Stage 0 took 0.0432127s\n", - "INFO:global:Query Execution took 0.0675056s. (planning: 0.0238491s, execution: 0.0436565s)\n", - "INFO:global:Collecting result of 5 rows took 0.000089 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+\n", - "| Unique Key |\n", - "+------------+\n", - "| 19937896 |\n", - "+------------+\n", - "| 19937901 |\n", - "+------------+\n", - "| 19937902 |\n", - "+------------+\n", - "| 19937903 |\n", - "+------------+\n", - "| 19937904 |\n", - "+------------+\n" - ] - } - ], - "source": [ - "ds.selectColumns(['Unique Key']).show(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uPqLcSjJt80V" - }, - "source": [ - "As for every operation, we can retrieve help using Python's builtin documentation featue." 
- ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kgnGE2OYntU4", - "outputId": "93b19289-51c2-49ca-9d50-659af8e2c667" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on method selectColumns in module tuplex.dataset:\n", - "\n", - "selectColumns(columns) method of tuplex.dataset.DataSet instance\n", - " selects a subset of columns as defined through columns which is a list or a single column\n", - " \n", - " Args:\n", - " columns: list of strings or integers. A string should reference a column name, whereas as an integer refers to an index. Indices may be negative according to python rules. Order in list determines output order\n", - " \n", - " Returns:\n", - " tuplex.dataset.DataSet: A Tuplex Dataset object that allows further ETL operations\n", - "\n" - ] - } - ], - "source": [ - "help(ds.selectColumns)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "foF-WYEnuKbm" - }, - "source": [ - "I.e., when looking up the semantics of the `selectColumns` operation, it's also possible to use integers instead of strings to select columns for more flexibility." - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "G07YcJeGuU7K", - "outputId": "ce51b323-45c4-4664-d142-2658266270b2" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:logical planner:logical optimization took 0.003723ms\n", - "INFO:codegen:generating pipeline for (Option[i64],Option[str]) -> (Option[i64],Option[str]) (1 operator pipelined)\n", - "INFO:codegen:generating lambda function for ((Option[i64],Option[str])) -> (Option[i64],Option[str])\n", - "WARNING:history server:Could not register job, is history server running? 
To disable this warning, set webui=False in the context configuration.\n", - "INFO:global:Optimization via LLVM passes took 0.012829 ms\n", - "INFO:global:starting code compilation\n", - "INFO:global:first compile done\n", - "INFO:global:functor Stage_0 retrieved from llvm\n", - "INFO:global:retrieving init/release stage functors\n", - "INFO:global:Compiled code paths for stage 0 in 0.00 ms\n", - "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.0181983s\n", - "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000188s (3 normal rows, 0 exceptions)\n", - "INFO:driver:[Task Finished] Transform to mem in 0.000281s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000489s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000271s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000112s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000106s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000103s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000105s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000107s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000105s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000103s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000104s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000106s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000104s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000104s (0 normal rows, 0 exceptions)\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 
load&transform tasks in 0.0231235s\n", - "INFO:global:[Transform Stage] Stage 0 total wall clock time: 0.00238825s, 17 input rows, time to process 1 row via fast path: 0.140486ms\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 1.3223e-05s\n", - "INFO:global:[Transform Stage] Stage 0 took 0.0413734s\n", - "INFO:global:Query Execution took 0.0624538s. (planning: 0.0207632s, execution: 0.0416907s)\n", - "INFO:global:Collecting result of 3 rows took 0.000076 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+--------------------------+\n", - "| Unique Key | Created Date |\n", - "+------------+--------------------------+\n", - "| 19937896 | '03/01/2011 02:27:57 PM' |\n", - "+------------+--------------------------+\n", - "| 19937901 | '03/01/2011 10:41:13 AM' |\n", - "+------------+--------------------------+\n", - "| 19937902 | '03/01/2011 09:07:45 AM' |\n", - "+------------+--------------------------+\n" - ] - } - ], - "source": [ - "ds.selectColumns([0, 1]).show(3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "O0K0POANzMGJ" - }, - "source": [ - "Let's say, we want to use a slightly more complicated pipeline now. As an initial step, let's first investigate what kind ofcomplaint types there are. To find the corresponding column, we can use the meta-data associated with a dataset and then design a first, exploratory query." 
- ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_LXRLILHzQd5", - "outputId": "445160d3-6514-4c51-a9da-82aac86997f1" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Unique Key | Created Date | Closed Date | Agency | Agency Name\n", - "Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address\n", - "Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2\n", - "Address Type | City | Landmark | Facility Type | Status\n", - "Due Date | Resolution Description | Resolution Action Updated Date | Community Board | BBL\n", - "Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Open Data Channel Type | Park Facility Name\n", - "Park Borough | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name\n", - "Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Latitude | Longitude\n", - "Location\n" - ] - } - ], - "source": [ - "def print_table(arr, break_after=5):\n", - " for i in range(len(arr) // break_after +1):\n", - " print(' | '.join(arr[i * break_after:(i +1)* break_after]))\n", - "\n", - "print_table(ds.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "aW2AjU9r0Jqt", - "outputId": "e049e45c-0cd8-467f-b6ef-16938e53bfda" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:logical planner:logical optimization took 0.003931ms\n", - "INFO:codegen:generating pipeline for (Option[str]) -> (Option[str]) (2 operators pipelined)\n", - "INFO:codegen:generating lambda function for ((Option[str])) -> Option[str]\n", - "WARNING:history server:Could not register job, is history server running? 
To disable this warning, set webui=False in the context configuration.\n", - "INFO:global:provided option to explicitly merge bad rows in order back, however rows will be hashed. Disabling option. To silence this warning, set\n", - "tuplex.optimizer.mergeExceptionsInOrder=false\n", - "INFO:global:Optimization via LLVM passes took 0.010179 ms\n", - "INFO:global:starting code compilation\n", - "INFO:global:first compile done\n", - "INFO:global:functor Stage_0 retrieved from llvm\n", - "INFO:global:retrieving init/release stage functors\n", - "INFO:global:Compiled code paths for stage 0 in 0.00 ms\n", - "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.0143798s\n", - "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.227395s (0 normal rows, 0 exceptions, 129 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.221090s (0 normal rows, 0 exceptions, 137 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.216516s (0 normal rows, 0 exceptions, 128 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.226671s (0 normal rows, 0 exceptions, 136 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.209629s (0 normal rows, 0 exceptions, 200 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.224845s (0 normal rows, 0 exceptions, 145 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.217637s (0 normal rows, 0 exceptions, 142 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.219661s (0 normal rows, 0 exceptions, 142 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.226369s (0 normal rows, 0 exceptions, 143 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.216218s (0 normal rows, 0 exceptions, 138 
buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.213966s (0 normal rows, 0 exceptions, 140 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.217093s (0 normal rows, 0 exceptions, 135 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.245751s (0 normal rows, 0 exceptions, 123 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.267397s (0 normal rows, 0 exceptions, 117 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.362738s (0 normal rows, 0 exceptions, 126 buckets)\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 load&transform tasks in 1.93791s\n", - "INFO:global:[Transform Stage] Stage 0 total wall clock time: 3.51298s, 1907219 input rows, time to process 1 row via fast path: 0.00184194ms\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 0.00117526s\n", - "INFO:global:[Transform Stage] Stage 0 took 1.95351s\n", - "INFO:global:[Transform Stage] skipped stage 1 because there is nothing todo here.\n", - "INFO:global:Query Execution took 1.99631s. 
(planning: 0.0233447s, execution: 1.97296s)\n", - "INFO:python:Data transfer back to Python took 0.000431 seconds\n" - ] - } - ], - "source": [ - "complaint_types = ds.selectColumns(['Complaint Type']).unique().collect()" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "JaN8v0Zu0hpT", - "outputId": "276d3df0-9b65-486b-d7ab-6f1246064cb1" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['Mosquitoes', 'DOF Parking - Payment Issue', 'DOF Property - Update Account', 'Street Condition', 'Trans Fat', 'Plumbing', 'Benefit Card Replacement', 'DOF Parking - Address Update', 'Non-Emergency Police Matter', 'Harboring Bees/Wasps', 'Home Delivered Meal - Missed Delivery', 'HPD Literature Request', 'Health', 'Beach/Pool/Sauna Complaint', 'Unsanitary Animal Facility', 'Ferry Complaint', 'Illegal Parking', 'Drug Activity', 'DRIE', 'Dead/Dying Tree', 'Overflowing Litter Baskets', 'Unleashed Dog', 'BEST/Site Safety', 'Vending', 'Sidewalk Condition', 'Highway Sign - Damaged', 'Bridge Condition', 'Public Payphone Complaint', 'Overgrown Tree/Branches', 'Broken Parking Meter', 'Animal-Abuse', 'Taxi Complaint', 'Green Taxi Complaint', 'NONCONST', 'Abandoned Vehicle', 'Noise - Commercial', 'Noise - Helicopter', 'New Tree Request', 'Noise', 'Illegal Fireworks', 'X-Ray Machine/Equipment', 'Discipline and Suspension', 'Animal in a Park', 'Transportation Provider Complaint', 'Tattooing', 'Hazardous Materials', 'Homeless Street Condition', 'WATER LEAK', 'HEAP Assistance', 'Lifeguard', 'Bus Stop Shelter Placement', 'Tanning', 'Bottled Water', 'GENERAL', \"Alzheimer's Care\", 'DHS Advantage - Third Party', 'Construction Lead Dust', 'Highway Sign - Missing', 'Sweeping/Missed-Inadequate', 'DPR Internal', 'Curb Condition', 'Noise - House of Worship', 'Window Guard', 'Sanitation Condition', 'PAINT/PLASTER', 'Radioactive Material', 'Summer Camp', 'For Hire Vehicle 
Report', 'Panhandling', 'Legal Services Provider Complaint', 'Unsanitary Animal Pvt Property', 'Bike Rack Condition', 'Weatherization', 'Home Delivered Meal Complaint', 'Emergency Response Team (ERT)', 'Illegal Animal Kept as Pet', 'DOF Property - Request Copy', 'Animal Facility - No Permit', 'Utility Program', 'Squeegee', 'DEP Street Condition', 'Home Care Provider Complaint', 'Investigations and Discipline (IAD)', 'SCRIE', 'Electrical', 'Urinating in Public', 'PAINT - PLASTER', 'Homeless Person Assistance', 'Water Conservation', 'Noise - Residential', 'Street Sign - Missing', 'City Vehicle Placard Complaint', 'OUTSIDE BUILDING', 'FLOORING/STAIRS', 'Portable Toilet', 'Housing Options', 'Building Marshals office', 'DOF Parking - Request Copy', 'Posting Advertisement', 'Parent Leadership', 'Sewer', 'Construction', 'DOOR/WINDOW', 'Parking Card', 'Recycling Enforcement', 'LinkNYC', 'Mobile Food Vendor', 'SAFETY', 'Taxpayer Advocate Inquiry', 'Special Projects Inspection Team (SPIT)', 'DHS Income Savings Requirement', 'Food Establishment', 'Drinking', 'ELECTRIC', 'Bereavement Support Group', 'DOF Parking - Tax Exemption', 'Derelict Vehicles', 'Water System', 'DHS Advantage -Landlord/Broker', 'Pet Shop', 'For Hire Vehicle Complaint', 'Boilers', 'Building Condition', 'Traffic Signal Condition', 'Street Sign - Dangling', 'Other Enforcement', 'Highway Condition', 'UNSANITARY CONDITION', 'Ferry Inquiry', 'HEAT/HOT WATER', 'Graffiti', 'Blocked Driveway', 'Bus Stop Shelter Complaint', 'DOF Parking - Request Status', 'School Maintenance', 'ATF', 'Case Management Agency Complaint', 'Home Repair', 'Rodent', 'Lost Property', 'OEM Literature Request', 'Industrial Waste', 'Public Toilet', 'Tunnel Condition', 'ELEVATOR', 'Violation of Park Rules', 'Taxi Compliment', 'Indoor Air Quality', 'Damaged Tree', 'Noise Survey', 'Cranes and Derricks', 'Derelict Vehicle', 'Safety', 'Elevator', 'DOF Property - Property Value', 'DOF Property - Reduction Issue', 'Illegal Animal Sold', 'Highway 
Sign - Dangling', 'Disorderly Youth', 'PLUMBING', 'Calorie Labeling', 'Housing - Low Income Senior', 'Illegal Tree Damage', 'Found Property', 'Municipal Parking Facility', 'Missed Collection (All Materials)', 'DOF Parking - DMV Clearance', 'Elder Abuse', 'Food Poisoning', 'Broken Muni Meter', 'Mold', 'APPLIANCE', 'NORC Complaint', 'Registration and Transfers', 'General Construction/Plumbing', 'Special Natural Area District (SNAD)', 'Construction Safety Enforcement', 'Indoor Sewage', 'Building/Use', 'DHS Advantage - Tenant', 'Bike/Roller/Skate Chronic', 'Noise - Street/Sidewalk', 'Teaching/Learning/Instruction', 'Borough Office', 'Drinking Water', 'Noise - Park', 'DCA / DOH New License Application Request', 'Non-Residential Heat', 'Day Care', 'Unsanitary Pigeon Condition', 'Air Quality', 'Noise - Vehicle', 'Dead Tree', 'Dirty Conditions', 'Consumer Complaint', 'Homeless Encampment', 'DOF Property - Payment Issue', 'Executive Inspections', 'Water Quality', 'Smoking', 'Sustainability Enforcement', 'DOF Property - City Rebate', 'Lead', 'Asbestos', 'Animal Abuse', 'Street Light Condition', 'Root/Sewer/Sidewalk Condition', 'Traffic', 'Street Sign - Damaged', 'DOF Property - Owner Issue', 'Advocate-Personal Exemptions', 'Senior Center Complaint', 'Vacant Lot', 'DOF Property - RPIE Issue', 'Taxi Report', 'Standing Water', 'Derelict Bicycle', 'Poison Ivy', 'Scaffold Safety', 'Maintenance or Facility', 'Ferry Permit', 'Snow']\n" - ] - } - ], - "source": [ - "print(complaint_types)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rV8aG_rV0lAE" - }, - "source": [ - "Looking at the data, we see that there are some complaints regarding mosquitoes. Likely, because it gets quite hot and humid in summer in New York City! Can the data back this up?\n", - "\n", - "To find out, let's plot the number of mosquito complaints per month for the last year. 
A helpful function for aggregating the results is `aggregateByKey`:" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "l-5hyAme0xga", - "outputId": "1b916d6d-72b5-4962-9408-22aa343df1e5" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on method aggregateByKey in module tuplex.dataset:\n", - "\n", - "aggregateByKey(combine, aggregate, initial_value, key_columns) method of tuplex.dataset.DataSet instance\n", - " An experimental aggregateByKey function similar to aggregate. There are several scenarios that do not work with this function yet and its performance hasn't been properly\n", - " optimized either. Data is grouped by the supplied key_columns. Then, for each group a new aggregate is initialized using the initial_value, which can be thought of as a neutral value.\n", - " The aggregate function is then called for each element and the current aggregate structure. It is guaranteed that the combine function is called at least once per group by applying the initial_value to the aggregate.\n", - " Args:\n", - " combine: a UDF to combine two aggregates (results of the aggregate function or the initial_value). E.g., cobmine = lambda agg1, agg2: agg1 + agg2. The initial value should be the neutral element.\n", - " aggregate: a UDF which produces a result by combining a value with the aggregate initialized by initial_value. E.g., aggreagte = lambda agg, value: agg + value sums up values.\n", - " initial_value: a neutral initial value.\n", - " key_columns: the columns to group the aggregate by, a sequence of a mix of strings or integers. 
If specified as a single string or number, aggregation is over a single column.\n", - " Returns:\n", - " Dataset\n", - "\n" - ] - } - ], - "source": [ - "help(ds.aggregateByKey)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aYeMEv-81nQT" - }, - "source": [ - "Next, let's use a UDF to extract the month and year of the complaint and limit the search to complain types so Tuplex automatically processes fewer rows." - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "gvPtyBDE1m1_", - "outputId": "bb111e74-4122-44de-cc38-681504fded6e" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:logical planner:logical optimization took 0.004637ms\n", - "INFO:codegen:generating pipeline for (Option[str]) -> (Option[str]) (1 operator pipelined)\n", - "INFO:codegen:generating lambda function for ((Option[str])) -> Option[str]\n", - "WARNING:history server:Could not register job, is history server running? 
To disable this warning, set webui=False in the context configuration.\n", - "INFO:global:Optimization via LLVM passes took 0.010873 ms\n", - "INFO:global:starting code compilation\n", - "INFO:global:first compile done\n", - "INFO:global:functor Stage_0 retrieved from llvm\n", - "INFO:global:retrieving init/release stage functors\n", - "INFO:global:Compiled code paths for stage 0 in 0.00 ms\n", - "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.0160895s\n", - "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000198s (5 normal rows, 0 exceptions)\n", - "INFO:driver:[Task Finished] Transform to mem in 0.000220s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000408s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000380s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000185s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000112s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000107s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000107s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000149s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000450s (0 normal rows, 0 exceptions)\n", - "INFO:driver:[Task Finished] Transform to mem in 0.000231s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000239s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000111s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000187s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.000107s (0 normal rows, 0 exceptions)\n", - "INFO:global:[Transform Stage] Stage 0 completed 
15 load&transform tasks in 0.0214662s\n", - "INFO:global:[Transform Stage] Stage 0 total wall clock time: 0.0031914s, 19 input rows, time to process 1 row via fast path: 0.167968ms\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 1.9245e-05s\n", - "INFO:global:[Transform Stage] Stage 0 took 0.0376218s\n", - "INFO:global:Query Execution took 0.0492011s. (planning: 0.0112488s, execution: 0.0379523s)\n", - "INFO:global:Collecting result of 5 rows took 0.000101 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------------+\n", - "| Created Date |\n", - "+--------------------------+\n", - "| '03/01/2011 02:27:57 PM' |\n", - "+--------------------------+\n", - "| '03/01/2011 10:41:13 AM' |\n", - "+--------------------------+\n", - "| '03/01/2011 09:07:45 AM' |\n", - "+--------------------------+\n", - "| '03/01/2011 05:39:26 PM' |\n", - "+--------------------------+\n", - "| '03/01/2011 11:08:14 AM' |\n", - "+--------------------------+\n" - ] - } - ], - "source": [ - "ds.selectColumns(['Created Date']).show(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "A3sguZZ30xiq", - "outputId": "0774a0ae-478b-4aba-9fc9-e3e6d39da590" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:logical planner:logical optimization took 0.015383ms\n", - "INFO:codegen:generating pipeline for (Option[str],Option[str]) -> (i64,i64,Option[str]) (5 operators pipelined)\n", - "INFO:codegen:generating lambda function for ((Option[str],Option[str])) -> boolean\n", - "INFO:codegen:generating function extract_month for ((Option[str],Option[str])) -> i64\n", - "INFO:codegen:generating function extract_year for ((Option[str],Option[str],i64)) -> i64\n", - "INFO:codegen:generating lambda function for ((Option[str],Option[str],i64,i64)) -> boolean\n", - "INFO:codegen:generating lambda function for 
((Option[str],Option[str],i64,i64)) -> (i64,i64,Option[str])\n", - "WARNING:history server:Could not register job, is history server running? To disable this warning, set webui=False in the context configuration.\n", - "INFO:global:Optimization via LLVM passes took 0.033716 ms\n", - "INFO:global:starting code compilation\n", - "INFO:global:first compile done\n", - "INFO:global:functor Stage_0 retrieved from llvm\n", - "INFO:global:retrieving init/release stage functors\n", - "INFO:global:Compiled code paths for stage 0 in 0.01 ms\n", - "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.0496439s\n", - "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.219055s (0 normal rows, 0 exceptions)\n", - "INFO:driver:[Task Finished] Transform to mem in 0.221205s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.217267s (0 normal rows, 0 exceptions)\n", - "INFO:driver:[Task Finished] Transform to mem in 0.231919s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.206888s (4 normal rows, 0 exceptions)\n", - "INFO:driver:[Task Finished] Transform to mem in 0.211544s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.213482s (0 normal rows, 0 exceptions)\n", - "INFO:driver:[Task Finished] Transform to mem in 0.210397s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.232479s (0 normal rows, 0 exceptions)\n", - "INFO:driver:[Task Finished] Transform to mem in 0.236717s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.207968s (0 normal rows, 0 exceptions)\n", - "INFO:driver:[Task Finished] Transform to mem in 0.214165s (0 normal rows, 0 exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.221885s (0 normal rows, 0 exceptions)\n", - "INFO:driver:[Task Finished] Transform to mem in 0.211296s (0 normal rows, 0 
exceptions)\n", - "INFO:E/1:[Task Finished] Transform to mem in 0.359222s (0 normal rows, 0 exceptions)\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 load&transform tasks in 1.87937s\n", - "INFO:global:[Transform Stage] Stage 0 total wall clock time: 3.41549s, 1907219 input rows, time to process 1 row via fast path: 0.00179082ms\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 2.5592e-05s\n", - "INFO:global:[Transform Stage] Stage 0 took 1.92908s\n", - "INFO:global:Query Execution took 1.965s. (planning: 0.0355522s, execution: 1.92945s)\n", - "INFO:global:Collecting result of 4 rows took 0.000079 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-------+------+----------------+\n", - "| Month | Year | Complaint Type |\n", - "+-------+------+----------------+\n", - "| 12 | 2019 | 'Mosquitoes' |\n", - "+-------+------+----------------+\n", - "| 12 | 2019 | 'Mosquitoes' |\n", - "+-------+------+----------------+\n", - "| 12 | 2019 | 'Mosquitoes' |\n", - "+-------+------+----------------+\n", - "| 7 | 2019 | 'Mosquitoes' |\n", - "+-------+------+----------------+\n" - ] - } - ], - "source": [ - "year_to_investigate = 2019\n", - "\n", - "def extract_month(row):\n", - " date = row['Created Date']\n", - " date = date[:date.find(' ')]\n", - " return int(date.split('/')[0])\n", - "\n", - "def extract_year(row):\n", - " date = row['Created Date']\n", - " date = date[:date.find(' ')]\n", - " return int(date.split('/')[-1])\n", - "\n", - "ds2 = ds.withColumn('Month', extract_month) \\\n", - " .withColumn('Year', extract_year) \\\n", - " .filter(lambda row: 'Mosquito' in row['Complaint Type']) \\\n", - " .filter(lambda row: row['Year'] == year_to_investigate) \\\n", - " .selectColumns(['Month', 'Year', 'Complaint Type'])\n", - "\n", - "\n", - "ds2.show(5)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wENeFNWEejM8" - }, - "source": [ - "We can now use the aggregateByKey function to 
count the number of mosquito complaints per month in 2019." - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "joNXIjhSeh0i", - "outputId": "415a3fb4-0490-4594-a0f8-f33da61213aa" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:logical planner:logical optimization took 0.018371ms\n", - "INFO:codegen:generating pipeline for (Option[str],Option[str]) -> (i64,i64) (6 operators pipelined)\n", - "INFO:codegen:generating lambda function for ((Option[str],Option[str])) -> boolean\n", - "INFO:codegen:generating function extract_month for ((Option[str],Option[str])) -> i64\n", - "INFO:codegen:generating function extract_year for ((Option[str],Option[str],i64)) -> i64\n", - "INFO:codegen:generating lambda function for ((Option[str],Option[str],i64,i64)) -> boolean\n", - "INFO:codegen:generating lambda function for ((Option[str],Option[str],i64,i64)) -> (i64,i64,Option[str])\n", - "INFO:codegen:generating function combine_udf for (i64,i64) -> i64\n", - "INFO:codegen:generating function aggregate_udf for (i64,(i64,i64,Option[str])) -> i64\n", - "WARNING:history server:Could not register job, is history server running? To disable this warning, set webui=False in the context configuration.\n", - "INFO:global:provided option to explicitly merge bad rows in order back, however rows will be hashed. Disabling option. 
To silence this warning, set\n", - "tuplex.optimizer.mergeExceptionsInOrder=false\n", - "INFO:global:Optimization via LLVM passes took 0.037240 ms\n", - "INFO:global:starting code compilation\n", - "INFO:global:first compile done\n", - "INFO:global:functor Stage_0 retrieved from llvm\n", - "INFO:global:retrieving init/release stage functors\n", - "INFO:global:Compiled code paths for stage 0 in 0.02 ms\n", - "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.0570446s\n", - "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.227490s (0 normal rows, 0 exceptions, 0 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.220710s (0 normal rows, 0 exceptions, 0 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.216524s (0 normal rows, 0 exceptions, 0 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.214208s (0 normal rows, 0 exceptions, 0 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.211271s (0 normal rows, 0 exceptions, 2 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.233693s (0 normal rows, 0 exceptions, 0 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.207874s (0 normal rows, 0 exceptions, 0 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.211661s (0 normal rows, 0 exceptions, 0 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.213264s (0 normal rows, 0 exceptions, 0 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.234989s (0 normal rows, 0 exceptions, 0 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.206323s (0 normal rows, 0 exceptions, 0 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.227720s (0 normal 
rows, 0 exceptions, 0 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.235625s (0 normal rows, 0 exceptions, 0 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.220421s (0 normal rows, 0 exceptions, 0 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.343307s (0 normal rows, 0 exceptions, 0 buckets)\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 load&transform tasks in 1.86291s\n", - "INFO:global:[Transform Stage] Stage 0 total wall clock time: 3.42508s, 1907219 input rows, time to process 1 row via fast path: 0.00179585ms\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 0.00101284s\n", - "INFO:global:[Transform Stage] Stage 0 took 1.921s\n", - "INFO:global:[Transform Stage] skipped stage 1 because there is nothing todo here.\n", - "INFO:global:Query Execution took 1.96409s. (planning: 0.0426338s, execution: 1.92146s)\n", - "INFO:global:Collecting result of 2 rows took 0.000038 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-------+---+\n", - "| Month | |\n", - "+-------+---+\n", - "| 12 | 3 |\n", - "+-------+---+\n", - "| 7 | 1 |\n", - "+-------+---+\n" - ] - } - ], - "source": [ - "def combine_udf(a, b):\n", - " return a + b\n", - "\n", - "def aggregate_udf(agg, row):\n", - " return agg + 1\n", - "\n", - "ds2.aggregateByKey(combine_udf, aggregate_udf, 0, [\"Month\"]).show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kY_4W26xhmYZ" - }, - "source": [ - "Yet, it seems that mosquito complaints are actually not that common. In total there are 4 complaints for the whole year, of which 3 are in December. 
Thus we actually can't draw with such little support any meaningful conclusions about mosquitos in NYC from the 311 dataset.\n", - "\n", - "Let's step back and check actually, what kind of complaint is actually the most common:" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "RSV4XLHgh_gC", - "outputId": "62a20ea1-bdab-4758-9a55-1251f8cff21e" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:logical planner:logical optimization took 0.002201ms\n", - "INFO:codegen:generating pipeline for (Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[i64],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[f64],Option[f64],Option[str]) -> (Option[str],i64) (1 operator pipelined)\n", - "INFO:codegen:generating function combine_udf for (i64,i64) -> i64\n", - "INFO:codegen:generating function aggregate_udf for (i64,(Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[i64],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[f64],Option[f64],Option[str])) -> i64\n", - "WARNING:history server:Could not register job, is history server running? 
To disable this warning, set webui=False in the context configuration.\n", - "INFO:global:provided option to explicitly merge bad rows in order back, however rows will be hashed. Disabling option. To silence this warning, set\n", - "tuplex.optimizer.mergeExceptionsInOrder=false\n", - "INFO:global:Optimization via LLVM passes took 0.091600 ms\n", - "INFO:global:starting code compilation\n", - "INFO:global:first compile done\n", - "INFO:global:functor Stage_0 retrieved from llvm\n", - "INFO:global:retrieving init/release stage functors\n", - "INFO:global:Compiled code paths for stage 0 in 0.03 ms\n", - "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.127125s\n", - "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.284140s (0 normal rows, 61 exceptions, 129 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.276317s (0 normal rows, 31 exceptions, 137 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.272109s (0 normal rows, 32 exceptions, 136 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.272250s (0 normal rows, 62 exceptions, 128 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.266658s (0 normal rows, 33 exceptions, 200 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.271441s (0 normal rows, 46 exceptions, 145 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.271498s (0 normal rows, 60 exceptions, 142 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.273922s (0 normal rows, 48 exceptions, 142 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.282784s (0 normal rows, 41 exceptions, 143 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.282431s (0 normal rows, 1 exception, 138 
buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.282806s (0 normal rows, 5 exceptions, 140 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.275421s (0 normal rows, 15 exceptions, 135 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.274954s (0 normal rows, 4 exceptions, 123 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.279680s (0 normal rows, 37 exceptions, 117 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.433613s (0 normal rows, 34 exceptions, 126 buckets)\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 load&transform tasks in 2.3697s\n", - "INFO:global:[Transform Stage] Stage 0 total wall clock time: 4.30002s, 1907219 input rows, time to process 1 row via fast path: 0.0022546ms\n", - "INFO:global:Exception details: \n", - "+------------+-------------------------------------+-------+\n", - "| OperatorID | Exception | Count |\n", - "+------------+-------------------------------------+-------+\n", - "| 100135 | tuplex.internal.BadParseStringInput | 510 |\n", - "+------------+-------------------------------------+-------+\n", - "INFO:local ee:created combined normal-case result in 0.001234s\n", - "INFO:local ee:compiled pure python pipeline in 0.001353s\n", - "INFO:local ee:creating hybrid intermediates took 0.000003s\n", - "INFO:local ee:Created 15 resolve tasks in 0.000494s\n", - "INFO:local ee:15/15 tasks require executing the slow path.\n", - "INFO:E/1:[Task Finished] Resolve in 0.032112s\n", - "INFO:E/1:[Task Finished] Resolve in 0.012562s\n", - "INFO:E/1:[Task Finished] Resolve in 0.033372s\n", - "INFO:driver:[Task Finished] Resolve in 0.072717s\n", - "INFO:E/1:[Task Finished] Resolve in 0.031012s\n", - "INFO:driver:[Task Finished] Resolve in 0.032600s\n", - "INFO:driver:[Task Finished] Resolve in 0.044743s\n", - "INFO:E/1:[Task Finished] Resolve in 0.065935s\n", - "INFO:E/1:[Task 
Finished] Resolve in 0.009902s\n", - "INFO:driver:[Task Finished] Resolve in 0.030727s\n", - "INFO:E/1:[Task Finished] Resolve in 0.013373s\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Housing - Low Income Senior\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Housing - Low Income Senior\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Sidewalk Condition\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Broken Parking Meter\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Broken Muni Meter\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=SCRIE\n", - "setdefault w. key=Housing Options\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Housing Options\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Food Establishment\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Sidewalk Condition\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Housing Options\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Curb Condition\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Street Condition\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=SCRIE\n", - "setdefault w. key=SCRIE\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Housing - Low Income Senior\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=SCRIE\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Housing - Low Income Senior\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=SCRIE\n", - "setdefault w. 
key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Housing Options\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Damaged Tree\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Sidewalk Condition\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Housing Options\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Broken Muni Meter\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Alzheimer's Care\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Sidewalk Condition\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Alzheimer's Care\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=SCRIE\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Housing - Low Income Senior\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:driver:[Task Finished] Resolve in 0.011660s\n", - "INFO:E/1:[Task Finished] Resolve in 0.015524s\n", - "INFO:driver:[Task Finished] Resolve in 0.025153s\n", - "INFO:E/1:[Task Finished] Resolve in 0.030859s\n", - "INFO:local ee:slow path resolved 510/510 exceptions in 0.248894s\n", - "INFO:local ee:slow path for Stage 0: total wall clock time: 0.462249s, time to process 1 row via slow path: 0.906371ms\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 resolve tasks in 0.248915s\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 0.00174519s\n", - "INFO:global:[Transform Stage] Stage 0 took 2.74754s\n", - "INFO:global:[Transform Stage] skipped stage 1 because there is nothing todo here.\n", - "INFO:global:Query Execution took 2.7727s. (planning: 0.0225033s, execution: 2.75019s)\n", - "INFO:global:Collecting result of 222 rows took 0.001815 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. 
key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Housing Options\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Housing - Low Income Senior\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Food Establishment\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=SCRIE\n", - "setdefault w. key=SCRIE\n", - "setdefault w. 
key=Consumer Complaint\n", - "+---------------------------------------------+--------+\n", - "| Complaint Type | |\n", - "+---------------------------------------------+--------+\n", - "| 'Mosquitoes' | 4 |\n", - "+---------------------------------------------+--------+\n", - "| 'DOF Parking - Payment Issue' | 19487 |\n", - "+---------------------------------------------+--------+\n", - "| 'DOF Property - Update Account' | 65 |\n", - "+---------------------------------------------+--------+\n", - "| 'Street Condition' | 95585 |\n", - "+---------------------------------------------+--------+\n", - "| 'Trans Fat' | 6 |\n", - "+---------------------------------------------+--------+\n", - "| 'Plumbing' | 85 |\n", - "+---------------------------------------------+--------+\n", - "| 'Benefit Card Replacement' | 41 |\n", - "+---------------------------------------------+--------+\n", - "| 'DOF Parking - Address Update' | 1 |\n", - "+---------------------------------------------+--------+\n", - "| 'Non-Emergency Police Matter' | 13897 |\n", - "+---------------------------------------------+--------+\n", - "| 'Harboring Bees/Wasps' | 502 |\n", - "+---------------------------------------------+--------+\n", - "| 'Home Delivered Meal - Missed Delivery' | 265 |\n", - "+---------------------------------------------+--------+\n", - "| 'HPD Literature Request' | 3844 |\n", - "+---------------------------------------------+--------+\n", - "| 'Health' | 2 |\n", - "+---------------------------------------------+--------+\n", - "| 'Beach/Pool/Sauna Complaint' | 517 |\n", - "+---------------------------------------------+--------+\n", - "| 'Unsanitary Animal Facility' | 154 |\n", - "+---------------------------------------------+--------+\n", - "| 'Ferry Complaint' | 391 |\n", - "+---------------------------------------------+--------+\n", - "| 'Illegal Parking' | 117805 |\n", - "+---------------------------------------------+--------+\n", - "| 'Drug Activity' | 70 |\n", - 
"+---------------------------------------------+--------+\n", - "| 'DRIE' | 4 |\n", - "+---------------------------------------------+--------+\n", - "| 'Dead/Dying Tree' | 132 |\n", - "+---------------------------------------------+--------+\n", - "| 'Overflowing Litter Baskets' | 7 |\n", - "+---------------------------------------------+--------+\n", - "| 'Unleashed Dog' | 1969 |\n", - "+---------------------------------------------+--------+\n", - "| 'BEST/Site Safety' | 8 |\n", - "+---------------------------------------------+--------+\n", - "| 'Vending' | 11235 |\n", - "+---------------------------------------------+--------+\n", - "| 'Sidewalk Condition' | 56500 |\n", - "+---------------------------------------------+--------+\n", - "| 'Highway Sign - Damaged' | 187 |\n", - "+---------------------------------------------+--------+\n", - "| 'Bridge Condition' | 1415 |\n", - "+---------------------------------------------+--------+\n", - "| 'Public Payphone Complaint' | 2527 |\n", - "+---------------------------------------------+--------+\n", - "| 'Overgrown Tree/Branches' | 36842 |\n", - "+---------------------------------------------+--------+\n", - "| 'Broken Parking Meter' | 7377 |\n", - "+---------------------------------------------+--------+\n", - "| 'Animal-Abuse' | 184 |\n", - "+---------------------------------------------+--------+\n", - "| 'Taxi Complaint' | 45876 |\n", - "+---------------------------------------------+--------+\n", - "| 'Green Taxi Complaint' | 9 |\n", - "+---------------------------------------------+--------+\n", - "| 'NONCONST' | 1 |\n", - "+---------------------------------------------+--------+\n", - "| 'Abandoned Vehicle' | 884 |\n", - "+---------------------------------------------+--------+\n", - "| 'Noise - Commercial' | 71607 |\n", - "+---------------------------------------------+--------+\n", - "| 'Noise - Helicopter' | 2049 |\n", - "+---------------------------------------------+--------+\n", - "| 'New Tree Request' 
| 42969 |\n", - "+---------------------------------------------+--------+\n", - "| 'Noise' | 1114 |\n", - "+---------------------------------------------+--------+\n", - "| 'Illegal Fireworks' | 770 |\n", - "+---------------------------------------------+--------+\n", - "| 'X-Ray Machine/Equipment' | 43 |\n", - "+---------------------------------------------+--------+\n", - "| 'Discipline and Suspension' | 4 |\n", - "+---------------------------------------------+--------+\n", - "| 'Animal in a Park' | 3683 |\n", - "+---------------------------------------------+--------+\n", - "| 'Transportation Provider Complaint' | 49 |\n", - "+---------------------------------------------+--------+\n", - "| 'Tattooing' | 263 |\n", - "+---------------------------------------------+--------+\n", - "| 'Hazardous Materials' | 59 |\n", - "+---------------------------------------------+--------+\n", - "| 'Homeless Street Condition' | 36 |\n", - "+---------------------------------------------+--------+\n", - "| 'WATER LEAK' | 265 |\n", - "+---------------------------------------------+--------+\n", - "| 'HEAP Assistance' | 545 |\n", - "+---------------------------------------------+--------+\n", - "| 'Lifeguard' | 31 |\n", - "+---------------------------------------------+--------+\n", - "| 'Bus Stop Shelter Placement' | 749 |\n", - "+---------------------------------------------+--------+\n", - "| 'Tanning' | 2 |\n", - "+---------------------------------------------+--------+\n", - "| 'Bottled Water' | 37 |\n", - "+---------------------------------------------+--------+\n", - "| 'GENERAL' | 255 |\n", - "+---------------------------------------------+--------+\n", - "| 'Alzheimer's Care' | 727 |\n", - "+---------------------------------------------+--------+\n", - "| 'DHS Advantage - Third Party' | 315 |\n", - "+---------------------------------------------+--------+\n", - "| 'Construction Lead Dust' | 19 |\n", - "+---------------------------------------------+--------+\n", - "| 
'Highway Sign - Missing' | 122 |\n", - "+---------------------------------------------+--------+\n", - "| 'Sweeping/Missed-Inadequate' | 5 |\n", - "+---------------------------------------------+--------+\n", - "| 'DPR Internal' | 702 |\n", - "+---------------------------------------------+--------+\n", - "| 'Curb Condition' | 7988 |\n", - "+---------------------------------------------+--------+\n", - "| 'Noise - House of Worship' | 3140 |\n", - "+---------------------------------------------+--------+\n", - "| 'Window Guard' | 410 |\n", - "+---------------------------------------------+--------+\n", - "| 'Sanitation Condition' | 189 |\n", - "+---------------------------------------------+--------+\n", - "| 'PAINT/PLASTER' | 259 |\n", - "+---------------------------------------------+--------+\n", - "| 'Radioactive Material' | 32 |\n", - "+---------------------------------------------+--------+\n", - "| 'Summer Camp' | 218 |\n", - "+---------------------------------------------+--------+\n", - "| 'For Hire Vehicle Report' | 17 |\n", - "+---------------------------------------------+--------+\n", - "| 'Panhandling' | 595 |\n", - "+---------------------------------------------+--------+\n", - "| 'Legal Services Provider Complaint' | 61 |\n", - "+---------------------------------------------+--------+\n", - "| 'Unsanitary Animal Pvt Property' | 2076 |\n", - "+---------------------------------------------+--------+\n", - "| 'Bike Rack Condition' | 381 |\n", - "+---------------------------------------------+--------+\n", - "| 'Weatherization' | 488 |\n", - "+---------------------------------------------+--------+\n", - "| 'Home Delivered Meal Complaint' | 146 |\n", - "+---------------------------------------------+--------+\n", - "| 'Emergency Response Team (ERT)' | 89 |\n", - "+---------------------------------------------+--------+\n", - "| 'Illegal Animal Kept as Pet' | 548 |\n", - "+---------------------------------------------+--------+\n", - "| 'DOF Property - 
Request Copy' | 12693 |\n", - "+---------------------------------------------+--------+\n", - "| 'Animal Facility - No Permit' | 121 |\n", - "+---------------------------------------------+--------+\n", - "| 'Utility Program' | 587 |\n", - "+---------------------------------------------+--------+\n", - "| 'Squeegee' | 16 |\n", - "+---------------------------------------------+--------+\n", - "| 'DEP Street Condition' | 1 |\n", - "+---------------------------------------------+--------+\n", - "| 'Home Care Provider Complaint' | 56 |\n", - "+---------------------------------------------+--------+\n", - "| 'Investigations and Discipline (IAD)' | 3 |\n", - "+---------------------------------------------+--------+\n", - "| 'SCRIE' | 30933 |\n", - "+---------------------------------------------+--------+\n", - "| 'Electrical' | 24 |\n", - "+---------------------------------------------+--------+\n", - "| 'Urinating in Public' | 1154 |\n", - "+---------------------------------------------+--------+\n", - "| 'PAINT - PLASTER' | 1 |\n", - "+---------------------------------------------+--------+\n", - "| 'Homeless Person Assistance' | 354 |\n", - "+---------------------------------------------+--------+\n", - "| 'Water Conservation' | 35 |\n", - "+---------------------------------------------+--------+\n", - "| 'Noise - Residential' | 404057 |\n", - "+---------------------------------------------+--------+\n", - "| 'Street Sign - Missing' | 12170 |\n", - "+---------------------------------------------+--------+\n", - "| 'City Vehicle Placard Complaint' | 13 |\n", - "+---------------------------------------------+--------+\n", - "| 'OUTSIDE BUILDING' | 3 |\n", - "+---------------------------------------------+--------+\n", - "| 'FLOORING/STAIRS' | 88 |\n", - "+---------------------------------------------+--------+\n", - "| 'Portable Toilet' | 4 |\n", - "+---------------------------------------------+--------+\n", - "| 'Housing Options' | 3744 |\n", - 
"+---------------------------------------------+--------+\n", - "| 'Building Marshals office' | 5 |\n", - "+---------------------------------------------+--------+\n", - "| 'DOF Parking - Request Copy' | 2411 |\n", - "+---------------------------------------------+--------+\n", - "| 'Posting Advertisement' | 448 |\n", - "+---------------------------------------------+--------+\n", - "| 'Parent Leadership' | 1 |\n", - "+---------------------------------------------+--------+\n", - "| 'Sewer' | 848 |\n", - "+---------------------------------------------+--------+\n", - "| 'Construction' | 14876 |\n", - "+---------------------------------------------+--------+\n", - "| 'DOOR/WINDOW' | 244 |\n", - "+---------------------------------------------+--------+\n", - "| 'Parking Card' | 113 |\n", - "+---------------------------------------------+--------+\n", - "| 'Recycling Enforcement' | 4 |\n", - "+---------------------------------------------+--------+\n", - "| 'LinkNYC' | 4 |\n", - "+---------------------------------------------+--------+\n", - "| 'Mobile Food Vendor' | 29 |\n", - "+---------------------------------------------+--------+\n", - "| 'SAFETY' | 80 |\n", - "+---------------------------------------------+--------+\n", - "| 'Taxpayer Advocate Inquiry' | 1 |\n", - "+---------------------------------------------+--------+\n", - "| 'Special Projects Inspection Team (SPIT)' | 78 |\n", - "+---------------------------------------------+--------+\n", - "| 'DHS Income Savings Requirement' | 91 |\n", - "+---------------------------------------------+--------+\n", - "| 'Food Establishment' | 19238 |\n", - "+---------------------------------------------+--------+\n", - "| 'Drinking' | 3735 |\n", - "+---------------------------------------------+--------+\n", - "| 'ELECTRIC' | 220 |\n", - "+---------------------------------------------+--------+\n", - "| 'Bereavement Support Group' | 74 |\n", - "+---------------------------------------------+--------+\n", - "| 'DOF Parking 
- Tax Exemption' | 1110 |\n", - "+---------------------------------------------+--------+\n", - "| 'Derelict Vehicles' | 779 |\n", - "+---------------------------------------------+--------+\n", - "| 'Water System' | 1260 |\n", - "+---------------------------------------------+--------+\n", - "| 'DHS Advantage -Landlord/Broker' | 11752 |\n", - "+---------------------------------------------+--------+\n", - "| 'Pet Shop' | 7 |\n", - "+---------------------------------------------+--------+\n", - "| 'For Hire Vehicle Complaint' | 7135 |\n", - "+---------------------------------------------+--------+\n", - "| 'Boilers' | 46 |\n", - "+---------------------------------------------+--------+\n", - "| 'Building Condition' | 11 |\n", - "+---------------------------------------------+--------+\n", - "| 'Traffic Signal Condition' | 670 |\n", - "+---------------------------------------------+--------+\n", - "| 'Street Sign - Dangling' | 5851 |\n", - "+---------------------------------------------+--------+\n", - "| 'Other Enforcement' | 15 |\n", - "+---------------------------------------------+--------+\n", - "| 'Highway Condition' | 8130 |\n", - "+---------------------------------------------+--------+\n", - "| 'UNSANITARY CONDITION' | 670 |\n", - "+---------------------------------------------+--------+\n", - "| 'Ferry Inquiry' | 1011 |\n", - "+---------------------------------------------+--------+\n", - "| 'HEAT/HOT WATER' | 7952 |\n", - "+---------------------------------------------+--------+\n", - "| 'Graffiti' | 1025 |\n", - "+---------------------------------------------+--------+\n", - "| 'Blocked Driveway' | 158577 |\n", - "+---------------------------------------------+--------+\n", - "| 'Bus Stop Shelter Complaint' | 3369 |\n", - "+---------------------------------------------+--------+\n", - "| 'DOF Parking - Request Status' | 3430 |\n", - "+---------------------------------------------+--------+\n", - "| 'School Maintenance' | 3245 |\n", - 
"+---------------------------------------------+--------+\n", - "| 'ATF' | 2 |\n", - "+---------------------------------------------+--------+\n", - "| 'Case Management Agency Complaint' | 61 |\n", - "+---------------------------------------------+--------+\n", - "| 'Home Repair' | 1659 |\n", - "+---------------------------------------------+--------+\n", - "| 'Rodent' | 475 |\n", - "+---------------------------------------------+--------+\n", - "| 'Lost Property' | 324 |\n", - "+---------------------------------------------+--------+\n", - "| 'OEM Literature Request' | 1103 |\n", - "+---------------------------------------------+--------+\n", - "| 'Industrial Waste' | 23 |\n", - "+---------------------------------------------+--------+\n", - "| 'Public Toilet' | 133 |\n", - "+---------------------------------------------+--------+\n", - "| 'Tunnel Condition' | 12 |\n", - "+---------------------------------------------+--------+\n", - "| 'ELEVATOR' | 11 |\n", - "+---------------------------------------------+--------+\n", - "| 'Violation of Park Rules' | 4185 |\n", - "+---------------------------------------------+--------+\n", - "| 'Taxi Compliment' | 1026 |\n", - "+---------------------------------------------+--------+\n", - "| 'Indoor Air Quality' | 11672 |\n", - "+---------------------------------------------+--------+\n", - "| 'Damaged Tree' | 92930 |\n", - "+---------------------------------------------+--------+\n", - "| 'Noise Survey' | 250 |\n", - "+---------------------------------------------+--------+\n", - "| 'Cranes and Derricks' | 4 |\n", - "+---------------------------------------------+--------+\n", - "| 'Derelict Vehicle' | 30603 |\n", - "+---------------------------------------------+--------+\n", - "| 'Safety' | 1 |\n", - "+---------------------------------------------+--------+\n", - "| 'Elevator' | 371 |\n", - "+---------------------------------------------+--------+\n", - "| 'DOF Property - Property Value' | 110 |\n", - 
"+---------------------------------------------+--------+\n", - "| 'DOF Property - Reduction Issue' | 21365 |\n", - "+---------------------------------------------+--------+\n", - "| 'Illegal Animal Sold' | 72 |\n", - "+---------------------------------------------+--------+\n", - "| 'Highway Sign - Dangling' | 49 |\n", - "+---------------------------------------------+--------+\n", - "| 'Disorderly Youth' | 2078 |\n", - "+---------------------------------------------+--------+\n", - "| 'PLUMBING' | 511 |\n", - "+---------------------------------------------+--------+\n", - "| 'Calorie Labeling' | 17 |\n", - "+---------------------------------------------+--------+\n", - "| 'Housing - Low Income Senior' | 11013 |\n", - "+---------------------------------------------+--------+\n", - "| 'Illegal Tree Damage' | 5255 |\n", - "+---------------------------------------------+--------+\n", - "| 'Found Property' | 219 |\n", - "+---------------------------------------------+--------+\n", - "| 'Municipal Parking Facility' | 165 |\n", - "+---------------------------------------------+--------+\n", - "| 'Missed Collection (All Materials)' | 1351 |\n", - "+---------------------------------------------+--------+\n", - "| 'DOF Parking - DMV Clearance' | 506 |\n", - "+---------------------------------------------+--------+\n", - "| 'Elder Abuse' | 1521 |\n", - "+---------------------------------------------+--------+\n", - "| 'Food Poisoning' | 7287 |\n", - "+---------------------------------------------+--------+\n", - "| 'Broken Muni Meter' | 71519 |\n", - "+---------------------------------------------+--------+\n", - "| 'Mold' | 898 |\n", - "+---------------------------------------------+--------+\n", - "| 'APPLIANCE' | 147 |\n", - "+---------------------------------------------+--------+\n", - "| 'NORC Complaint' | 3 |\n", - "+---------------------------------------------+--------+\n", - "| 'Registration and Transfers' | 7 |\n", - 
"+---------------------------------------------+--------+\n", - "| 'General Construction/Plumbing' | 987 |\n", - "+---------------------------------------------+--------+\n", - "| 'Special Natural Area District (SNAD)' | 2 |\n", - "+---------------------------------------------+--------+\n", - "| 'Construction Safety Enforcement' | 51 |\n", - "+---------------------------------------------+--------+\n", - "| 'Indoor Sewage' | 2475 |\n", - "+---------------------------------------------+--------+\n", - "| 'Building/Use' | 520 |\n", - "+---------------------------------------------+--------+\n", - "| 'DHS Advantage - Tenant' | 12835 |\n", - "+---------------------------------------------+--------+\n", - "| 'Bike/Roller/Skate Chronic' | 1304 |\n", - "+---------------------------------------------+--------+\n", - "| 'Noise - Street/Sidewalk' | 72362 |\n", - "+---------------------------------------------+--------+\n", - "| 'Teaching/Learning/Instruction' | 7 |\n", - "+---------------------------------------------+--------+\n", - "| 'Borough Office' | 21 |\n", - "+---------------------------------------------+--------+\n", - "| 'Drinking Water' | 270 |\n", - "+---------------------------------------------+--------+\n", - "| 'Noise - Park' | 9283 |\n", - "+---------------------------------------------+--------+\n", - "| 'DCA / DOH New License Application Request' | 2301 |\n", - "+---------------------------------------------+--------+\n", - "| 'Non-Residential Heat' | 861 |\n", - "+---------------------------------------------+--------+\n", - "| 'Day Care' | 9 |\n", - "+---------------------------------------------+--------+\n", - "| 'Unsanitary Pigeon Condition' | 537 |\n", - "+---------------------------------------------+--------+\n", - "| 'Air Quality' | 131 |\n", - "+---------------------------------------------+--------+\n", - "| 'Noise - Vehicle' | 41089 |\n", - "+---------------------------------------------+--------+\n", - "| 'Dead Tree' | 26331 |\n", - 
"+---------------------------------------------+--------+\n", - "| 'Dirty Conditions' | 579 |\n", - "+---------------------------------------------+--------+\n", - "| 'Consumer Complaint' | 50527 |\n", - "+---------------------------------------------+--------+\n", - "| 'Homeless Encampment' | 6881 |\n", - "+---------------------------------------------+--------+\n", - "| 'DOF Property - Payment Issue' | 9600 |\n", - "+---------------------------------------------+--------+\n", - "| 'Executive Inspections' | 6 |\n", - "+---------------------------------------------+--------+\n", - "| 'Water Quality' | 107 |\n", - "+---------------------------------------------+--------+\n", - "| 'Smoking' | 1690 |\n", - "+---------------------------------------------+--------+\n", - "| 'Sustainability Enforcement' | 13 |\n", - "+---------------------------------------------+--------+\n", - "| 'DOF Property - City Rebate' | 335 |\n", - "+---------------------------------------------+--------+\n", - "| 'Lead' | 4243 |\n", - "+---------------------------------------------+--------+\n", - "| 'Asbestos' | 1419 |\n", - "+---------------------------------------------+--------+\n", - "| 'Animal Abuse' | 4536 |\n", - "+---------------------------------------------+--------+\n", - "| 'Street Light Condition' | 1090 |\n", - "+---------------------------------------------+--------+\n", - "| 'Root/Sewer/Sidewalk Condition' | 20992 |\n", - "+---------------------------------------------+--------+\n", - "| 'Traffic' | 8982 |\n", - "+---------------------------------------------+--------+\n", - "| 'Street Sign - Damaged' | 22554 |\n", - "+---------------------------------------------+--------+\n", - "| 'DOF Property - Owner Issue' | 20120 |\n", - "+---------------------------------------------+--------+\n", - "| 'Advocate-Personal Exemptions' | 4 |\n", - "+---------------------------------------------+--------+\n", - "| 'Senior Center Complaint' | 790 |\n", - 
"+---------------------------------------------+--------+\n", - "| 'Vacant Lot' | 8 |\n", - "+---------------------------------------------+--------+\n", - "| 'DOF Property - RPIE Issue' | 507 |\n", - "+---------------------------------------------+--------+\n", - "| 'Taxi Report' | 37 |\n", - "+---------------------------------------------+--------+\n", - "| 'Standing Water' | 2427 |\n", - "+---------------------------------------------+--------+\n", - "| 'Derelict Bicycle' | 51 |\n", - "+---------------------------------------------+--------+\n", - "| 'Poison Ivy' | 360 |\n", - "+---------------------------------------------+--------+\n", - "| 'Scaffold Safety' | 3 |\n", - "+---------------------------------------------+--------+\n", - "| 'Maintenance or Facility' | 23440 |\n", - "+---------------------------------------------+--------+\n", - "| 'Ferry Permit' | 51 |\n", - "+---------------------------------------------+--------+\n", - "| 'Snow' | 16 |\n", - "+---------------------------------------------+--------+\n" - ] - } - ], - "source": [ - "ds.aggregateByKey(combine_udf, aggregate_udf, 0, [\"Complaint Type\"]).show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CnkMPDVOXZmo" - }, - "source": [ - "To see what the most common complaint is, let's sort the output:" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": { - "id": "u7JG8-TpXYsS" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:logical planner:logical optimization took 0.003317ms\n", - "INFO:codegen:generating pipeline for 
(Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[i64],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[f64],Option[f64],Option[str]) -> (Option[str],i64) (1 operator pipelined)\n", - "INFO:codegen:generating function combine_udf for (i64,i64) -> i64\n", - "INFO:codegen:generating function aggregate_udf for (i64,(Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[i64],Option[str],Option[i64],Option[i64],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[str],Option[f64],Option[f64],Option[str])) -> i64\n", - "WARNING:history server:Could not register job, is history server running? To disable this warning, set webui=False in the context configuration.\n", - "INFO:global:provided option to explicitly merge bad rows in order back, however rows will be hashed. Disabling option. 
To silence this warning, set\n", - "tuplex.optimizer.mergeExceptionsInOrder=false\n", - "INFO:global:Optimization via LLVM passes took 0.099010 ms\n", - "INFO:global:starting code compilation\n", - "INFO:global:first compile done\n", - "INFO:global:functor Stage_0 retrieved from llvm\n", - "INFO:global:retrieving init/release stage functors\n", - "INFO:global:Compiled code paths for stage 0 in 0.03 ms\n", - "INFO:global:[Transform Stage] Stage 0 compiled to x86 in 0.136291s\n", - "INFO:local ee:split /home/leonhard/projects/2nd-copy/311_subset.csv into 15 tasks\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.346857s (0 normal rows, 61 exceptions, 129 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.350948s (0 normal rows, 31 exceptions, 137 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.269277s (0 normal rows, 32 exceptions, 136 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.268737s (0 normal rows, 62 exceptions, 128 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.267930s (0 normal rows, 46 exceptions, 145 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.292654s (0 normal rows, 33 exceptions, 200 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.274232s (0 normal rows, 60 exceptions, 142 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.295006s (0 normal rows, 48 exceptions, 142 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.269720s (0 normal rows, 41 exceptions, 143 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.279908s (0 normal rows, 1 exception, 138 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.267421s (0 normal rows, 5 exceptions, 140 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash 
table in 0.306931s (0 normal rows, 15 exceptions, 135 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.266008s (0 normal rows, 4 exceptions, 123 buckets)\n", - "INFO:E/1:[Task Finished] Transform to in-memory hash table in 0.280537s (0 normal rows, 37 exceptions, 117 buckets)\n", - "INFO:driver:[Task Finished] Transform to in-memory hash table in 0.438606s (0 normal rows, 34 exceptions, 126 buckets)\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 load&transform tasks in 2.42753s\n", - "INFO:global:[Transform Stage] Stage 0 total wall clock time: 4.47477s, 1907219 input rows, time to process 1 row via fast path: 0.00234623ms\n", - "INFO:global:Exception details: \n", - "+------------+-------------------------------------+-------+\n", - "| OperatorID | Exception | Count |\n", - "+------------+-------------------------------------+-------+\n", - "| 100135 | tuplex.internal.BadParseStringInput | 510 |\n", - "+------------+-------------------------------------+-------+\n", - "INFO:local ee:created combined normal-case result in 0.001441s\n", - "INFO:local ee:compiled pure python pipeline in 0.001211s\n", - "INFO:local ee:creating hybrid intermediates took 0.000004s\n", - "INFO:local ee:Created 15 resolve tasks in 0.000370s\n", - "INFO:local ee:15/15 tasks require executing the slow path.\n", - "INFO:driver:[Task Finished] Resolve in 0.018784s\n", - "INFO:driver:[Task Finished] Resolve in 0.020323s\n", - "INFO:E/1:[Task Finished] Resolve in 0.054611s\n", - "INFO:E/1:[Task Finished] Resolve in 0.025628s\n", - "INFO:E/1:[Task Finished] Resolve in 0.024868s\n", - "INFO:driver:[Task Finished] Resolve in 0.063070s\n", - "INFO:E/1:[Task Finished] Resolve in 0.049932s\n", - "INFO:E/1:[Task Finished] Resolve in 0.013301s\n", - "INFO:driver:[Task Finished] Resolve in 0.051306s\n", - "INFO:E/1:[Task Finished] Resolve in 0.000610s\n", - "INFO:E/1:[Task Finished] Resolve in 0.006864s\n", - "INFO:E/1:[Task Finished] Resolve in 0.003635s\n", 
- "INFO:driver:[Task Finished] Resolve in 0.021057s\n", - "INFO:E/1:[Task Finished] Resolve in 0.028195s\n", - "INFO:driver:[Task Finished] Resolve in 0.019220s\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Housing Options\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Housing - Low Income Senior\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Sidewalk Condition\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Broken Parking Meter\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Housing - Low Income Senior\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Housing Options\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Curb Condition\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Street Condition\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Broken Muni Meter\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=SCRIE\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=SCRIE\n", - "setdefault w. key=SCRIE\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Housing Options\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Food Establishment\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Sidewalk Condition\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Housing Options\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Damaged Tree\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Sidewalk Condition\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Housing - Low Income Senior\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=SCRIE\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Housing - Low Income Senior\n", - "setdefault w. key=Housing Options\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=SCRIE\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Sidewalk Condition\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Broken Muni Meter\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=For Hire Vehicle Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Alzheimer's Care\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=SCRIE\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Housing - Low Income Senior\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Alzheimer's Care\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Housing - Low Income Senior\n", - "setdefault w. key=Housing Options\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Food Establishment\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. 
key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Taxi Complaint\n", - "setdefault w. key=Consumer Complaint\n", - "setdefault w. key=SCRIE\n", - "setdefault w. key=SCRIE\n", - "setdefault w. key=Consumer Complaint\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:local ee:slow path resolved 510/510 exceptions in 0.223588s\n", - "INFO:local ee:slow path for Stage 0: total wall clock time: 0.401403s, time to process 1 row via slow path: 0.787065ms\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 resolve tasks in 0.223618s\n", - "INFO:global:[Transform Stage] Stage 0 completed 15 sink tasks in 0.00194741s\n", - "INFO:global:[Transform Stage] Stage 0 took 2.78945s\n", - "INFO:global:[Transform Stage] skipped stage 1 because there is nothing todo here.\n", - "INFO:global:Query Execution took 2.81643s. (planning: 0.0238355s, execution: 2.7926s)\n", - "INFO:python:Data transfer back to Python took 0.001298 seconds\n" - ] - }, - { - "data": { - "text/plain": [ - "[('Mosquitoes', 4),\n", - " ('DOF Parking - Payment Issue', 19487),\n", - " ('DOF Property - Update Account', 65),\n", - " ('Street Condition', 95585),\n", - " ('Trans Fat', 6)]" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data = ds.aggregateByKey(combine_udf, aggregate_udf, 0, [\"Complaint Type\"]).collect()\n", - "\n", - "sorted(data, key=lambda x: x[1])\n", - "\n", - "data[:5]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pbHtmBoaYqQD" - }, - "source": [ - "As we can see, ?? is the most common complaint." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7_iruiYiiWmd" - }, - "source": [ - "(c) 2017 - 2022 Tuplex team" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UyneYgH5XwQz" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.13" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} From eab21ddc59fcd1a1237e085eba97b6e3b6a54bee Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Sat, 1 Oct 2022 22:38:47 -0400 Subject: [PATCH 12/14] fix --- tuplex/adapters/cpython/include/PythonHelpers.h | 2 ++ tuplex/core/src/ee/local/LocalBackend.cc | 1 + 2 files changed, 3 insertions(+) diff --git a/tuplex/adapters/cpython/include/PythonHelpers.h b/tuplex/adapters/cpython/include/PythonHelpers.h index a3246f2cf..1a93ae7bf 100644 --- a/tuplex/adapters/cpython/include/PythonHelpers.h +++ b/tuplex/adapters/cpython/include/PythonHelpers.h @@ -363,6 +363,8 @@ namespace python { } python::unlockGIL(); } +#else + inline void checkPythonIntegrity() {} #endif /*! diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 8eaa7b9ba..d580d5e8e 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -28,6 +28,7 @@ #include #include #include +#include namespace tuplex { From dbfb75dbef400b809db81b72903d5310d9b3ce6b Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Sun, 2 Oct 2022 14:20:41 -0400 Subject: [PATCH 13/14] register fix, could prob. 
move to lockGIL - but better to do it explicitly --- tuplex/adapters/cpython/include/PythonHelpers.h | 5 +++++ tuplex/adapters/cpython/src/PythonGIL.cc | 9 +++++++++ tuplex/python/include/PythonContext.h | 2 +- tuplex/python/include/PythonDataSet.h | 5 +++-- tuplex/python/include/PythonException.h | 1 + tuplex/python/include/PythonMetrics.h | 2 +- tuplex/python/src/PythonBindings.cc | 4 ++++ tuplex/python/src/PythonCommon.cc | 2 ++ tuplex/python/src/PythonContext.cc | 3 +++ 9 files changed, 29 insertions(+), 4 deletions(-) diff --git a/tuplex/adapters/cpython/include/PythonHelpers.h b/tuplex/adapters/cpython/include/PythonHelpers.h index 1a93ae7bf..7e35c4c3d 100644 --- a/tuplex/adapters/cpython/include/PythonHelpers.h +++ b/tuplex/adapters/cpython/include/PythonHelpers.h @@ -354,6 +354,11 @@ namespace python { */ extern void unlockGIL(); + /*! + * needs to be called if using as C-extension to setup gil etc. MUST BE CALLED FROM MAIN THREAD + */ + extern void registerWithInterpreter(); + #ifndef NDEBUG inline void checkPythonIntegrity() { python::lockGIL(); diff --git a/tuplex/adapters/cpython/src/PythonGIL.cc b/tuplex/adapters/cpython/src/PythonGIL.cc index 497a0943a..cb5109bd4 100644 --- a/tuplex/adapters/cpython/src/PythonGIL.cc +++ b/tuplex/adapters/cpython/src/PythonGIL.cc @@ -79,6 +79,15 @@ namespace python { // vars for python management static std::atomic gilState(nullptr); + void registerWithInterpreter() { + if(!interpreterInitialized) { + interpreterInitialized = true; + gil_main_thread_id = std::this_thread::get_id(); + gil_id = gil_main_thread_id; + gilState = PyGILState_GetThisThreadState(); + } + } + void lockGIL() { gilMutex.lock(); // <-- acquire the managing lock. No other thread can lock the gil! => what if another thread tries to unlock? -> security concern... 
diff --git a/tuplex/python/include/PythonContext.h b/tuplex/python/include/PythonContext.h index de2cf5bc5..399c952a7 100644 --- a/tuplex/python/include/PythonContext.h +++ b/tuplex/python/include/PythonContext.h @@ -150,7 +150,7 @@ namespace tuplex { const std::string& options); explicit PythonContext(const std::string &runtimeLibraryPath) : PythonContext("", runtimeLibraryPath, - "") {} + "") { python::registerWithInterpreter(); } ~PythonContext(); diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 169712493..624a02d3d 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -11,10 +11,11 @@ #ifndef TUPLEX_PYTHONDATASET_H #define TUPLEX_PYTHONDATASET_H -#include "../../core/include/Context.h" +#include #include #include #include "PythonWrappers.h" +#include namespace tuplex { // wrappers hold the actual objects @@ -41,7 +42,7 @@ namespace tuplex { // convert a flat tuple type fast to list of tuples PyObject* simpleTupleToCPython(ResultSet* rs, const python::Type& type, size_t maxRowCount); public: - PythonDataSet(): _dataset(nullptr) {} + PythonDataSet(): _dataset(nullptr) { python::registerWithInterpreter(); } void wrap(DataSet *dataset) { _dataset = dataset; } diff --git a/tuplex/python/include/PythonException.h b/tuplex/python/include/PythonException.h index f05220aa2..1f82bd236 100644 --- a/tuplex/python/include/PythonException.h +++ b/tuplex/python/include/PythonException.h @@ -32,6 +32,7 @@ namespace tuplex { PythonException(const std::string& message, const std::string& data = "") : _message(message), _data(data) { + python::registerWithInterpreter(); } const char *what() const throw() { diff --git a/tuplex/python/include/PythonMetrics.h b/tuplex/python/include/PythonMetrics.h index 111c73c3e..13844e44f 100644 --- a/tuplex/python/include/PythonMetrics.h +++ b/tuplex/python/include/PythonMetrics.h @@ -25,7 +25,7 @@ namespace tuplex { friend class PythonContext; public: - 
PythonMetrics(): _metrics(nullptr) {} + PythonMetrics(): _metrics(nullptr) { python::registerWithInterpreter(); } /*! * wraps JobMetrics object in PythonMetrics object * @param metrics pointer to JobMetrics object diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index 6b3683853..7909e5e8f 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -39,6 +39,8 @@ PYMODULE { m.attr("__version__") = "dev"; #endif + // Note: before constructing any object - call registerWithInterpreter to setup GIL properly! + py::class_(m, "_DataSet") .def("show", &tuplex::PythonDataSet::show) .def("collect", &tuplex::PythonDataSet::collect) @@ -90,4 +92,6 @@ PYMODULE { // global method to register a new logging function m.def("registerLoggingCallback", &tuplex::registerPythonLoggingCallback); + + m.def("registerWithInterpreter", &python::registerWithInterpreter); } \ No newline at end of file diff --git a/tuplex/python/src/PythonCommon.cc b/tuplex/python/src/PythonCommon.cc index 80e8f7e08..affc009cf 100644 --- a/tuplex/python/src/PythonCommon.cc +++ b/tuplex/python/src/PythonCommon.cc @@ -12,6 +12,8 @@ namespace tuplex { py::object registerPythonLoggingCallback(py::object callback_functor) { + python::registerWithInterpreter(); + // get object callback_functor.inc_ref(); auto functor_obj = callback_functor.ptr(); diff --git a/tuplex/python/src/PythonContext.cc b/tuplex/python/src/PythonContext.cc index 65f6ae04f..2d9b11acb 100644 --- a/tuplex/python/src/PythonContext.cc +++ b/tuplex/python/src/PythonContext.cc @@ -1374,6 +1374,9 @@ namespace tuplex { const std::string &runtimeLibraryPath, const std::string& options) : _context(nullptr) { + python::registerWithInterpreter(); + + using namespace std; TUPLEX_TRACE("entering PythonContext"); From 1cbe7448163983a29fc03295b0f22c858ac28833 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Sun, 2 Oct 2022 20:32:55 -0400 Subject: [PATCH 14/14] cleanup --- 
.../adapters/cpython/include/PythonHelpers.h | 17 ------- tuplex/adapters/cpython/src/PythonGIL.cc | 50 +++++-------------- tuplex/core/src/ee/local/LocalBackend.cc | 3 -- tuplex/test/core/AssertAndRaise.cc | 2 +- 4 files changed, 14 insertions(+), 58 deletions(-) diff --git a/tuplex/adapters/cpython/include/PythonHelpers.h b/tuplex/adapters/cpython/include/PythonHelpers.h index 7e35c4c3d..4ed86197e 100644 --- a/tuplex/adapters/cpython/include/PythonHelpers.h +++ b/tuplex/adapters/cpython/include/PythonHelpers.h @@ -359,29 +359,12 @@ namespace python { */ extern void registerWithInterpreter(); -#ifndef NDEBUG - inline void checkPythonIntegrity() { - python::lockGIL(); - if(PyErr_Occurred()) { - std::cerr<<"internal python error"< convert to uint64_t and use this for thread safe access - static std::atomic_int64_t gilID(-1); // id of thread who holds gil - static std::atomic_int64_t interpreterID(-1); // thread which holds the interpreter static std::atomic_bool interpreterInitialized(false); // checks whether interpreter is initialized or not - std::thread::id gil_main_thread_id; + std::thread::id gil_main_thread_id; // id of the main thread. std::thread::id gil_id; // id of the thread holding the gil right now. // vars for python management @@ -106,7 +104,6 @@ namespace python { gil_id = std::this_thread::get_id(); gil = true; gilState = nullptr; - gilID = thisThreadID(); } void unlockGIL() { @@ -120,23 +117,14 @@ namespace python { } gil_id = std::thread::id(); gil = false; - gilID = thisThreadID(); gilMutex.unlock(); } bool holdsGIL() { + // thread holds gil if it is hold in general and thread ids match. return gil && std::this_thread::get_id() == gil_id; } - void acquireGIL() { -// gilMutex.lock(); -// // PyEval_AcquireLock(); -// PyEval_AcquireThread(gilState); // acquires GIL! 
-// gil = true; -// gilID = thisThreadID(); - std::cerr<<"acquire GIL is deprecated"<= 3 && PY_MINOR_VERSION < 7) @@ -153,36 +140,24 @@ namespace python { PyEval_InitThreads(); assert(PyEval_ThreadsInitialized()); #endif - // assume we are calling from python process/shared object - //gilMutex.lock(); - gil_lock = nullptr; // this is the start, we're in the interpreter... - - gil = true; - gilID = interpreterID = thisThreadID(); } else { - // make sure this thread rn holds the GIL! if(!PyGILState_Check()) throw std::runtime_error("when initializing the thread, initInterpreter MUST hold the GIL"); + } - // assume we are calling from python process/shared object - //gilMutex.lock(); - - gil_lock = nullptr; // this is the start, we're in the interpreter... - // acquire and release to initialize, works b.c. single-threaded interpreter... - acquire_lock(); - release_lock(); + gil_lock = nullptr; // this is the start, we're in the interpreter... + // acquire and release to initialize, works b.c. single-threaded interpreter... + acquire_lock(); + release_lock(); + gil = true; - gil = true; - gilID = interpreterID = thisThreadID(); - } gil_id = std::this_thread::get_id(); gilMutex.lock(); interpreterInitialized = true; } void closeInterpreter() { - if(!PyGILState_Check() || !holdsGIL()) throw std::runtime_error("to shutdown interpreter, GIL must be hold the calling thread..."); @@ -196,16 +171,17 @@ namespace python { PyErr_Clear(); } Py_FinalizeEx(); - + // now set to uninitialized. interpreterInitialized = false; - // unlock -// if(gil) -// gilMutex.unlock(); if (gil_lock) { PyThread_free_lock(gil_lock); gil_lock = NULL; } gilMutex.unlock(); + + // reset vars (except main thread id!) 
+ gil = false; + gil_lock = nullptr; } } \ No newline at end of file diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index d580d5e8e..f014e766a 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -1004,9 +1004,6 @@ namespace tuplex { executeSlowPath = true; if(executeSlowPath) { - - python::checkPythonIntegrity(); - // only if functor or python is available, else there is simply no slow path to resolve! if(syms->resolveFunctor || !tstage->purePythonCode().empty()) { using namespace std; diff --git a/tuplex/test/core/AssertAndRaise.cc b/tuplex/test/core/AssertAndRaise.cc index 1299f088a..e41030115 100644 --- a/tuplex/test/core/AssertAndRaise.cc +++ b/tuplex/test/core/AssertAndRaise.cc @@ -20,7 +20,7 @@ using namespace std; TEST_F(AssertAndRaiseTest, Assert) { auto opt = microTestOptions(); opt.set("optimizer.mergeExceptionsInOrder", "true"); - // opt.set("executorCount", "0"); // single-threaded? --> works. multithreaded fails??? + // opt.set("executorCount", "0"); // uncomment to force to single-threaded Context c(opt); auto code = "def f(x):\n"