From f52dc5cf4ba081185bd8dc8f830a754b45514a9b Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Sun, 23 Feb 2025 21:11:10 -0800 Subject: [PATCH 1/8] [DOC] Update version to 0.3.7 (#149) Update version to 0.3.7. Other: - Update macos runner to macos13 (last github action runner to support x86) - Remove packaging of lambda runner in github wheel to reduce size. - Update github artifact action to v4. - Use more recent pybind v2.13.6. - Add patch for snappy to avoid error on macos/cmake when building with ORC support. - Add lxml_html_clean for jupyter/notebook testing. - Print explicitly for collect how many elements are transferred in logger. --- .github/workflows/build_wheels.yml | 76 +++++++++----- README.md | 2 +- azure-pipelines.yml | 4 +- doc/source/conf.py | 2 +- scripts/azure/install_azure_ci_reqs.sh | 4 +- scripts/build_macos_3.11_wheel_with_test.sh | 106 ++++++++++++++++++++ scripts/macos/install_antlr4_cpp_runtime.sh | 4 + scripts/set_version.py | 5 +- setup.py | 11 +- tuplex/cmake/FindLibDwarf.cmake | 4 +- tuplex/historyserver/requirements.txt | 10 +- tuplex/historyserver/thserver/version.py | 4 +- tuplex/io/CMakeLists.txt | 7 ++ tuplex/io/patches/snappy.diff | 13 +++ tuplex/python/CMakeLists.txt | 2 +- tuplex/python/requirements.txt | 3 +- tuplex/python/setup.py | 4 +- tuplex/python/src/PythonDataSet.cc | 6 +- tuplex/python/tests/notebook_utils.py | 8 +- tuplex/python/tests/test_exceptions.py | 12 ++- tuplex/python/tuplex/utils/version.py | 4 +- 21 files changed, 227 insertions(+), 64 deletions(-) create mode 100755 scripts/build_macos_3.11_wheel_with_test.sh create mode 100644 tuplex/io/patches/snappy.diff diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index d9aa77fae..a58e554b1 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -2,45 +2,50 @@ name: Build on: [push, pull_request, workflow_dispatch] +env: + # At least 10.13 is required, to avoid issues and since the runner is macos-13 -> use 13.0, which is Venture from 2022. + MACOSX_DEPLOYMENT_TARGET: 13.0 + jobs: build_wheels: name: Build wheel on ${{ matrix.os }} - py ${{ matrix.python-version }} runs-on: ${{ matrix.os }} strategy: matrix: - os: [ ubuntu-20.04, macos-11 ] + # macos-14 (which is macos-latest) is ARM only. macos-13 is latest intel runner. 
+ os: [ ubuntu-latest, macos-13 ] python-version: ["3.8", "3.9", "3.10", "3.11"] include: - - os: ubuntu-20.04 + - os: ubuntu-latest python-version: "3.8" cibw-build: "cp38-manylinux_x86_64" - - os: ubuntu-20.04 + - os: ubuntu-latest python-version: "3.9" cibw-build: "cp39-manylinux_x86_64" - - os: ubuntu-20.04 + - os: ubuntu-latest python-version: "3.10" cibw-build: "cp310-manylinux_x86_64" - - os: ubuntu-20.04 + - os: ubuntu-latest python-version: "3.11" cibw-build: "cp311-manylinux_x86_64" - - os: macos-11 + - os: macos-13 python-version: "3.8" cibw-build: "cp38-macosx_x86_64" - - os: macos-11 + - os: macos-13 python-version: "3.9" cibw-build: "cp39-macosx_x86_64" - - os: macos-11 + - os: macos-13 python-version: "3.10" cibw-build: "cp310-macosx_x86_64" - - os: macos-11 + - os: macos-13 python-version: "3.11" cibw-build: "cp311-macosx_x86_64" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Cache brew dependencies if: runner.os == 'macOS' - uses: actions/cache@v3.3.2 + uses: actions/cache@v4.2.1 with: # Paths to cache: # /usr/local/Homebrew - installation folder of Homebrew @@ -52,17 +57,18 @@ jobs: /usr/local/Frameworks /usr/local/bin /usr/local/opt - key: macos-11-build-cache-${{ hashFiles('./scripts/macos/brew_dependencies.sh') }}-v2 + key: macos-13-build-cache-${{ hashFiles('./scripts/macos/brew_dependencies.sh') }}-v2 + + - name: Setup python + uses: actions/setup-python@v5 + with: + python-version: '3.11' - # need to make this an intermediate step, i.e. build first the different lambda runners on Ubuntu... - - name: Build Lambda runner (Linux only) - if: runner.os != 'macOS' - run: docker pull registry-1.docker.io/tuplex/ci:${{ matrix.python-version }} && export PYTHON3_VERSION=${{ matrix.python-version }}.0 && bash ./scripts/create_lambda_zip.sh && mkdir -p ./tuplex/python/tuplex/other && cp ./build-lambda/tplxlam.zip ./tuplex/python/tuplex/other - shell: bash + - name: Install cibuildwheel + run: python -m pip install cibuildwheel==2.22.0 - - name: Build wheel - #if: runner.os != 'macOS' - uses: pypa/cibuildwheel@fff9ec32ed25a9c576750c91e06b410ed0c15db7 # hash corresponds to v2.16.2 + - name: Build wheels + run: python -m cibuildwheel --output-dir wheelhouse env: # configure cibuildwheel to build native archs ('auto'), and some # emulated ones @@ -70,23 +76,39 @@ jobs: CIBW_MANYLINUX_X86_64_IMAGE: "registry-1.docker.io/tuplex/ci:${{ matrix.python-version }}" CIBW_BUILD: ${{ matrix.cibw-build }} - # macOS dependencies separate, for linux use docker tuplex/ci:3.x images. + # macOS dependencies separate, for Linux use docker tuplex/ci:3.x images. CIBW_BEFORE_ALL_MACOS: bash ./scripts/macos/install_antlr4_cpp_runtime.sh && bash ./scripts/macos/brew_dependencies.sh && bash ./scripts/macos/install_aws-sdk-cpp.sh && echo 'export PATH="/usr/local/opt/openjdk@11/bin:$PATH"' >> /Users/runner/.bash_profile - # bundle aws runner with linux wheel, remove environment variable TUPLEX_LAMBDA_ZIP to remove runner. - CIBW_ENVIRONMENT_LINUX: "TUPLEX_LAMBDA_ZIP='./tuplex/python/tuplex/other/tplxlam.zip' CMAKE_ARGS='-DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON' LD_LIBRARY_PATH=/usr/local/lib:/opt/lib" + # If CI complains about missing /usr/local/libexec/git-core/git-remote-https: error while loading shared libraries: libssl.so.3: cannot open shared object file: No such file or directory + # the OpenSSL3 lib is stored under /usr/local/lib64. 
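+          # Hence /usr/local/lib64 is added to LD_LIBRARY_PATH below.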
+ CIBW_ENVIRONMENT_LINUX: "CMAKE_ARGS='-DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON' LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib64:/opt/lib" + + # Requires macOS 10.13 at least to build because of C++17 features. + # To avoid issues, simply use 13.0 for now. + CIBW_ENVIRONMENT_MACOS: "MACOSX_DEPLOYMENT_TARGET=${MACOSX_DEPLOYMENT_TARGET} CMAKE_ARGS='-DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON' JAVA_HOME=${JAVA_HOME_11_X64}" - # requires macOS 10.13 at least to build because of C++17 features. - CIBW_ENVIRONMENT_MACOS: "CMAKE_ARGS='-DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON' JAVA_HOME=${JAVA_HOME_11_X64}" # run all python tests to make sure wheels are not defunct CIBW_TEST_REQUIRES: "pytest pytest-timeout numpy nbformat jupyter" + + # Use following test command when segfaults happen to better pinpoint: + # python3 -X faulthandler -m pytest -p no:faulthandler + # else can use pytest ... + # use 3min timeout per test and print top 25 slowest tests - CIBW_TEST_COMMAND: "cd {project} && pytest tuplex/python/tests -v --timeout 600 --durations 25" + CIBW_TEST_COMMAND: "cd {project} && python3 -X faulthandler -m pytest -p no:faulthandler tuplex/python/tests -v --timeout 600 --durations 25" - name: reorganize files run: touch ./scripts/dummy.version && cp ./scripts/*.version ./wheelhouse && cp ./.github/scripts/test_pypi.sh ./wheelhouse - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: + name: wheels-${{ matrix.os }}-${{ matrix.cibw-build }} path: | ./wheelhouse/*.whl + +# Note: when using download-artifact, use +# - uses: actions/download-artifact@v4 +# with: +# path: dist +# merge-multiple: true +# # Requires 4.1 \ No newline at end of file diff --git a/README.md b/README.md index ad8f0a9d7..081d0c109 100644 --- a/README.md +++ b/README.md @@ -133,4 +133,4 @@ series = {SIGMOD/PODS '21} ``` --- -(c) 2017-2023 Tuplex contributors +(c) 2017-2025 Tuplex contributors diff --git a/azure-pipelines.yml b/azure-pipelines.yml index db3664757..4acdcc769 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -22,7 +22,7 @@ jobs: displayName: 'Install MongoDB' - script: sudo bash scripts/azure/install_azure_ci_reqs.sh displayName: 'Install required packages' - - script: sudo apt-get install -y python3-setuptools ninja-build && python3 -m pip install pytest pygments>=2.4.1 MarkupSafe==2.0 pexpect setuptools astor PyYAML jupyter nbformat pymongo eventlet==0.30.0 gunicorn pymongo && jupyter --version + - script: sudo apt-get install -y python3-setuptools ninja-build && python3 -m pip install pytest pygments>=2.4.1 MarkupSafe==2.0 pexpect setuptools astor PyYAML jupyter nbformat pymongo eventlet==0.30.0 gunicorn pymongo "lxml[html_clean]" && jupyter --version displayName: 'Install python dependencies' - script: cd tuplex/python && python3 -m pip install -r requirements.txt && python3 mongodb_test.py && pkill mongod || true displayName: 'Test local MongoDB' @@ -30,5 +30,5 @@ jobs: displayName: 'Build Tuplex' - script: cd build/temp.linux-x86_64-3.10 && ctest --timeout 180 --output-on-failure --repeat until-pass:3 -j 2 displayName: 'C++ tests' - - script: cd build/temp.linux-x86_64-3.10/dist/python && python3.10 -m pytest -x --full-trace -l --log-cli-level=DEBUG --capture=tee-sys + - script: cd build/temp.linux-x86_64-3.10/dist/python && python3.10 -m pip install lxml_html_clean && python3.10 -m pytest -x --full-trace -l --log-cli-level=DEBUG --capture=tee-sys displayName: 'Python tests' diff --git a/doc/source/conf.py b/doc/source/conf.py index 5654ab6df..ab4148721 100755 --- 
a/doc/source/conf.py +++ b/doc/source/conf.py @@ -36,7 +36,7 @@ # The short X.Y version version="0.3" # The full version, including alpha/beta/rc tags -release="0.3.6" +release="0.3.7" # -- General configuration --------------------------------------------------- diff --git a/scripts/azure/install_azure_ci_reqs.sh b/scripts/azure/install_azure_ci_reqs.sh index b196c7e7b..ad9b9ba01 100644 --- a/scripts/azure/install_azure_ci_reqs.sh +++ b/scripts/azure/install_azure_ci_reqs.sh @@ -86,7 +86,7 @@ mkdir -p /root/.ssh/ && mkdir -p ${WORKDIR}/boost # build incl. boost python -pushd ${WORKDIR}/boost && wget https://boostorg.jfrog.io/artifactory/main/release/1.79.0/source/boost_1_79_0.tar.gz && tar xf boost_1_79_0.tar.gz && cd ${WORKDIR}/boost/boost_1_79_0 \ +pushd ${WORKDIR}/boost && wget https://github.com/boostorg/boost/releases/download/boost-1.87.0/boost-1.87.0-b2-nodocs.tar.gz && tar xf boost-1.87.0-b2-nodocs.tar.gz && cd ${WORKDIR}/boost/boost-1.87.0 \ && ./bootstrap.sh --with-python=${PYTHON_EXECUTABLE} --prefix=${PREFIX} --with-libraries="thread,iostreams,regex,system,filesystem,python,stacktrace,atomic,chrono,date_time" \ && ./b2 cxxflags="-fPIC" link=static -j "$(nproc)" \ && ./b2 cxxflags="-fPIC" link=static install && sed -i 's/#if PTHREAD_STACK_MIN > 0/#ifdef PTHREAD_STACK_MIN/g' ${PREFIX}/include/boost/thread/pthread/thread_data.hpp @@ -138,5 +138,5 @@ mkdir -p ${WORKDIR}/pcre2 && cd ${WORKDIR}/pcre2 \ && ./configure CFLAGS="-O2 -fPIC" --prefix=${PREFIX} --enable-jit=auto --disable-shared \ && make -j$(nproc) && make install mkdir -p ${WORKDIR}/protobuf && cd ${WORKDIR}/protobuf && git clone -b v24.3 https://github.com/protocolbuffers/protobuf.git && cd protobuf && git submodule update --init --recursive && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC" -DCMAKE_CXX_STANDARD=17 -Dprotobuf_BUILD_TESTS=OFF .. && make -j$(nproc) && make install && ldconfig -pip3 install 'cloudpickle>2.0.0' cython numpy +pip3 install 'cloudpickle>2.0.0' cython numpy 'lxml[html_clean]' echo ">>> installing reqs done." diff --git a/scripts/build_macos_3.11_wheel_with_test.sh b/scripts/build_macos_3.11_wheel_with_test.sh new file mode 100755 index 000000000..2b99a7450 --- /dev/null +++ b/scripts/build_macos_3.11_wheel_with_test.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash +# (c) 2017-2023 Tuplex team +# builds x86_64 (and arm64) wheels + +# add -x option for verbose output +set -euo pipefail + +function fail { + printf '%s\n' "$1" >&2 + exit "${2-1}" +} + +function detect_instruction_set() { + arch="$(uname -m)" # -i is only linux, -m is linux and apple + if [[ "$arch" = x86_64* ]]; then + if [[ "$(uname -a)" = *ARM64* ]]; then + echo 'arm64' + else + echo 'x86_64' + fi + elif [[ "$arch" = i*86 ]]; then + echo 'x86_32' + elif [[ "$arch" = arm* ]]; then + echo $arch + elif test "$arch" = aarch64; then + echo 'arm64' + else + exit 1 + fi +} + +# check from where script is invoked +CWD="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" +echo " || Tuplex macOS wheel builder || " +echo "-- Executing buildwheel script located in $CWD" + +# check platform is darwin +if [ ! "$(uname -s)" = "Darwin" ]; then + fail "Error: Need to run script under macOS" +fi + +# check which tags are supported +arch=$(detect_instruction_set) +echo "-- Detected arch ${arch}" + +# try to extract version of compiler first via command-line tools or xcode +# either needs to be installed. 
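+# (pkgutil --pkg-info prints a "version:" line for an installed package receipt;
+#  try the Command Line Tools receipt first, then fall back to the full Xcode receipt.)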
+xcode_version_str=$(pkgutil --pkg-info=com.apple.pkg.CLTools_Executables 2>/dev/null | grep version || pkgutil --pkg-info=com.apple.pkg.Xcode | grep version) +echo "-- Detected Xcode ${xcode_version_str}" + +# if no param is given, use defaults to build all +if [ "${arch}" = "arm64" ]; then + # build Python 3.9 - 3.11 + CIBW_BUILD=${CIBW_BUILD-"cp311-macosx_arm64"} +else + # build Python 3.8 - 3.11 + CIBW_BUILD=${CIBW_BUILD-"cp311-macosx_x86_64"} +fi + +echo "-- Building wheels for ${CIBW_BUILD}" + +MACOS_VERSION=$(sw_vers -productVersion) +echo "-- Processing on MacOS ${MACOS_VERSION}" +function version { echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4); }'; } + +MACOS_VERSION_MAJOR=`echo $MACOS_VERSION | cut -d . -f1` + +if [ "$MACOS_VERSION_MAJOR" -ge 11 ]; then + echo "-- Newer MacOS detected (>=11.0), using more recent base target." + echo "-- Using minimum target ${MACOS_VERSION_MAJOR}.0" + MINIMUM_TARGET="${MACOS_VERSION_MAJOR}.0" +else + # keep as is + echo "-- Defaulting build to use as minimum target ${MINIMUM_TARGET}" +fi + +pushd $CWD > /dev/null +cd .. + +# Use 13.0 +MINIMUM_TARGET=13.0 +export MACOSX_DEPLOYMENT_TARGET=${MINIMUM_TARGET} + +# Note: protobuf 3.20 - 3.21.2 is broken for MacOS, do not use those versions +export CIBW_BEFORE_BUILD_MACOS="brew install protobuf coreutils zstd zlib libmagic llvm@16 aws-sdk-cpp pcre2 antlr4-cpp-runtime googletest gflags yaml-cpp celero wget boost ninja snappy libdwarf libelf" + + +# Note: orc build breaks wheel right now... +export CIBW_ENVIRONMENT_MACOS="MACOSX_DEPLOYMENT_TARGET=${MINIMUM_TARGET} CMAKE_ARGS='-DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON' TUPLEX_BUILD_TYPE=RelWithDebInfo" +#export CIBW_ENVIRONMENT_MACOS="MACOSX_DEPLOYMENT_TARGET=${MINIMUM_TARGET} CMAKE_ARGS='-DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON' TUPLEX_BUILD_TYPE=Debug" + +export CIBW_BUILD="${CIBW_BUILD}" +export CIBW_PROJECT_REQUIRES_PYTHON=">=3.8" + +# uncomment to increase verbosity of cibuildwheel +export CIBW_BUILD_VERBOSITY=3 + +# uncomment and set to specific identifier +#export CIBW_BUILD="cp39-macosx_x86_64" + +export CIBW_TEST_REQUIRES="pytest pytest-timeout numpy nbformat jupyter" +export CIBW_TEST_COMMAND="cd {project} && python3 -X faulthandler -m pytest -p no:faulthandler tuplex/python/tests --timeout_method thread --timeout 300 -l -v" + +cibuildwheel --platform macos + +popd diff --git a/scripts/macos/install_antlr4_cpp_runtime.sh b/scripts/macos/install_antlr4_cpp_runtime.sh index 94b7c835f..c786e64d0 100644 --- a/scripts/macos/install_antlr4_cpp_runtime.sh +++ b/scripts/macos/install_antlr4_cpp_runtime.sh @@ -12,6 +12,9 @@ if [ -d "${PREFIX}/include/antlr4-runtime" ]; then exit 0 fi +# use arm64 or x86_64. +ARCH=x86_64 + # if macOS is 10.x -> use this as minimum MINIMUM_TARGET="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.13" @@ -35,6 +38,7 @@ git clone https://github.com/antlr/antlr4.git \ && cd antlr4 && cd runtime && git fetch --all --tags \ && git checkout tags/4.13.1 -b 4.13.1 && cd Cpp/ \ && sed -i '' "s/cmake ./cmake . 
${MINIMUM_TARGET}/g" deploy-macos.sh \ +&& sed -i '' "s/CMAKE_OSX_ARCHITECTURES=\"arm64; x86_64\"/CMAKE_OSX_ARCHITECTURES=\"${ARCH}\"/g" deploy-macos.sh \ && cat deploy-macos.sh \ && ./deploy-macos.sh \ && unzip -l antlr4-cpp-runtime-macos.zip && unzip antlr4-cpp-runtime-macos.zip \ diff --git a/scripts/set_version.py b/scripts/set_version.py index a9bcb27f0..af67324c2 100755 --- a/scripts/set_version.py +++ b/scripts/set_version.py @@ -15,9 +15,8 @@ def LooseVersion(v): parts = v.split('.') return parts - -# to create a testpypi version use X.Y.devN -version = '0.3.6' +# change here and run script within its directory to update versions across the board +version = '0.3.7' # https://pypi.org/simple/tuplex/ # or https://test.pypi.org/simple/tuplex/ diff --git a/setup.py b/setup.py index c3616d281..5d2e86484 100644 --- a/setup.py +++ b/setup.py @@ -66,12 +66,13 @@ def in_google_colab(): 'nbformat<7.0', 'prompt_toolkit>=2.0.7', 'pytest>=5.3.2' +'lxml[html_clean]' ] # Also requires to install MongoDB webui_dependencies = [ - 'Flask>=2.0.2,<2.2.0', - 'Werkzeug<2.2.0', + 'Flask>=3.0', + 'Werkzeug', 'gunicorn', 'eventlet==0.30.0', # newer versions of eventlet have a bug under MacOS 'flask-socketio', @@ -117,8 +118,8 @@ def run_command(cmd, cwd, env): logging.debug('Building dependencies for Google Colab environment') install_dependencies = [ - 'urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1', - 'folium==0.2.1' + 'urllib3!=1.25.0,!=1.25.1,>=1.21.1', + 'folium>=0.2.1' 'requests', 'attrs>=19.2.0', 'dill>=0.2.7.1', @@ -695,7 +696,7 @@ def tplx_package_data(): # logic and declaration, and simpler if you include description/version in a file. setup(name="tuplex", python_requires='>=3.8.0', - version="0.3.6", + version="0.3.7", author="Leonhard Spiegelberg", author_email="tuplex@cs.brown.edu", description="Tuplex is a novel big data analytics framework incorporating a Python UDF compiler based on LLVM " diff --git a/tuplex/cmake/FindLibDwarf.cmake b/tuplex/cmake/FindLibDwarf.cmake index 19827ab99..73fd23dae 100644 --- a/tuplex/cmake/FindLibDwarf.cmake +++ b/tuplex/cmake/FindLibDwarf.cmake @@ -32,8 +32,10 @@ find_path (DWARF_INCLUDE_DIR ${PkgConfig_LibDwarf_INCLUDE_DIRS} /usr/include /usr/include/libdwarf + /usr/include/libdwarf-0 /usr/local/include /usr/local/include/libdwarf + /usr/local/include/libdwarf-0 /opt/local/include /sw/include ENV CPATH) # PATH and INCLUDE will also work @@ -126,4 +128,4 @@ endif() mark_as_advanced(LIBDW_INCLUDE_DIR DWARF_INCLUDE_DIR) mark_as_advanced(LIBDWARF_INCLUDE_DIR LIBDWARF_LIBRARIES) -mark_as_advanced(LIBDWARF_CONST_NAME LIBDWARF_USE_INIT_C) \ No newline at end of file +mark_as_advanced(LIBDWARF_CONST_NAME LIBDWARF_USE_INIT_C) diff --git a/tuplex/historyserver/requirements.txt b/tuplex/historyserver/requirements.txt index aa2fcc311..65e118bed 100644 --- a/tuplex/historyserver/requirements.txt +++ b/tuplex/historyserver/requirements.txt @@ -4,12 +4,12 @@ jedi astor==0.7.1 pandas>=0.23.4 cloudpickle -Werkzeug==2.0.1 +Werkzeug>=2.0.1 flask>=2.0.1 -flask_socketio==4.3.1 -python-socketio==4.6.0 -python-engineio==3.13.2 -flask_pymongo==2.2.0 +flask_socketio>=4.3.1 +python-socketio>=4.6.0 +python-engineio>=3.13.2 +flask_pymongo>=2.2.0 iso8601==0.1.12 dill==0.2.8.2 greenlet>=0.4.15 diff --git a/tuplex/historyserver/thserver/version.py b/tuplex/historyserver/thserver/version.py index 7cb607a85..22182bb91 100644 --- a/tuplex/historyserver/thserver/version.py +++ b/tuplex/historyserver/thserver/version.py @@ -1,2 +1,2 @@ -# (c) L.Spiegelberg 2017 - 2023 -__version__="0.3.6" \ No 
newline at end of file +# (c) L.Spiegelberg 2017 - 2024 +__version__="0.3.7" \ No newline at end of file diff --git a/tuplex/io/CMakeLists.txt b/tuplex/io/CMakeLists.txt index de1af5b52..144dbdd82 100644 --- a/tuplex/io/CMakeLists.txt +++ b/tuplex/io/CMakeLists.txt @@ -38,9 +38,16 @@ if(BUILD_WITH_ORC) set(SNAPPY_STATIC_LIB "${SNAPPY_HOME}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}snappy${CMAKE_STATIC_LIBRARY_SUFFIX}") set(SNAPPY_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${SNAPPY_HOME} -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_LIBDIR=lib -DSNAPPY_BUILD_BENCHMARKS=OFF -DSNAPPY_BUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON) + + # cf. https://gitlab.kitware.com/cmake/cmake/-/issues/17287 + set(PATCH_CMD_FOR_SNAPPY bash -c "patch -p1 < \"${CMAKE_CURRENT_LIST_DIR}/patches/snappy.diff\"") + # To show what the patch looks like, use: + # set(PATCH_CMD_FOR_SNAPPY cat "${CMAKE_CURRENT_LIST_DIR}/patches/snappy.diff") + ExternalProject_Add (snappy_ep URL "https://github.com/google/snappy/archive/${SNAPPY_VERSION}.tar.gz" CMAKE_ARGS ${SNAPPY_CMAKE_ARGS} + PATCH_COMMAND ${PATCH_CMD_FOR_SNAPPY} BUILD_BYPRODUCTS "${SNAPPY_STATIC_LIB}") set(SNAPPY_LIBRARIES ${SNAPPY_STATIC_LIB}) diff --git a/tuplex/io/patches/snappy.diff b/tuplex/io/patches/snappy.diff new file mode 100644 index 000000000..3d87114e9 --- /dev/null +++ b/tuplex/io/patches/snappy.diff @@ -0,0 +1,13 @@ +diff --git a/snappy.cc b/snappy.cc +index d4147185..955ff303 100644 +--- a/snappy.cc ++++ b/snappy.cc +@@ -1290,7 +1290,7 @@ std::pair DecompressBranchless( + DeferMemCopy(&deferred_src, &deferred_length, from, len); + } + } while (ip < ip_limit_min_slop && +- (op + deferred_length) < op_limit_min_slop); ++ (long)(op + deferred_length) < (long)op_limit_min_slop); + exit: + ip--; + assert(ip <= ip_limit); diff --git a/tuplex/python/CMakeLists.txt b/tuplex/python/CMakeLists.txt index 62f4fff74..da912e8e9 100644 --- a/tuplex/python/CMakeLists.txt +++ b/tuplex/python/CMakeLists.txt @@ -18,7 +18,7 @@ message(STATUS "Pybind11 uses python version ${Python3_VERSION}") set(PYBIND11_FINDPYTHON OFF CACHE INTERNAL "") set(PYBIND11_PYTHON_VERSION "${Python3_VERSION}" CACHE INTERNAL "") FetchContent_Declare(pybind11 GIT_REPOSITORY https://github.com/pybind/pybind11 - GIT_TAG v2.11.1 ) + GIT_TAG v2.13.6) FetchContent_GetProperties(pybind11) if(NOT pybind11_POPULATED) FetchContent_Populate(pybind11) diff --git a/tuplex/python/requirements.txt b/tuplex/python/requirements.txt index c7c9a76ee..1c53763a5 100644 --- a/tuplex/python/requirements.txt +++ b/tuplex/python/requirements.txt @@ -1,7 +1,8 @@ nbconvert<7.0 jupyter<7.0 nbformat<7.0 -Werkzeug<2.2.0 +lxml[html_clean] +Werkzeug attrs>=19.2.0 dill>=0.2.7.1 pluggy>=0.6.0, <1.0.0 diff --git a/tuplex/python/setup.py b/tuplex/python/setup.py index 99ad427eb..b59fe4c86 100644 --- a/tuplex/python/setup.py +++ b/tuplex/python/setup.py @@ -29,7 +29,7 @@ setup( name="Tuplex", - version="0.3.6", + version="0.3.7", packages=find_packages(), package_data={ # include libs in libexec @@ -45,7 +45,7 @@ 'nbconvert<7.0', 'jupyter<7.0', 'nbformat<7.0', - 'Werkzeug<2.2.0', + 'Werkzeug', 'attrs>=19.2.0', 'dill>=0.2.7.1', 'pluggy>=0.6.0, <1.0.0', diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 90c2b47b4..92a52e4b4 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -83,7 +83,9 @@ namespace tuplex { // new version, directly interact with the interpreter Timer timer; - Logger::instance().logger("python").info("Converting result-set to CPython objects"); + auto 
output_row_count = rs->rowCount(); + + Logger::instance().logger("python").info("Converting result-set to CPython objects (" + pluralize(output_row_count, "row") + ")"); // build python list object from resultset auto listObj = resultSetToCPython(rs.get(), std::numeric_limits::max()); @@ -102,7 +104,7 @@ namespace tuplex { #endif Logger::instance().logger("python").info("Data transfer back to Python took " - + std::to_string(timer.time()) + " seconds"); + + std::to_string(timer.time()) + " seconds (" + pluralize(PyList_Size(listObj), "element") + ")"); auto list = pybind_list_from_obj(listObj); // Logger::instance().flushAll(); diff --git a/tuplex/python/tests/notebook_utils.py b/tuplex/python/tests/notebook_utils.py index 06fb2b55a..cac5a6eaf 100644 --- a/tuplex/python/tests/notebook_utils.py +++ b/tuplex/python/tests/notebook_utils.py @@ -47,8 +47,12 @@ def notebook_run(path): args = ['jupyter', "nbconvert", "--to", "notebook", "--execute", "--ExecutePreprocessor.timeout=60", "--output", fout.name, path] - subprocess.check_call(args, stderr=subprocess.DEVNULL) - + try: + # use: subprocess.DEVNULL? + subprocess.check_call(args, stderr=subprocess.STDOUT, universal_newlines=True) + except subprocess.CalledProcessError as exc: + logging.error(f"FAIL notebook_run {path}: {exc.returncode} {exc.output}") + raise exc fout.seek(0) nb = nbformat.read(fout, nbformat.current_nbformat) diff --git a/tuplex/python/tests/test_exceptions.py b/tuplex/python/tests/test_exceptions.py index 81ec43222..6ac3d3705 100644 --- a/tuplex/python/tests/test_exceptions.py +++ b/tuplex/python/tests/test_exceptions.py @@ -20,10 +20,13 @@ class TestExceptions: def setup_method(self, method): + + N_PROCESSES = 2 + self.conf = options_for_pytest() - self.conf.update({"tuplex.webui.enable": False, "executorCount": 8, "executorMemory": "256MB", "driverMemory": "256MB", "partitionSize": "256KB", "tuplex.optimizer.mergeExceptionsInOrder": False}) + self.conf.update({"tuplex.webui.enable": False, "executorCount": N_PROCESSES, "executorMemory": "256MB", "driverMemory": "256MB", "partitionSize": "256KB", "tuplex.optimizer.mergeExceptionsInOrder": False}) self.conf_in_order = options_for_pytest() - self.conf_in_order.update({"tuplex.webui.enable": False, "executorCount": 8, "executorMemory": "256MB", "driverMemory": "256MB", "partitionSize": "256KB", "tuplex.optimizer.mergeExceptionsInOrder": True}) + self.conf_in_order.update({"tuplex.webui.enable": False, "executorCount": N_PROCESSES, "executorMemory": "256MB", "driverMemory": "256MB", "partitionSize": "256KB", "tuplex.optimizer.mergeExceptionsInOrder": True}) def assertEqual(self, lhs, rhs): assert lhs == rhs @@ -44,7 +47,7 @@ def test_merge_with_filter(self): output = c.parallelize([-1.1, 1, 2, -2.2, 4, 5, -6.6]).filter(lambda x: x < 0 or x > 3).collect() self.compare_in_order([-1.1, -2.2, 4, 5, -6.6], output) - @pytest.mark.parametrize("n", [1000, 2500]) + @pytest.mark.parametrize("n", [1000, 2500, 5000, 10000]) def test_merge_with_filter(self, n): c = Context(self.conf_in_order) input = list(range(1, n + 1)) @@ -100,8 +103,7 @@ def resolve_udf(x): # for larger partitions, there's a multi-threading issue for this. # need to fix. 
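+        # (Workaround for now: setup_method above caps executorCount at N_PROCESSES = 2.)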
conf = self.conf_in_order - # use this line to force single-threaded - # conf['executorCount'] = 0 + c = Context(conf) output = c.parallelize(input).filter(filter_udf).map(map_udf).resolve(ZeroDivisionError, resolve_udf).collect() diff --git a/tuplex/python/tuplex/utils/version.py b/tuplex/python/tuplex/utils/version.py index 7cb607a85..22182bb91 100644 --- a/tuplex/python/tuplex/utils/version.py +++ b/tuplex/python/tuplex/utils/version.py @@ -1,2 +1,2 @@ -# (c) L.Spiegelberg 2017 - 2023 -__version__="0.3.6" \ No newline at end of file +# (c) L.Spiegelberg 2017 - 2024 +__version__="0.3.7" \ No newline at end of file From 5c31ef16afbf0d4dfaa84ff9a72238d3361d6c90 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Wed, 26 Feb 2025 07:38:24 -0800 Subject: [PATCH 2/8] Deprecate python 3.8, add support for 3.12 and 3.13 (#150) Deprecates python 3.8 (EOL Dec'24), adds support for Python 3.12 and 3.13. Other: - Changes unicode conversion to use `PyUnicode_AsUTF8AndSize` instead of macros. - Adds 3.12 and 3.13 to ubuntu/macos github action CI matrix. - Updates README.md badges with new supported python versions. --- .github/workflows/build_wheels.yml | 22 +++--- README.md | 6 +- scripts/build_linux_wheels.sh | 6 +- scripts/build_linux_wheels_with_test.sh | 8 +-- scripts/build_macos_wheels.sh | 10 +-- scripts/build_macos_wheels_with_test.sh | 6 +- setup.py | 5 +- .../adapters/cpython/include/PythonHelpers.h | 7 ++ tuplex/adapters/cpython/src/PythonHelpers.cc | 11 ++- tuplex/codegen/tools/.gitignore | 1 + tuplex/core/include/HybridHashTable.h | 6 ++ tuplex/historyserver/thserver/version.py | 2 +- tuplex/python/setup.py | 8 --- tuplex/python/src/PythonContext.cc | 67 +++++++++++++++---- tuplex/python/tuplex/utils/version.py | 2 +- 15 files changed, 108 insertions(+), 59 deletions(-) create mode 100644 tuplex/codegen/tools/.gitignore diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index a58e554b1..f005a593d 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -12,13 +12,10 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - # macos-14 (which is macos-latest) is ARM only. macos-13 is latest intel runner. + # macos-14 (which is macos-latest) is ARM only. macos-13 is the latest intel runner. 
os: [ ubuntu-latest, macos-13 ] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] include: - - os: ubuntu-latest - python-version: "3.8" - cibw-build: "cp38-manylinux_x86_64" - os: ubuntu-latest python-version: "3.9" cibw-build: "cp39-manylinux_x86_64" @@ -28,9 +25,12 @@ jobs: - os: ubuntu-latest python-version: "3.11" cibw-build: "cp311-manylinux_x86_64" - - os: macos-13 - python-version: "3.8" - cibw-build: "cp38-macosx_x86_64" + - os: ubuntu-latest + python-version: "3.12" + cibw-build: "cp312-manylinux_x86_64" + - os: ubuntu-latest + python-version: "3.13" + cibw-build: "cp313-manylinux_x86_64" - os: macos-13 python-version: "3.9" cibw-build: "cp39-macosx_x86_64" @@ -40,6 +40,12 @@ jobs: - os: macos-13 python-version: "3.11" cibw-build: "cp311-macosx_x86_64" + - os: macos-13 + python-version: "3.12" + cibw-build: "cp312-macosx_x86_64" + - os: macos-13 + python-version: "3.13" + cibw-build: "cp313-macosx_x86_64" steps: - uses: actions/checkout@v4 diff --git a/README.md b/README.md index 081d0c109..6161b900c 100644 --- a/README.md +++ b/README.md @@ -2,19 +2,15 @@ [![Build Status](https://dev.azure.com/leonhardspiegelberg/Tuplex%20-%20Open%20Source/_apis/build/status/tuplex.tuplex?branchName=master)](https://dev.azure.com/leonhardspiegelberg/Tuplex%20-%20Open%20Source/_build/latest?definitionId=2&branchName=master) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -![Supported python versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue) -[![PyPi Downloads](https://img.shields.io/pypi/dm/tuplex)](https://img.shields.io/pypi/dm/tuplex) +![Supported python versions](https://img.shields.io/badge/3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue) [Website](https://tuplex.cs.brown.edu/) [Documentation](https://tuplex.cs.brown.edu/python-api.html) Tuplex is a parallel big data processing framework that runs data science pipelines written in Python at the speed of compiled code. Tuplex has similar Python APIs to [Apache Spark](https://spark.apache.org/) or [Dask](https://dask.org/), but rather than invoking the Python interpreter, Tuplex generates optimized LLVM bytecode for the given pipeline and input data set. Under the hood, Tuplex is based on data-driven compilation and dual-mode processing, two key techniques that make it possible for Tuplex to provide speed comparable to a pipeline written in hand-optimized C++. -You can join the discussion on Tuplex on our [Gitter community](https://gitter.im/tuplex/community) or read up more on the background of Tuplex in our [SIGMOD'21 paper](https://dl.acm.org/doi/abs/10.1145/3448016.3457244). - Contributions welcome! 
- ### Contents + [Example](#example) + [Quickstart](#quickstart) diff --git a/scripts/build_linux_wheels.sh b/scripts/build_linux_wheels.sh index 9bf5885fb..28f0cf4e1 100755 --- a/scripts/build_linux_wheels.sh +++ b/scripts/build_linux_wheels.sh @@ -39,16 +39,12 @@ fi export CIBW_ENVIRONMENT="TUPLEX_LAMBDA_ZIP='./tuplex/other/tplxlam.zip' CMAKE_ARGS='-DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON' LD_LIBRARY_PATH=/usr/local/lib:/opt/lib" # Use the following line to build only python3.7-3.9 wheel -export CIBW_BUILD="cp3{8,9,10,11}-*" +export CIBW_BUILD="cp3{9,10,11,12,13}-*" export CIBW_ARCHS_LINUX="x86_64" # do not build musllinux yet export CIBW_SKIP="*-musllinux_*" -# to test the others from 3.7-3.9, use these two lines: -#export CIBW_BUILD="cp3{7,8,9}-*" -#export CIBW_SKIP="cp3{5,6,7,8}-macosx* pp*" - export CIBW_BUILD_VERBOSITY=3 export CIBW_PROJECT_REQUIRES_PYTHON=">=3.8" diff --git a/scripts/build_linux_wheels_with_test.sh b/scripts/build_linux_wheels_with_test.sh index 6830bb0b2..a595de289 100755 --- a/scripts/build_linux_wheels_with_test.sh +++ b/scripts/build_linux_wheels_with_test.sh @@ -39,18 +39,14 @@ fi export CIBW_ENVIRONMENT="TUPLEX_LAMBDA_ZIP='./tuplex/other/tplxlam.zip' CMAKE_ARGS='-DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON' LD_LIBRARY_PATH=/usr/local/lib:/opt/lib" # Use the following line to build only python3.7-3.9 wheel -export CIBW_BUILD="cp3{8,9,10,11}-*" +export CIBW_BUILD="cp3{9,10,11,12,13}-*" export CIBW_ARCHS_LINUX="x86_64" # do not build musllinux yet export CIBW_SKIP="*-musllinux_*" -# to test the others from 3.7-3.9, use these two lines: -#export CIBW_BUILD="cp3{7,8,9}-*" -#export CIBW_SKIP="cp3{5,6,7,8}-macosx* pp*" - export CIBW_BUILD_VERBOSITY=3 -export CIBW_PROJECT_REQUIRES_PYTHON=">=3.8" +export CIBW_PROJECT_REQUIRES_PYTHON=">=3.9" # uncomment to increase verbosity of cibuildwheel # export CIBW_BUILD_VERBOSITY=3 diff --git a/scripts/build_macos_wheels.sh b/scripts/build_macos_wheels.sh index 63246f974..2e19c2a47 100755 --- a/scripts/build_macos_wheels.sh +++ b/scripts/build_macos_wheels.sh @@ -50,11 +50,11 @@ echo "-- Detected Xcode ${xcode_version_str}" # if no param is given, use defaults to build all if [ "${arch}" = "arm64" ]; then - # build Python 3.9 - 3.11 - CIBW_BUILD=${CIBW_BUILD-"cp3{9,10,11}-macosx_arm64"} + # build Python 3.9 - 3.13 + CIBW_BUILD=${CIBW_BUILD-"cp3{9,10,11,12,13}-macosx_arm64"} else - # build Python 3.8 - 3.11 - CIBW_BUILD=${CIBW_BUILD-"cp3{8,9,10,11}-macosx_x86_64"} + # build Python 3.9 - 3.13 + CIBW_BUILD=${CIBW_BUILD-"cp3{9,10,11,12,13}-macosx_x86_64"} fi echo "-- Building wheels for ${CIBW_BUILD}" @@ -82,7 +82,7 @@ export CIBW_BEFORE_BUILD_MACOS="brew install protobuf coreutils zstd zlib libmag export CIBW_ENVIRONMENT_MACOS="MACOSX_DEPLOYMENT_TARGET=${MINIMUM_TARGET} CMAKE_ARGS='-DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON -DCMAKE_BUILD_TYPE=Release' CMAKE_BUILD_TYPE=Release" export CIBW_BUILD="${CIBW_BUILD}" -export CIBW_PROJECT_REQUIRES_PYTHON=">=3.8" +export CIBW_PROJECT_REQUIRES_PYTHON=">=3.9" # uncomment to increase verbosity of cibuildwheel export CIBW_BUILD_VERBOSITY=3 diff --git a/scripts/build_macos_wheels_with_test.sh b/scripts/build_macos_wheels_with_test.sh index 6ced7cfcb..c10d55180 100755 --- a/scripts/build_macos_wheels_with_test.sh +++ b/scripts/build_macos_wheels_with_test.sh @@ -51,10 +51,10 @@ echo "-- Detected Xcode ${xcode_version_str}" # if no param is given, use defaults to build all if [ "${arch}" = "arm64" ]; then # build Python 3.9 - 3.11 - CIBW_BUILD=${CIBW_BUILD-"cp3{9,10,11}-macosx_arm64"} + 
CIBW_BUILD=${CIBW_BUILD-"cp3{9,10,11,12,13}-macosx_arm64"} else # build Python 3.8 - 3.11 - CIBW_BUILD=${CIBW_BUILD-"cp3{8,9,10,11}-macosx_x86_64"} + CIBW_BUILD=${CIBW_BUILD-"cp3{9,10,11,12,13}-macosx_x86_64"} fi echo "-- Building wheels for ${CIBW_BUILD}" @@ -92,7 +92,7 @@ export CIBW_ENVIRONMENT_MACOS="MACOSX_DEPLOYMENT_TARGET=${MINIMUM_TARGET} CMAKE_ #export CIBW_ENVIRONMENT_MACOS="MACOSX_DEPLOYMENT_TARGET=${MINIMUM_TARGET} CMAKE_ARGS='-DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON' TUPLEX_BUILD_TYPE=Debug" export CIBW_BUILD="${CIBW_BUILD}" -export CIBW_PROJECT_REQUIRES_PYTHON=">=3.8" +export CIBW_PROJECT_REQUIRES_PYTHON=">=3.9" # uncomment to increase verbosity of cibuildwheel export CIBW_BUILD_VERBOSITY=3 diff --git a/setup.py b/setup.py index 5d2e86484..3d26dada7 100644 --- a/setup.py +++ b/setup.py @@ -695,7 +695,7 @@ def tplx_package_data(): # The information here can also be placed in setup.cfg - better separation of # logic and declaration, and simpler if you include description/version in a file. setup(name="tuplex", - python_requires='>=3.8.0', + python_requires='>=3.9.0', version="0.3.7", author="Leonhard Spiegelberg", author_email="tuplex@cs.brown.edu", @@ -735,10 +735,11 @@ def tplx_package_data(): # Specify the Python versions you support here. In particular, ensure # that you indicate whether you support Python 2, Python 3 or both. - 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', ], scripts=['tuplex/historyserver/bin/tuplex-webui'], project_urls={ diff --git a/tuplex/adapters/cpython/include/PythonHelpers.h b/tuplex/adapters/cpython/include/PythonHelpers.h index 4ed86197e..58389b98c 100644 --- a/tuplex/adapters/cpython/include/PythonHelpers.h +++ b/tuplex/adapters/cpython/include/PythonHelpers.h @@ -319,6 +319,13 @@ namespace python { */ extern void runGC(); + /*! + * checks whether python error is set, if so extracts error and traceback into string and resets + * interpreter's error flag. + * @return + */ + extern std::string extract_and_reset_py_error(); + /*! 
* check whether Python interpreter is running in/available to this process * @return bool when is running else false diff --git a/tuplex/adapters/cpython/src/PythonHelpers.cc b/tuplex/adapters/cpython/src/PythonHelpers.cc index 7f37dd615..f26184989 100644 --- a/tuplex/adapters/cpython/src/PythonHelpers.cc +++ b/tuplex/adapters/cpython/src/PythonHelpers.cc @@ -52,7 +52,7 @@ namespace python { Py_SetPythonHome(&vec[0]); } - void handle_and_throw_py_error() { + std::string extract_and_reset_py_error() { if(PyErr_Occurred()) { PyObject *ptype = NULL, *pvalue = NULL, *ptraceback = NULL; PyErr_Fetch(&ptype,&pvalue,&ptraceback); @@ -107,8 +107,15 @@ namespace python { } Py_XDECREF(lines_obj); - throw std::runtime_error(ss.str()); + return ss.str(); } + return ""; + } + + void handle_and_throw_py_error() { + auto err = extract_and_reset_py_error(); + if(!err.empty()) + throw std::runtime_error(err); } diff --git a/tuplex/codegen/tools/.gitignore b/tuplex/codegen/tools/.gitignore new file mode 100644 index 000000000..77185d1d7 --- /dev/null +++ b/tuplex/codegen/tools/.gitignore @@ -0,0 +1 @@ +antlr-*-complete.jar diff --git a/tuplex/core/include/HybridHashTable.h b/tuplex/core/include/HybridHashTable.h index 55472d04e..c26a9c5c1 100644 --- a/tuplex/core/include/HybridHashTable.h +++ b/tuplex/core/include/HybridHashTable.h @@ -16,6 +16,12 @@ #include #include +// Python 3.13 moved internal APIs from modsupport.h to internal/pycore_modsupport.h +#if PY_MAJOR_VERSION >=3 && PY_MINOR_VERSION >= 13 +#define Py_BUILD_CORE +#include +#endif + namespace tuplex { /*! diff --git a/tuplex/historyserver/thserver/version.py b/tuplex/historyserver/thserver/version.py index 22182bb91..8a14b5846 100644 --- a/tuplex/historyserver/thserver/version.py +++ b/tuplex/historyserver/thserver/version.py @@ -1,2 +1,2 @@ -# (c) L.Spiegelberg 2017 - 2024 +# (c) L.Spiegelberg 2017 - 2025 __version__="0.3.7" \ No newline at end of file diff --git a/tuplex/python/setup.py b/tuplex/python/setup.py index b59fe4c86..cf6b2b100 100644 --- a/tuplex/python/setup.py +++ b/tuplex/python/setup.py @@ -64,12 +64,4 @@ 'iso8601' ], url="https://tuplex.cs.brown.edu" - #, - # project_urls={ - # "Bug Tracker": "https://bugs.example.com/HelloWorld/", - # "Documentation": "https://docs.example.com/HelloWorld/", - # "Source Code": "https://code.example.com/HelloWorld/", - # } - - # could also include long_description, download_url, classifiers, etc. ) diff --git a/tuplex/python/src/PythonContext.cc b/tuplex/python/src/PythonContext.cc index 2d9b11acb..79f9e1ad1 100644 --- a/tuplex/python/src/PythonContext.cc +++ b/tuplex/python/src/PythonContext.cc @@ -256,7 +256,7 @@ namespace tuplex { check = check ? PyTuple_Size(obj) == numTupleElements : false; if(check) { - // it's a tuple with macthing size + // it's a tuple with matching size // first get how many bytes are required size_t requiredBytes = baseRequiredBytes; if(varLenField) { @@ -265,7 +265,21 @@ namespace tuplex { if (typeStr[j] == 's') { auto tupleItem = PyTuple_GET_ITEM(obj, j); if (PyUnicode_Check(tupleItem)) { - requiredBytes += PyUnicode_GET_SIZE(tupleItem) + 1; // +1 for '\0' + // new: + Py_ssize_t utf8str_size = -1; + auto utf8str = PyUnicode_AsUTF8AndSize(tupleItem, &utf8str_size); + requiredBytes += utf8str_size + 1; // +1 for '\0'. + if(utf8str_size == -1 || !utf8str) { + // error happened, translate and create error dataset. 
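+                                // (PyUnicode_AsUTF8AndSize returns nullptr and sets a Python exception on failure.)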
+ auto err= python::extract_and_reset_py_error(); + if(err.empty()) { + err = "PyUnicode_AsUTF8AndSize error, but not python error set."; + } + return _context->makeError(err); + } + + // old: + // requiredBytes += PyUnicode_GET_SIZE(tupleItem) + 1; // +1 for '\0' } else { nonConforming = true; break; @@ -330,18 +344,31 @@ namespace tuplex { if(!PyUnicode_Check(el)) goto bad_element; - auto utf8ptr = PyUnicode_AsUTF8(el); - auto len = PyUnicode_GET_SIZE(el); + // new: + Py_ssize_t utf8str_size = -1; + auto utf8str = PyUnicode_AsUTF8AndSize(el, &utf8str_size); + if(utf8str_size == -1 || !utf8str) { + // error happened, translate and create error dataset. + auto err= python::extract_and_reset_py_error(); + if(err.empty()) { + err = "PyUnicode_AsUTF8AndSize error, but not python error set."; + } + return _context->makeError(err); + } + + // // old: + // auto utf8ptr = PyUnicode_AsUTF8(el); + // auto len = PyUnicode_GET_SIZE(el); - assert(len == strlen(utf8ptr)); - size_t varFieldSize = len + 1; // + 1 for '\0' char! + assert(utf8str_size == strlen(utf8str)); + size_t varFieldSize = utf8str_size + 1; // + 1 for '\0' char! size_t varLenOffset = (numTupleElements + 1 - j) * sizeof(int64_t) + rowVarFieldSizes; // 16 bytes offset int64_t info_field = varLenOffset | (varFieldSize << 32); *((int64_t*)(ptr)) = info_field; // copy string contents - memcpy(ptr + varLenOffset, utf8ptr, len + 1); // +1 for 0 delimiter + memcpy(ptr + varLenOffset, utf8str, utf8str_size + 1); // +1 for 0 delimiter ptr += sizeof(int64_t); // move to next field rowVarFieldSizes += varFieldSize; @@ -502,11 +529,25 @@ namespace tuplex { // (3) is the actual string content (incl. '\0' delimiter) if(PyUnicode_Check(obj)) { - auto len = PyUnicode_GET_SIZE(obj); + // new: + Py_ssize_t utf8str_size = -1; + auto utf8str = PyUnicode_AsUTF8AndSize(obj, &utf8str_size); + if(utf8str_size == -1 || !utf8str) { + // error happened, translate and create error dataset. + auto err= python::extract_and_reset_py_error(); + if(err.empty()) { + err = "PyUnicode_AsUTF8AndSize error, but not python error set."; + } + return _context->makeError(err); + } + + + // // old: + // auto len = PyUnicode_GET_SIZE(obj); + // auto utf8ptr = PyUnicode_AsUTF8(obj); - auto utf8ptr = PyUnicode_AsUTF8(obj); - size_t requiredBytes = sizeof(int64_t) * 2 + len + 1; + size_t requiredBytes = sizeof(int64_t) * 2 + utf8str_size + 1; // check capacity and realloc if necessary get a new partition if(partition->capacity() < numBytesSerialized + requiredBytes) { @@ -525,9 +566,9 @@ namespace tuplex { numBytesSerialized = 0; } - assert(len == strlen(utf8ptr)); + assert(utf8str_size == strlen(utf8str)); - size_t varFieldSize = len + 1; // + 1 for '\0' char! + size_t varFieldSize = utf8str_size + 1; // + 1 for '\0' char! 
                            size_t varLenOffset = 2 * sizeof(int64_t); // 16 bytes offset
                            int64_t info_field = varLenOffset | (varFieldSize << 32);
@@ -535,7 +576,7 @@
                            // after fixed length fields comes total varlen info field
                            *((int64_t*)(ptr + sizeof(int64_t))) = varFieldSize;
                            // copy string contents
-                            memcpy(ptr + sizeof(int64_t) * 2, utf8ptr, len + 1); // +1 for 0 delimiter
+                            memcpy(ptr + sizeof(int64_t) * 2, utf8str, utf8str_size + 1); // +1 for 0 delimiter
                            ptr += requiredBytes;
                            *rawPtr = *rawPtr + 1;
                            numBytesSerialized += requiredBytes;
diff --git a/tuplex/python/tuplex/utils/version.py b/tuplex/python/tuplex/utils/version.py
index 22182bb91..8a14b5846 100644
--- a/tuplex/python/tuplex/utils/version.py
+++ b/tuplex/python/tuplex/utils/version.py
@@ -1,2 +1,2 @@
-# (c) L.Spiegelberg 2017 - 2024
+# (c) L.Spiegelberg 2017 - 2025
 __version__="0.3.7"
\ No newline at end of file

From 5f86a734b819f7aeeebb84c912c2b9c701ac5d3f Mon Sep 17 00:00:00 2001
From: Leonhard Spiegelberg
Date: Wed, 5 Mar 2025 01:56:58 -0500
Subject: [PATCH 3/8] Add arm64 runner to CI (#151)

Add arm64 runner to CI to support Apple Silicon.

Other:
- Fix Levenshtein distance for ARM, which previously only had an SSE-optimized implementation.
---
 .github/workflows/build_wheels.yml          | 50 ++++++++++++++++++---
 README.md                                   |  2 +-
 scripts/macos/install_antlr4_cpp_runtime.sh | 24 ++++++----
 scripts/macos/install_aws-sdk-cpp.sh        | 11 +++--
 tuplex/CMakeLists.txt                       |  2 +-
 tuplex/utils/include/Utils.h                | 39 ++++++++++++++++
 6 files changed, 109 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index f005a593d..2e551f515 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -2,9 +2,9 @@ name: Build

 on: [push, pull_request, workflow_dispatch]

-env:
-  # At least 10.13 is required, to avoid issues and since the runner is macos-13 -> use 13.0, which is Venture from 2022.
-  MACOSX_DEPLOYMENT_TARGET: 13.0
+# For macos, at least 10.13 is required
+# to avoid issues and since the runners are macos-13 and macos-14:
+# -> use 13.0, which is Ventura from 2022, and 14.0 on the arm runners.

 jobs:
   build_wheels:
     name: Build wheel on ${{ matrix.os }} - py ${{ matrix.python-version }}
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
         # macos-14 (which is macos-latest) is ARM only. macos-13 is the latest intel runner.
- os: [ ubuntu-latest, macos-13 ] + os: [ ubuntu-latest, macos-13, macos-14 ] python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] include: - os: ubuntu-latest @@ -34,18 +34,53 @@ jobs: - os: macos-13 python-version: "3.9" cibw-build: "cp39-macosx_x86_64" + macos-target: "13.0" + arch: "x86_64" - os: macos-13 python-version: "3.10" cibw-build: "cp310-macosx_x86_64" + macos-target: "13.0" + arch: "x86_64" - os: macos-13 python-version: "3.11" cibw-build: "cp311-macosx_x86_64" + macos-target: "13.0" + arch: "x86_64" - os: macos-13 python-version: "3.12" cibw-build: "cp312-macosx_x86_64" + macos-target: "13.0" + arch: "x86_64" - os: macos-13 python-version: "3.13" cibw-build: "cp313-macosx_x86_64" + macos-target: "13.0" + arch: "x86_64" + - os: macos-14 + python-version: "3.9" + cibw-build: "cp39-macosx_arm64" + macos-target: "14.0" + arch: "arm64" + - os: macos-14 + python-version: "3.10" + cibw-build: "cp310-macosx_arm64" + macos-target: "14.0" + arch: "arm64" + - os: macos-14 + python-version: "3.11" + cibw-build: "cp311-macosx_arm64" + macos-target: "14.0" + arch: "arm64" + - os: macos-14 + python-version: "3.12" + cibw-build: "cp312-macosx_arm64" + macos-target: "14.0" + arch: "arm64" + - os: macos-14 + python-version: "3.13" + cibw-build: "cp313-macosx_arm64" + macos-target: "14.0" + arch: "arm64" steps: - uses: actions/checkout@v4 @@ -63,7 +98,8 @@ jobs: /usr/local/Frameworks /usr/local/bin /usr/local/opt - key: macos-13-build-cache-${{ hashFiles('./scripts/macos/brew_dependencies.sh') }}-v2 + # macos13 runners are x86, macos14 are arm. --> use os therefore as cache key. + key: ${{ matrix.os }}-build-cache-${{ hashFiles('./scripts/macos/brew_dependencies.sh') }}-v2 - name: Setup python uses: actions/setup-python@v5 @@ -91,7 +127,9 @@ jobs: # Requires macOS 10.13 at least to build because of C++17 features. # To avoid issues, simply use 13.0 for now. - CIBW_ENVIRONMENT_MACOS: "MACOSX_DEPLOYMENT_TARGET=${MACOSX_DEPLOYMENT_TARGET} CMAKE_ARGS='-DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON' JAVA_HOME=${JAVA_HOME_11_X64}" + # Fix for Java home from https://github.com/actions/runner-images/discussions/9266. 
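+          # (JAVA_HOME_11_X64 is set on the intel runners and JAVA_HOME_11_arm64 on the arm64 runners, hence the shell fallback below.)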
+ # For github actions, $HOME is /Users/runner/ + CIBW_ENVIRONMENT_MACOS: "ARCH=${{ matrix.arch }} PREFIX=${HOME}/.local MACOSX_DEPLOYMENT_TARGET=${{ matrix.macos-target }} CMAKE_ARGS='-DCMAKE_PREFIX_PATH=/Users/runner/.local -DCMAKE_MODULE_PATH=/Users/runner/.local/cmake/ -DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON' JAVA_HOME=${JAVA_HOME_11_X64:-$JAVA_HOME_11_arm64}" # run all python tests to make sure wheels are not defunct CIBW_TEST_REQUIRES: "pytest pytest-timeout numpy nbformat jupyter" diff --git a/README.md b/README.md index 6161b900c..79144ff73 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Build Status](https://dev.azure.com/leonhardspiegelberg/Tuplex%20-%20Open%20Source/_apis/build/status/tuplex.tuplex?branchName=master)](https://dev.azure.com/leonhardspiegelberg/Tuplex%20-%20Open%20Source/_build/latest?definitionId=2&branchName=master) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -![Supported python versions](https://img.shields.io/badge/3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue) +![Supported python versions](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue) [Website](https://tuplex.cs.brown.edu/) [Documentation](https://tuplex.cs.brown.edu/python-api.html) diff --git a/scripts/macos/install_antlr4_cpp_runtime.sh b/scripts/macos/install_antlr4_cpp_runtime.sh index c786e64d0..194ae5397 100644 --- a/scripts/macos/install_antlr4_cpp_runtime.sh +++ b/scripts/macos/install_antlr4_cpp_runtime.sh @@ -1,10 +1,12 @@ #!/usr/bin/env bash -# this is a script to install the antlr4 runtime +# This is a script to install the antlr4 runtime. -# specify here target location -PREFIX=/usr/local +set -euxo pipefail -# if antlr4 exists already, skip +# Specify here target location. +PREFIX=${PREFIX:-/usr/local} + +# If antlr4 exists already, skip. [ -d "antlr4" ] && exit 0 if [ -d "${PREFIX}/include/antlr4-runtime" ]; then @@ -13,7 +15,9 @@ if [ -d "${PREFIX}/include/antlr4-runtime" ]; then fi # use arm64 or x86_64. -ARCH=x86_64 +ARCH=${ARCH:-x86_64} + +echo ">>> Building for architecture: ${ARCH}" # if macOS is 10.x -> use this as minimum MINIMUM_TARGET="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.13" @@ -33,6 +37,10 @@ else echo "defaulting build to use as minimum target ${MINIMUM_TARGET}" fi +# Ensure $PREFIX/{lib,include} exist. +mkdir -p $PREFIX/include +mkdir -p $PREFIX/lib + # with sed, modify deploy to add osx_deployment_target git clone https://github.com/antlr/antlr4.git \ && cd antlr4 && cd runtime && git fetch --all --tags \ @@ -44,11 +52,11 @@ git clone https://github.com/antlr/antlr4.git \ && unzip -l antlr4-cpp-runtime-macos.zip && unzip antlr4-cpp-runtime-macos.zip \ && cd lib && cp -R * $PREFIX/lib/ && cd .. \ && mv antlr4-runtime $PREFIX/include/ \ -&& echo "ANTLR4 Cpp runtime installed to $PREFIX" +&& echo "ANTLR4 Cpp runtime installed to $PREFIX." # execute copy command (fix for delocate wheel) -ls -l /usr/local/include -ls -l /usr/local/lib +ls -l $PREFIX/include +ls -l $PREFIX/lib cp lib/libantlr4-runtime.dylib /Users/runner/work/tuplex/tuplex/libantlr4-runtime.dylib diff --git a/scripts/macos/install_aws-sdk-cpp.sh b/scripts/macos/install_aws-sdk-cpp.sh index 16cd47b6f..2e6aa7a29 100755 --- a/scripts/macos/install_aws-sdk-cpp.sh +++ b/scripts/macos/install_aws-sdk-cpp.sh @@ -1,7 +1,12 @@ #!/usr/bin/env bash +set -euxo pipefail + +PREFIX=${PREFIX:-/usr/local} +AWSSDK_CPP_VERSION=1.11.164 + # check if dir exists (i.e. 
restored from cache, then skip)
-if [ -d "/usr/local/include/aws" ]; then
+if [ -d "${PREFIX}/include/aws" ]; then
   echo ">> Skip aws-sdk-cpp compile from source, already exists."
   exit 0
 fi
@@ -28,8 +33,8 @@ fi
 cd /tmp &&
   git clone --recurse-submodules https://github.com/aws/aws-sdk-cpp.git &&
-  cd aws-sdk-cpp && git checkout tags/1.11.164 && mkdir build && pushd build &&
-  cmake ${MINIMUM_TARGET} -DCMAKE_BUILD_TYPE=Release -DUSE_OPENSSL=ON -DENABLE_TESTING=OFF -DENABLE_UNITY_BUILD=ON -DCPP_STANDARD=17 -DBUILD_SHARED_LIBS=OFF -DBUILD_ONLY="s3;core;lambda;transfer" .. &&
+  cd aws-sdk-cpp && git checkout tags/${AWSSDK_CPP_VERSION} && mkdir build && pushd build &&
+  cmake ${MINIMUM_TARGET} -DCMAKE_INSTALL_PREFIX=${PREFIX} -DCMAKE_BUILD_TYPE=Release -DUSE_OPENSSL=ON -DENABLE_TESTING=OFF -DENABLE_UNITY_BUILD=ON -DCPP_STANDARD=17 -DBUILD_SHARED_LIBS=OFF -DBUILD_ONLY="s3;core;lambda;transfer" .. &&
   make -j${CPU_CORES} &&
   make install &&
   popd &&
diff --git a/tuplex/CMakeLists.txt b/tuplex/CMakeLists.txt
index 480b38736..1da90748b 100755
--- a/tuplex/CMakeLists.txt
+++ b/tuplex/CMakeLists.txt
@@ -92,7 +92,7 @@ endif()
 # uncomment to get verbose cmake output
 # set(CMAKE_VERBOSE_MAKEFILE ON)
-message(STATUS "additional cmake module path is ${CMAKE_MODULE_PATH}")
+message(STATUS "Additional cmake module path is: ${CMAKE_MODULE_PATH}")
 include("${CMAKE_SOURCE_DIR}/cmake/ucm.cmake") #handy package to manipulate compiler flags
 include("${CMAKE_SOURCE_DIR}/cmake/CPM.cmake") # package manager from https://github.com/cpm-cmake/CPM.cmake
 # for debug mode export all symbols
diff --git a/tuplex/utils/include/Utils.h b/tuplex/utils/include/Utils.h
index 1673d44a7..87f05ca57 100644
--- a/tuplex/utils/include/Utils.h
+++ b/tuplex/utils/include/Utils.h
@@ -41,7 +41,44 @@ namespace std {
 }
 #endif

+#if (defined __x86_64__)
 #include "third_party/levenshtein-sse.h"
+#elif (defined __arm64__)
+namespace tuplex {
+    // native C++ implementation for now (ARM)
+    inline size_t levenshtein(const std::string& word1, const std::string& word2) {
+        int size1 = word1.size();
+        int size2 = word2.size();
+        int memo[size1 + 1][size2 + 1];
+
+        // If one of the words has zero length, the distance is equal to the size of the other word.
+        if (size1 == 0)
+            return size2;
+        if (size2 == 0)
+            return size1;
+
+        // Init step.
+        for (int i = 0; i <= size1; i++)
+            memo[i][0] = i;
+        for (int j = 0; j <= size2; j++)
+            memo[0][j] = j;
+
+        // DP step.
+        for (int i = 1; i <= size1; i++) {
+            for (int j = 1; j <= size2; j++) {
+                auto cost = (word2[j - 1] == word1[i - 1]) ? 0 : 1;
+
+                memo[i][j] = std::min(
+                        std::min(memo[i - 1][j] + 1, memo[i][j - 1] + 1),
+                        memo[i - 1][j - 1] + cost
+                );
+            }
+        }
+        return memo[size1][size2];
+    }
+}
+
+#endif

 // helper code to allow tuples in maps.
 #include

@@ -468,7 +505,9 @@ namespace tuplex {
      */
     inline int fuzzyMatch(const std::string& needle, const std::vector<std::string>& dictionary) {
         using namespace std;
+#if (defined __x86_64__)
         using namespace levenshteinSSE;
+#endif

         if(dictionary.empty())
             return -1;

From 04b0b4435ef14bee0ab1ad1365badd055de5534c Mon Sep 17 00:00:00 2001
From: Leonhard Spiegelberg
Date: Sat, 8 Mar 2025 14:05:55 -0800
Subject: [PATCH 4/8] Add precommit with ruff formatting (#152)

Adds precommit with ruff formatting for python package.

Other:
- Fixes a bug in WorkQueue where, in multi-threaded scenarios, the wait condition on `_numPendingTasks` could be met before a task was added to the completed storage, resulting in empty results being returned.
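
  To illustrate the race, here is a minimal Python sketch (hypothetical
  stand-in names, not the actual C++ WorkQueue code): if the pending counter
  is decremented before the completed task is published, a waiter can wake up
  and read an empty result list.

      import threading

      completed = []   # stands in for _completedTasks
      pending = 1      # stands in for _numPendingTasks
      cond = threading.Condition()

      def finish_task_buggy(result):
          global pending
          with cond:
              pending -= 1            # a waiter may wake up here ...
              cond.notify_all()
          completed.append(result)    # ... before the result is published

      def finish_task_fixed(result):
          global pending
          completed.append(result)    # publish the result first
          with cond:
              pending -= 1            # only then release any waiter
              cond.notify_all()

      def wait_all():
          with cond:
              cond.wait_for(lambda: pending == 0)
          return list(completed)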
--- .pre-commit-config.yaml | 23 + pyproject.toml | 4 + tuplex/core/src/Executor.cc | 8 +- tuplex/python/tuplex/__init__.py | 34 +- tuplex/python/tuplex/context.py | 233 +++++--- tuplex/python/tuplex/dataset.py | 356 +++++++----- tuplex/python/tuplex/distributed.py | 377 ++++++++----- tuplex/python/tuplex/exceptions.py | 118 ++-- tuplex/python/tuplex/libexec/__init__.py | 4 +- tuplex/python/tuplex/libexec/_tuplex.py | 4 +- tuplex/python/tuplex/metrics.py | 13 +- tuplex/python/tuplex/repl/__init__.py | 35 +- tuplex/python/tuplex/utils/__init__.py | 4 +- tuplex/python/tuplex/utils/common.py | 505 +++++++++++------- tuplex/python/tuplex/utils/errors.py | 8 +- tuplex/python/tuplex/utils/framework.py | 9 +- tuplex/python/tuplex/utils/globs.py | 67 +-- .../python/tuplex/utils/interactive_shell.py | 105 ++-- tuplex/python/tuplex/utils/jedi_completer.py | 30 +- tuplex/python/tuplex/utils/jupyter.py | 44 +- tuplex/python/tuplex/utils/reflection.py | 121 +++-- tuplex/python/tuplex/utils/source_vault.py | 172 +++--- tuplex/python/tuplex/utils/tracebacks.py | 40 +- tuplex/python/tuplex/utils/version.py | 2 +- 24 files changed, 1418 insertions(+), 898 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..0d6b6778a --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,23 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + files: ^tuplex/python/tuplex.*\.py$ + - id: end-of-file-fixer + files: ^tuplex/python/tuplex.*\.py$ + - id: trailing-whitespace + files: ^tuplex/python/tuplex.*\.py$ +- repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.9.9 + hooks: + # Run the linter. + - id: ruff + files: ^tuplex/python/tuplex.*\.py$ + args: [ "--fix", "--select", "I" ] + types_or: [ python, pyi ] + # Run the formatter. + - id: ruff-format + files: ^tuplex/python/tuplex.*\.py$ + types_or: [ python, pyi ] \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index aefc4e5dc..dc7fe4af5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,3 +12,7 @@ requires = [ "requests" ] build-backend = "setuptools.build_meta" + + +[tool.ruff] +include = ["pyproject.toml", "tuplex/python/tuplex/**/*.py"] diff --git a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc index 5078d4bc2..7b74a9937 100644 --- a/tuplex/core/src/Executor.cc +++ b/tuplex/core/src/Executor.cc @@ -104,15 +104,17 @@ namespace tuplex { // save which thread executed this task task->setID(std::this_thread::get_id()); - _numPendingTasks.fetch_add(-1, std::memory_order_release); - - // add task to done list + // Add task to done list, execute before decreasing pending task. TRACE_LOCK("completedTasks"); _completedTasksMutex.lock(); _completedTasks.push_back(std::move(task)); _completedTasksMutex.unlock(); _numCompletedTasks.fetch_add(1, std::memory_order_release); TRACE_UNLOCK("completedTasks"); + + // This needs to come last, because other threads may be waiting on it. 
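+                    // (Otherwise a waiter can observe _numPendingTasks == 0 while the task is not yet visible in _completedTasks, returning empty results.)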
+ _numPendingTasks.fetch_add(-1, std::memory_order_release); + return true; } } else { diff --git a/tuplex/python/tuplex/__init__.py b/tuplex/python/tuplex/__init__.py index ee06cd764..ad8d14b5e 100644 --- a/tuplex/python/tuplex/__init__.py +++ b/tuplex/python/tuplex/__init__.py @@ -9,29 +9,35 @@ # License: Apache 2.0 # # ----------------------------------------------------------------------------------------------------------------------# -from tuplex.repl import * -from .context import Context -from .dataset import DataSet +import logging # expose aws setup for better convenience import tuplex.distributed -import logging -from tuplex.distributed import setup_aws - +from tuplex.distributed import setup_aws as setup_aws +from tuplex.repl import in_google_colab as in_google_colab +from tuplex.repl import in_jupyter_notebook as in_jupyter_notebook from tuplex.utils.version import __version__ as __version__ +from .context import Context +from .dataset import DataSet as DataSet + + # for convenience create a dummy function to return a default-configured Lambda context def LambdaContext(conf=None, name=None, s3_scratch_dir=None, **kwargs): import uuid if s3_scratch_dir is None: s3_scratch_dir = tuplex.distributed.default_scratch_dir() - logging.debug('Detected default S3 scratch dir for this user as {}'.format(s3_scratch_dir)) + logging.debug( + "Detected default S3 scratch dir for this user as {}".format(s3_scratch_dir) + ) - lambda_conf = {'backend': 'lambda', - 'partitionSize': '1MB', - 'aws.scratchDir': s3_scratch_dir, - 'aws.requesterPay': True} + lambda_conf = { + "backend": "lambda", + "partitionSize": "1MB", + "aws.scratchDir": s3_scratch_dir, + "aws.requesterPay": True, + } if conf: lambda_conf.update(conf) @@ -40,13 +46,13 @@ def LambdaContext(conf=None, name=None, s3_scratch_dir=None, **kwargs): for k, v in kwargs.items(): if k in conf.keys(): lambda_conf[k] = v - elif 'tuplex.' + k in conf.keys(): - lambda_conf['tuplex.' + k] = v + elif "tuplex." + k in conf.keys(): + lambda_conf["tuplex." + k] = v else: lambda_conf[k] = v if name is None: - name = 'AWSLambdaContext-' + str(uuid.uuid4())[:8] + name = "AWSLambdaContext-" + str(uuid.uuid4())[:8] # There's currently a bug in the Lambda backend when transferring local data to S3: The full partition # gets transferred, not just what is needed. 
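
For reference, a usage sketch for the LambdaContext wrapper above (illustrative only: the scratch URI is a placeholder, `partitionSize` is one of the defaults the wrapper sets, and conf entries are merged over those defaults, with bare kwargs normalized to their `tuplex.`-prefixed form where one exists):

from tuplex import LambdaContext

# Hypothetical invocation; assumes AWS credentials and the Lambda backend are set up.
c = LambdaContext(
    conf={"partitionSize": "2MB"},  # overrides the wrapper's 1MB default
    s3_scratch_dir="s3://my-tuplex-bucket/scratch",  # placeholder; skips auto-detection
)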
diff --git a/tuplex/python/tuplex/context.py b/tuplex/python/tuplex/context.py index f92a5ddee..46763b72d 100644 --- a/tuplex/python/tuplex/context.py +++ b/tuplex/python/tuplex/context.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,27 +7,44 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# import logging try: - from .libexec.tuplex import _Context, _DataSet, getDefaultOptionsAsJSON + from .libexec.tuplex import _Context, getDefaultOptionsAsJSON except ModuleNotFoundError as e: logging.error("need to compiled Tuplex first, details: {}".format(e)) -from .dataset import DataSet -import os import glob +import json +import os import sys -import cloudpickle -from tuplex.utils.common import flatten_dict, load_conf_yaml, stringify_dict, unflatten_dict, save_conf_yaml, in_jupyter_notebook, in_google_colab, is_in_interactive_mode, current_user, is_shared_lib, host_name, ensure_webui, pythonize_options, logging_callback, registerLoggingCallback import uuid -import json + +from tuplex.utils.common import ( + current_user, + ensure_webui, + flatten_dict, + host_name, + in_google_colab, + in_jupyter_notebook, + is_in_interactive_mode, + is_shared_lib, + load_conf_yaml, + logging_callback, + pythonize_options, + registerLoggingCallback, + save_conf_yaml, + stringify_dict, + unflatten_dict, +) + +from .dataset import DataSet from .metrics import Metrics -class Context: +class Context: def __init__(self, conf=None, name="", **kwargs): r"""creates new Context object, the main entry point for all operations with the Tuplex big data framework @@ -81,8 +98,10 @@ def __init__(self, conf=None, name="", **kwargs): only serializes data that is required within the pipeline. """ - runtime_path = os.path.join(os.path.dirname(__file__), 'libexec', 'tuplex_runtime') - paths = glob.glob(runtime_path + '*') + runtime_path = os.path.join( + os.path.dirname(__file__), "libexec", "tuplex_runtime" + ) + paths = glob.glob(runtime_path + "*") if len(paths) != 1: # filter based on type (runtime must be shared object!) @@ -90,9 +109,15 @@ def __init__(self, conf=None, name="", **kwargs): if len(paths) != 1: if len(paths) == 0: - logging.error("found no tuplex runtime (tuplex_runtime.so). Faulty installation?") + logging.error( + "found no tuplex runtime (tuplex_runtime.so). Faulty installation?" + ) else: - logging.error('found following candidates for tuplex runtime:\n{}, please specify which to use.'.format(paths)) + logging.error( + "found following candidates for tuplex runtime:\n{}, please specify which to use.".format( + paths + ) + ) sys.exit(1) # pass configuration options @@ -102,17 +127,17 @@ def __init__(self, conf=None, name="", **kwargs): # put meaningful defaults for special environments... 
# per default disable webui - options['tuplex.webui.enable'] = False + options["tuplex.webui.enable"] = False if in_google_colab(): - logging.debug('Detected Google Colab environment, adjusting options...') + logging.debug("Detected Google Colab environment, adjusting options...") # do not use a lot of memory, restrict... - options['tuplex.driverMemory'] = '64MB' - options['tuplex.executorMemory'] = '64MB' - options['tuplex.inputSplitSize'] = '16MB' - options['tuplex.partitionSize'] = '4MB' - options['tuplex.runTimeMemory'] = '16MB' - options['tuplex.webui.enable'] = False + options["tuplex.driverMemory"] = "64MB" + options["tuplex.executorMemory"] = "64MB" + options["tuplex.inputSplitSize"] = "16MB" + options["tuplex.partitionSize"] = "4MB" + options["tuplex.runTimeMemory"] = "16MB" + options["tuplex.webui.enable"] = False if conf: if isinstance(conf, str): @@ -129,55 +154,59 @@ def __init__(self, conf=None, name="", **kwargs): options = stringify_dict(options) user = current_user() - name = name if len(name) > 0 else 'context' + str(uuid.uuid4())[:8] - mode = 'file' + name = name if len(name) > 0 else "context" + str(uuid.uuid4())[:8] + mode = "file" if is_in_interactive_mode(): - mode = 'shell' + mode = "shell" if in_jupyter_notebook(): - mode = 'jupyter' + mode = "jupyter" if in_google_colab(): - mode = 'colab' + mode = "colab" host = host_name() # pass above options as env.user, ... # also pass runtime path like that - options['tuplex.env.user'] = str(user) - options['tuplex.env.hostname'] = str(host) - options['tuplex.env.mode'] = str(mode) + options["tuplex.env.user"] = str(user) + options["tuplex.env.hostname"] = str(host) + options["tuplex.env.mode"] = str(mode) # update runtime path according to user - if 'tuplex.runTimeLibrary' in options: - runtime_path = options['tuplex.runTimeLibrary'] + if "tuplex.runTimeLibrary" in options: + runtime_path = options["tuplex.runTimeLibrary"] # normalize keys to be of format tuplex. supported_keys = json.loads(getDefaultOptionsAsJSON()).keys() key_set = set(options.keys()) for k in key_set: - if k not in supported_keys and 'tuplex.' + k in supported_keys: - options['tuplex.' + k] = options[k] + if k not in supported_keys and "tuplex." + k in supported_keys: + options["tuplex." + k] = options[k] # check if redirect to python logging module should happen or not - if 'tuplex.redirectToPythonLogging' in options.keys(): + if "tuplex.redirectToPythonLogging" in options.keys(): py_opts = pythonize_options(options) - if py_opts['tuplex.redirectToPythonLogging']: - logging.info('Redirecting C++ logging to Python') + if py_opts["tuplex.redirectToPythonLogging"]: + logging.info("Redirecting C++ logging to Python") registerLoggingCallback(logging_callback) else: # check what default options say defaults = pythonize_options(json.loads(getDefaultOptionsAsJSON())) - if defaults['tuplex.redirectToPythonLogging']: - logging.info('Redirecting C++ logging to Python') + if defaults["tuplex.redirectToPythonLogging"]: + logging.info("Redirecting C++ logging to Python") registerLoggingCallback(logging_callback) # autostart mongodb & history server if they are not running yet... # deactivate webui for google colab per default - if 'tuplex.webui.enable' not in options: + if "tuplex.webui.enable" not in options: # for google colab env, disable webui per default. if in_google_colab(): - options['tuplex.webui.enable'] = False + options["tuplex.webui.enable"] = False # fetch default options for webui ... 
- webui_options = {k: v for k, v in json.loads(getDefaultOptionsAsJSON()).items() if 'webui' in k or 'scratch' in k} + webui_options = { + k: v + for k, v in json.loads(getDefaultOptionsAsJSON()).items() + if "webui" in k or "scratch" in k + } # update only non-existing options! for k, v in webui_options.items(): @@ -187,27 +216,27 @@ def __init__(self, conf=None, name="", **kwargs): # pythonize options = pythonize_options(options) - if options['tuplex.webui.enable']: + if options["tuplex.webui.enable"]: ensure_webui(options) # last arg are the options as json string serialized b.c. of boost python problems # because webui=False/True is convenient, pass it as well to tuplex options - if 'tuplex.webui' in options.keys(): - options['tuplex.webui.enable'] = options['tuplex.webui'] - del options['tuplex.webui'] - if 'webui' in options.keys(): - options['tuplex.webui.enable'] = options['webui'] - del options['webui'] + if "tuplex.webui" in options.keys(): + options["tuplex.webui.enable"] = options["tuplex.webui"] + del options["tuplex.webui"] + if "webui" in options.keys(): + options["tuplex.webui.enable"] = options["webui"] + del options["webui"] # last arg are the options as json string serialized b.c. of boost python problems self._context = _Context(name, runtime_path, json.dumps(options)) python_metrics = self._context.getMetrics() - assert python_metrics, 'internal error: metrics object should be valid' + assert python_metrics, "internal error: metrics object should be valid" self.metrics = Metrics(python_metrics) assert self.metrics def parallelize(self, value_list, columns=None, schema=None, auto_unpack=True): - """ passes data to the Tuplex framework. Must be a list of primitive objects (e.g. of type bool, int, float, str) or + """passes data to the Tuplex framework. Must be a list of primitive objects (e.g. of type bool, int, float, str) or a list of (nested) tuples of these types. Args: @@ -229,20 +258,30 @@ def parallelize(self, value_list, columns=None, schema=None, auto_unpack=True): num_cols = 1 if isinstance(value_list[0], (list, tuple)): num_cols = len(value_list[0]) - cols = ['column{}'.format(i) for i in range(num_cols)] + cols = ["column{}".format(i) for i in range(num_cols)] else: cols = columns for col in cols: - assert isinstance(col, str), 'element {} must be a string'.format(col) - + assert isinstance(col, str), "element {} must be a string".format(col) ds = DataSet() - ds._dataSet = self._context.parallelize(value_list, columns, schema, auto_unpack) + ds._dataSet = self._context.parallelize( + value_list, columns, schema, auto_unpack + ) return ds - def csv(self, pattern, columns=None, header=None, delimiter=None, quotechar='"', null_values=[''], type_hints={}): - """ reads csv (comma separated values) files. This function may either be provided with + def csv( + self, + pattern, + columns=None, + header=None, + delimiter=None, + quotechar='"', + null_values=[""], + type_hints={}, + ): + """reads csv (comma separated values) files. This function may either be provided with parameters that help to determine the delimiter, whether a header present or what kind of quote char is used. Overall, CSV parsing is done according to the RFC-4180 standard (cf. 
https://tools.ietf.org/html/rfc4180) @@ -274,27 +313,41 @@ def csv(self, pattern, columns=None, header=None, delimiter=None, quotechar='"', if not null_values: null_values = [] - assert isinstance(pattern, str), 'file pattern must be given as str' - assert isinstance(columns, list) or columns is None, 'columns must be a list or None' - assert isinstance(delimiter, str) or delimiter is None, 'delimiter must be given as , or None for auto detection' - assert isinstance(header, bool) or header is None, 'header must be given as bool or None for auto detection' - assert isinstance(quotechar, str), 'quote char must be given as str' - assert isinstance(null_values, list), 'null_values must be a list of strings representing null values' - assert isinstance(type_hints, dict), 'type_hints must be a dictionary mapping index to type hint' # TODO: update with other options + assert isinstance(pattern, str), "file pattern must be given as str" + assert isinstance(columns, list) or columns is None, ( + "columns must be a list or None" + ) + assert isinstance(delimiter, str) or delimiter is None, ( + "delimiter must be given as , or None for auto detection" + ) + assert isinstance(header, bool) or header is None, ( + "header must be given as bool or None for auto detection" + ) + assert isinstance(quotechar, str), "quote char must be given as str" + assert isinstance(null_values, list), ( + "null_values must be a list of strings representing null values" + ) + assert isinstance(type_hints, dict), ( + "type_hints must be a dictionary mapping index to type hint" + ) # TODO: update with other options if delimiter: - assert len(delimiter) == 1, 'delimiter can only exist out of a single character' - assert len(quotechar) == 1, 'quotechar can only be a single character' + assert len(delimiter) == 1, ( + "delimiter can only exist out of a single character" + ) + assert len(quotechar) == 1, "quotechar can only be a single character" ds = DataSet() - ds._dataSet = self._context.csv(pattern, - columns, - header is None, - header if header is not None else False, - '' if delimiter is None else delimiter, - quotechar, - null_values, - type_hints) + ds._dataSet = self._context.csv( + pattern, + columns, + header is None, + header if header is not None else False, + "" if delimiter is None else delimiter, + quotechar, + null_values, + type_hints, + ) return ds def text(self, pattern, null_values=None): @@ -310,15 +363,17 @@ def text(self, pattern, null_values=None): if not null_values: null_values = [] - assert isinstance(pattern, str), 'file pattern must be given as str' - assert isinstance(null_values, list), 'null_values must be a list of strings representing null values' + assert isinstance(pattern, str), "file pattern must be given as str" + assert isinstance(null_values, list), ( + "null_values must be a list of strings representing null values" + ) ds = DataSet() ds._dataSet = self._context.text(pattern, null_values) return ds def orc(self, pattern, columns=None): - """ reads orc files. + """reads orc files. Args: pattern (str): a file glob pattern, e.g. /data/file.csv or /data/\*.csv or /\*/\*csv columns (list): optional list of columns, will be used as header for the CSV file. 
@@ -326,15 +381,17 @@ def orc(self, pattern, columns=None): tuplex.dataset.DataSet: A Tuplex Dataset object that allows further ETL operations """ - assert isinstance(pattern, str), 'file pattern must be given as str' - assert isinstance(columns, list) or columns is None, 'columns must be a list or None' + assert isinstance(pattern, str), "file pattern must be given as str" + assert isinstance(columns, list) or columns is None, ( + "columns must be a list or None" + ) ds = DataSet() ds._dataSet = self._context.orc(pattern, columns) return ds def options(self, nested=False): - """ retrieves all framework parameters as dictionary + """retrieves all framework parameters as dictionary Args: nested (bool): When set to true, this will return a nested dictionary. @@ -346,15 +403,15 @@ def options(self, nested=False): opt = self._context.options() # small hack because boost python has problems with nested dicts - opt['tuplex.csv.separators'] = eval(opt['tuplex.csv.separators']) - opt['tuplex.csv.comments'] = eval(opt['tuplex.csv.comments']) + opt["tuplex.csv.separators"] = eval(opt["tuplex.csv.separators"]) + opt["tuplex.csv.comments"] = eval(opt["tuplex.csv.comments"]) if nested: return unflatten_dict(opt) else: return opt - def optionsToYAML(self, file_path='config.yaml'): + def optionsToYAML(self, file_path="config.yaml"): """saves options as yaml file to (local) filepath Args: @@ -413,12 +470,12 @@ def uiWebURL(self): None if webUI was disabled, else URL as string """ options = self.options() - if not options['tuplex.webui.enable']: + if not options["tuplex.webui.enable"]: return None - hostname = options['tuplex.webui.url'] - port = options['tuplex.webui.port'] - url = '{}:{}'.format(hostname, port) - if not url.startswith('http://') or url.startswith('https://'): - url = 'http://' + url + hostname = options["tuplex.webui.url"] + port = options["tuplex.webui.port"] + url = "{}:{}".format(hostname, port) + if not url.startswith("http://") or url.startswith("https://"): + url = "http://" + url return url diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 2d7eecc00..27e0d37a7 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,36 +7,39 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# -import cloudpickle -import sys import logging +import cloudpickle + try: - from .libexec.tuplex import _Context, _DataSet + # Checks that compiled tuplex extension object is present and compatible. 
+ from .libexec.tuplex import _Context, _DataSet # noqa: F401 except ModuleNotFoundError as e: logging.error("need to compiled Tuplex first, details: {}".format(e)) -from tuplex.utils.reflection import get_source as get_udf_source -from tuplex.utils.reflection import get_globals from tuplex.utils.framework import UDFCodeExtractionError -from tuplex.utils.source_vault import SourceVault +from tuplex.utils.reflection import get_globals +from tuplex.utils.reflection import get_source as get_udf_source + from .exceptions import classToExceptionCode # signed 64bit limit max_rows = 9223372036854775807 -class DataSet: +class DataSet: def __init__(self): - self._dataSet = None + self._dataSet: _DataSet = None def unique(self): - """ removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement. + """removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement. Returns: tuplex.dataset.Dataset: A Tuplex Dataset object that allows further ETL operations. """ - assert self._dataSet is not None, 'internal API error, datasets must be created via context object' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context object" + ) ds = DataSet() ds._dataSet = self._dataSet.unique() @@ -57,16 +60,18 @@ def map(self, ftor): tuplex.dataset.DataSet: A Tuplex Dataset object that allows further ETL operations """ - assert self._dataSet is not None, 'internal API error, datasets must be created via context object' - assert ftor is not None, 'need to provide valid functor' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context object" + ) + assert ftor is not None, "need to provide valid functor" - code = '' + code = "" # try to get code from vault (only lambdas supported yet!) try: # convert code object to str representation code = get_udf_source(ftor) except UDFCodeExtractionError as e: - logging.warn('Could not extract code for {}. Details:\n{}'.format(ftor, e)) + logging.warn("Could not extract code for {}. Details:\n{}".format(ftor, e)) g = get_globals(ftor) @@ -86,16 +91,18 @@ def filter(self, ftor): tuplex.dataset.DataSet: A Tuplex Dataset object that allows further ETL operations """ - assert self._dataSet is not None, 'internal API error, datasets must be created via context object' - assert ftor is not None, 'need to provide valid functor' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context object" + ) + assert ftor is not None, "need to provide valid functor" - code = '' + code = "" # try to get code from vault (only lambdas supported yet!) try: # convert code object to str representation code = get_udf_source(ftor) except UDFCodeExtractionError as e: - logging.warn('Could not extract code for {}. Details:\n{}'.format(ftor, e)) + logging.warn("Could not extract code for {}. Details:\n{}".format(ftor, e)) g = get_globals(ftor) ds = DataSet() @@ -103,17 +110,19 @@ def filter(self, ftor): return ds def collect(self): - """ action that generates a physical plan, processes data and collects result then as list of tuples. + """action that generates a physical plan, processes data and collects result then as list of tuples. Returns: (list): A list of tuples, or values if the dataset has only one column. 
""" - assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context objects" + ) return self._dataSet.collect() def take(self, nrows=5): - """ action that generates a physical plan, processes data and collects the top results then as list of tuples. + """action that generates a physical plan, processes data and collects the top results then as list of tuples. Args: nrows (int): number of rows to collect. Per default ``5``. @@ -122,22 +131,26 @@ def take(self, nrows=5): """ - assert isinstance(nrows, int), 'num rows must be an integer' - assert nrows > 0, 'please specify a number greater than zero' + assert isinstance(nrows, int), "num rows must be an integer" + assert nrows > 0, "please specify a number greater than zero" - assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context objects" + ) return self._dataSet.take(nrows) def show(self, nrows=None): - """ action that generates a physical plan, processes data and prints results as nicely formatted + """action that generates a physical plan, processes data and prints results as nicely formatted ASCII table to stdout. Args: nrows (int): number of rows to collect. If ``None`` all rows will be collected """ - assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context objects" + ) # if optional value is None or below zero, simply return all rows. Else only up to nrows! if nrows is None or nrows < 0: @@ -146,7 +159,7 @@ def show(self, nrows=None): self._dataSet.show(nrows) def resolve(self, eclass, ftor): - """ Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. + """Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. Args: eclass: Which exception to apply resolution for, e.g. ZeroDivisionError @@ -158,22 +171,26 @@ def resolve(self, eclass, ftor): """ # check that predicate is a class for an exception class - assert issubclass(eclass, Exception), 'predicate must be a subclass of Exception' + assert issubclass(eclass, Exception), ( + "predicate must be a subclass of Exception" + ) # translate to C++ exception code enum ec = classToExceptionCode(eclass) - assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context objects" + ) - assert ftor is not None, 'need to provide valid functor' + assert ftor is not None, "need to provide valid functor" - code = '' + code = "" # try to get code from vault (only lambdas supported yet!) try: # convert code object to str representation code = get_udf_source(ftor) except UDFCodeExtractionError as e: - logging.warn('Could not extract code for {}. Details:\n{}'.format(ftor, e)) + logging.warn("Could not extract code for {}. 
Details:\n{}".format(ftor, e)) g = get_globals(ftor) ds = DataSet() @@ -181,7 +198,7 @@ def resolve(self, eclass, ftor): return ds def withColumn(self, column, ftor): - """ appends a new column to the dataset by calling ftor over existing tuples + """appends a new column to the dataset by calling ftor over existing tuples Args: column: name for the new column/variable. If column exists, its values will be replaced @@ -192,24 +209,26 @@ def withColumn(self, column, ftor): """ - assert self._dataSet is not None, 'internal API error, datasets must be created via context object' - assert ftor is not None, 'need to provide valid functor' - assert isinstance(column, str), 'column needs to be a string' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context object" + ) + assert ftor is not None, "need to provide valid functor" + assert isinstance(column, str), "column needs to be a string" - code = '' + code = "" # try to get code from vault (only lambdas supported yet!) try: # convert code object to str representation code = get_udf_source(ftor) except UDFCodeExtractionError as e: - logging.warn('Could not extract code for {}. Details:\n{}'.format(ftor, e)) + logging.warn("Could not extract code for {}. Details:\n{}".format(ftor, e)) g = get_globals(ftor) ds = DataSet() ds._dataSet = self._dataSet.withColumn(column, code, cloudpickle.dumps(ftor), g) return ds def mapColumn(self, column, ftor): - """ maps directly one column. UDF takes as argument directly the value of the specified column and will overwrite + """maps directly one column. UDF takes as argument directly the value of the specified column and will overwrite that column with the result. If you need access to multiple columns, use withColumn instead. If the column name already exists, it will be overwritten. @@ -221,24 +240,26 @@ def mapColumn(self, column, ftor): tuplex.dataset.DataSet: A Tuplex Dataset object that allows further ETL operations """ - assert self._dataSet is not None, 'internal API error, datasets must be created via context object' - assert ftor is not None, 'need to provide valid functor' - assert isinstance(column, str), 'column needs to be a string' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context object" + ) + assert ftor is not None, "need to provide valid functor" + assert isinstance(column, str), "column needs to be a string" - code = '' + code = "" # try to get code from vault (only lambdas supported yet!) try: # convert code object to str representation code = get_udf_source(ftor) except UDFCodeExtractionError as e: - logging.warn('Could not extract code for {}. Details:\n{}'.format(ftor, e)) + logging.warn("Could not extract code for {}. Details:\n{}".format(ftor, e)) g = get_globals(ftor) ds = DataSet() ds._dataSet = self._dataSet.mapColumn(column, code, cloudpickle.dumps(ftor), g) return ds def selectColumns(self, columns): - """ selects a subset of columns as defined through columns which is a list or a single column + """selects a subset of columns as defined through columns which is a list or a single column Args: columns: list of strings or integers. A string should reference a column name, whereas as an integer refers to an index. Indices may be negative according to python rules. 
Order in list determines output order @@ -248,24 +269,28 @@ def selectColumns(self, columns): """ - assert self._dataSet is not None, 'internal API error, datasets must be created via context object' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context object" + ) # syntatic sugar, allow single column, list, tuple, ... if isinstance(columns, (str, int)): columns = [columns] if isinstance(columns, tuple): columns = list(columns) - assert(isinstance(columns, list)) + assert isinstance(columns, list) for el in columns: - assert isinstance(el, (str, int)), 'element {} must be a string or int'.format(el) + assert isinstance(el, (str, int)), ( + "element {} must be a string or int".format(el) + ) ds = DataSet() ds._dataSet = self._dataSet.selectColumns(columns) return ds def renameColumn(self, key, newColumnName): - """ rename a column in dataset + """rename a column in dataset Args: key: str|int, old column name or (0-indexed) position. newColumnName: str, new column name @@ -274,10 +299,12 @@ def renameColumn(self, key, newColumnName): Dataset """ - assert self._dataSet is not None, 'internal API error, datasets must be created via context object' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context object" + ) - assert isinstance(key, (str, int)), 'key must be a string or integer' - assert isinstance(newColumnName, str), 'newColumnName must be a string' + assert isinstance(key, (str, int)), "key must be a string or integer" + assert isinstance(newColumnName, str), "newColumnName must be a string" ds = DataSet() if isinstance(key, str): @@ -285,11 +312,11 @@ def renameColumn(self, key, newColumnName): elif isinstance(key, int): ds._dataSet = self._dataSet.renameColumnByPosition(key, newColumnName) else: - raise TypeError('key must be int or str') + raise TypeError("key must be int or str") return ds def ignore(self, eclass): - """ ignores exceptions of type eclass caused by previous operator + """ignores exceptions of type eclass caused by previous operator Args: eclass: exception type/class to ignore @@ -300,19 +327,23 @@ def ignore(self, eclass): """ # check that predicate is a class for an exception class - assert issubclass(eclass, Exception), 'predicate must be a subclass of Exception' + assert issubclass(eclass, Exception), ( + "predicate must be a subclass of Exception" + ) # translate to C++ exception code enum ec = classToExceptionCode(eclass) - assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context objects" + ) ds = DataSet() ds._dataSet = self._dataSet.ignore(ec) return ds def cache(self, store_specialized=True): - """ materializes rows in main-memory for reuse with several pipelines. Can be also used to benchmark certain pipeline costs + """materializes rows in main-memory for reuse with several pipelines. Can be also used to benchmark certain pipeline costs Args: store_specialized: bool whether to store normal case and general case separated or merge everything into one normal case. This affects optimizations for operators called on a cached dataset. 
@@ -321,7 +352,9 @@ def cache(self, store_specialized=True): tuplex.dataset.DataSet: A Tuplex Dataset object that allows further ETL operations """ - assert self._dataSet is not None, 'internal API error, datasets must be created via context object' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context object" + ) ds = DataSet() ds._dataSet = self._dataSet.cache(store_specialized) @@ -329,7 +362,7 @@ def cache(self, store_specialized=True): @property def columns(self): - """ retrieve names of columns if assigned + """retrieve names of columns if assigned Returns: None or List[str]: Returns None if columns haven't been named yet or a list of strings representing the column names. @@ -339,7 +372,7 @@ def columns(self): @property def types(self): - """ output schema as list of type objects of the dataset. If the dataset has an error, None is returned. + """output schema as list of type objects of the dataset. If the dataset has an error, None is returned. Returns: detected types (general case) of dataset. Typed according to typing module. @@ -347,7 +380,9 @@ def types(self): types = self._dataSet.types() return types - def join(self, dsRight, leftKeyColumn, rightKeyColumn, prefixes=None, suffixes=None): + def join( + self, dsRight, leftKeyColumn, rightKeyColumn, prefixes=None, suffixes=None + ): """ (inner) join with other dataset Args: @@ -361,33 +396,46 @@ def join(self, dsRight, leftKeyColumn, rightKeyColumn, prefixes=None, suffixes=N """ - assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - assert dsRight._dataSet is not None, 'internal API error, datasets must be created via context objects' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context objects" + ) + assert dsRight._dataSet is not None, ( + "internal API error, datasets must be created via context objects" + ) # process prefixes/suffixes - leftPrefix = '' - leftSuffix = '' - rightPrefix = '' - rightSuffix = '' + leftPrefix = "" + leftSuffix = "" + rightPrefix = "" + rightSuffix = "" if prefixes: prefixes = tuple(prefixes) - assert len(prefixes) == 2, 'prefixes must be a sequence of 2 elements!' - leftPrefix = prefixes[0] if prefixes[0] else '' - rightPrefix = prefixes[1] if prefixes[1] else '' + assert len(prefixes) == 2, "prefixes must be a sequence of 2 elements!" + leftPrefix = prefixes[0] if prefixes[0] else "" + rightPrefix = prefixes[1] if prefixes[1] else "" if suffixes: suffixes = tuple(suffixes) - assert len(suffixes) == 2, 'prefixes must be a sequence of 2 elements!' - leftSuffix = suffixes[0] if suffixes[0] else '' - rightSuffix = suffixes[1] if suffixes[1] else '' + assert len(suffixes) == 2, "prefixes must be a sequence of 2 elements!" 
+ leftSuffix = suffixes[0] if suffixes[0] else "" + rightSuffix = suffixes[1] if suffixes[1] else "" ds = DataSet() - ds._dataSet = self._dataSet.join(dsRight._dataSet, leftKeyColumn, rightKeyColumn, - leftPrefix, leftSuffix, rightPrefix, rightSuffix) + ds._dataSet = self._dataSet.join( + dsRight._dataSet, + leftKeyColumn, + rightKeyColumn, + leftPrefix, + leftSuffix, + rightPrefix, + rightSuffix, + ) return ds - def leftJoin(self, dsRight, leftKeyColumn, rightKeyColumn, prefixes=None, suffixes=None): + def leftJoin( + self, dsRight, leftKeyColumn, rightKeyColumn, prefixes=None, suffixes=None + ): """ left (outer) join with other dataset Args: @@ -401,34 +449,53 @@ def leftJoin(self, dsRight, leftKeyColumn, rightKeyColumn, prefixes=None, suffix """ - assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - assert dsRight._dataSet is not None, 'internal API error, datasets must be created via context objects' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context objects" + ) + assert dsRight._dataSet is not None, ( + "internal API error, datasets must be created via context objects" + ) # process prefixes/suffixes - leftPrefix = '' - leftSuffix = '' - rightPrefix = '' - rightSuffix = '' + leftPrefix = "" + leftSuffix = "" + rightPrefix = "" + rightSuffix = "" if prefixes: prefixes = tuple(prefixes) - assert len(prefixes) == 2, 'prefixes must be a sequence of 2 elements!' - leftPrefix = prefixes[0] if prefixes[0] else '' - rightPrefix = prefixes[1] if prefixes[1] else '' + assert len(prefixes) == 2, "prefixes must be a sequence of 2 elements!" + leftPrefix = prefixes[0] if prefixes[0] else "" + rightPrefix = prefixes[1] if prefixes[1] else "" if suffixes: suffixes = tuple(suffixes) - assert len(suffixes) == 2, 'prefixes must be a sequence of 2 elements!' - leftSuffix = suffixes[0] if suffixes[0] else '' - rightSuffix = suffixes[1] if suffixes[1] else '' + assert len(suffixes) == 2, "prefixes must be a sequence of 2 elements!" + leftSuffix = suffixes[0] if suffixes[0] else "" + rightSuffix = suffixes[1] if suffixes[1] else "" ds = DataSet() - ds._dataSet = self._dataSet.leftJoin(dsRight._dataSet, leftKeyColumn, rightKeyColumn, - leftPrefix, leftSuffix, rightPrefix, rightSuffix) + ds._dataSet = self._dataSet.leftJoin( + dsRight._dataSet, + leftKeyColumn, + rightKeyColumn, + leftPrefix, + leftSuffix, + rightPrefix, + rightSuffix, + ) return ds - - def tocsv(self, path, part_size=0, num_rows=max_rows, num_parts=0, part_name_generator=None, null_value=None, header=True): + def tocsv( + self, + path, + part_size=0, + num_rows=max_rows, + num_parts=0, + part_name_generator=None, + null_value=None, + header=True, + ): """ save dataset to one or more csv files. Triggers execution of pipeline. Args: path: path where to save files to @@ -441,10 +508,14 @@ def tocsv(self, path, part_size=0, num_rows=max_rows, num_parts=0, part_name_gen null_value: string to represent null values. None equals empty string. Must provide explicit quoting for this argument. header: bool to indicate whether to write a header or not or a list of strings to specify explicitly a header to write. number of names provided must match the column count. 
""" - assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - assert isinstance(header, list) or isinstance(header, bool), 'header must be a list of strings, or a boolean' - - code, code_pickled = '', '' + assert self._dataSet is not None, ( + "internal API error, datasets must be created via context objects" + ) + assert isinstance(header, list) or isinstance(header, bool), ( + "header must be a list of strings, or a boolean" + ) + + code, code_pickled = "", "" if part_name_generator is not None: code_pickled = cloudpickle.dumps(part_name_generator) # try to get code from vault (only lambdas supported yet!) @@ -452,18 +523,31 @@ def tocsv(self, path, part_size=0, num_rows=max_rows, num_parts=0, part_name_gen # convert code object to str representation code = get_udf_source(part_name_generator) except UDFCodeExtractionError as e: - logging.warn('Could not extract code for {}. Details:\n{}'.format(ftor, e)) + logging.warn( + "Could not extract code for {}. Details:\n{}".format( + part_name_generator, e + ) + ) # clamp max rows if num_rows > max_rows: - raise Exception('Tuplex supports at most {} rows'.format(max_rows)) + raise Exception("Tuplex supports at most {} rows".format(max_rows)) if null_value is None: - null_value = '' - - self._dataSet.tocsv(path, code, code_pickled, num_parts, part_size, num_rows, null_value, header) - - def toorc(self, path, part_size=0, num_rows=max_rows, num_parts=0, part_name_generator=None): + null_value = "" + + self._dataSet.tocsv( + path, code, code_pickled, num_parts, part_size, num_rows, null_value, header + ) + + def toorc( + self, + path, + part_size=0, + num_rows=max_rows, + num_parts=0, + part_name_generator=None, + ): """ save dataset to one or more orc files. Triggers execution of pipeline. Args: path: path where to save files to @@ -476,7 +560,7 @@ def toorc(self, path, part_size=0, num_rows=max_rows, num_parts=0, part_name_gen """ assert self._dataSet is not None - code, code_pickled = '', '' + code, code_pickled = "", "" if part_name_generator is not None: code_pickled = cloudpickle.dumps(part_name_generator) # try to get code from vault (only lambdas supported yet!) @@ -484,10 +568,14 @@ def toorc(self, path, part_size=0, num_rows=max_rows, num_parts=0, part_name_gen # convert code object to str representation code = get_udf_source(part_name_generator) except UDFCodeExtractionError as e: - logging.warn('Could not extract code for {}. Details:\n{}'.format(ftor, e)) + logging.warn( + "Could not extract code for {}. Details:\n{}".format( + part_name_generator, e + ) + ) if num_rows > max_rows: - raise Exception('Tuplex supports at most {} rows'.format(max_rows)) + raise Exception("Tuplex supports at most {} rows".format(max_rows)) self._dataSet.toorc(path, code, code_pickled, num_parts, part_size, num_rows) @@ -502,7 +590,7 @@ def aggregate(self, combine, aggregate, initial_value): Dataset """ - comb_code, agg_code = '', '' + comb_code, agg_code = "", "" comb_code_pickled = cloudpickle.dumps(combine) agg_code_pickled = cloudpickle.dumps(aggregate) @@ -510,20 +598,34 @@ def aggregate(self, combine, aggregate, initial_value): # convert code object to str representation comb_code = get_udf_source(combine) except UDFCodeExtractionError as e: - logging.warn('Could not extract code for combine UDF {}. Details:\n{}'.format(combine, e)) + logging.warn( + "Could not extract code for combine UDF {}. 
Details:\n{}".format( + combine, e + ) + ) try: # convert code object to str representation agg_code = get_udf_source(aggregate) except UDFCodeExtractionError as e: - logging.warn('Could not extract code for aggregate UDF {}. Details:\n{}'.format(aggregate, e)) + logging.warn( + "Could not extract code for aggregate UDF {}. Details:\n{}".format( + aggregate, e + ) + ) g_comb = get_globals(combine) g_agg = get_globals(aggregate) ds = DataSet() - ds._dataSet = self._dataSet.aggregate(comb_code, comb_code_pickled, - agg_code, agg_code_pickled, - cloudpickle.dumps(initial_value), g_comb, g_agg) + ds._dataSet = self._dataSet.aggregate( + comb_code, + comb_code_pickled, + agg_code, + agg_code_pickled, + cloudpickle.dumps(initial_value), + g_comb, + g_agg, + ) return ds def aggregateByKey(self, combine, aggregate, initial_value, key_columns): @@ -546,30 +648,40 @@ def aggregateByKey(self, combine, aggregate, initial_value, key_columns): if isinstance(key_columns, int): key_columns = [key_columns] - comb_code, comb_code_pickled = '', '' - agg_code, agg_code_pickled = '', '' + comb_code, comb_code_pickled = "", "" + agg_code, agg_code_pickled = "", "" try: # convert code object to str representation comb_code = get_udf_source(combine) comb_code_pickled = cloudpickle.dumps(combine) except UDFCodeExtractionError as e: - logging.warn('Could not extract code for combine UDF {}. Details:\n{}'.format(ftor, e)) + logging.warn( + "Could not extract code for combine UDFs. Details:\n{}".format(e) + ) try: # convert code object to str representation agg_code = get_udf_source(aggregate) agg_code_pickled = cloudpickle.dumps(aggregate) except UDFCodeExtractionError as e: - logging.warn('Could not extract code for aggregate UDF {}. Details:\n{}'.format(ftor, e)) + logging.warn( + "Could not extract code for aggregate UDFs. 
Details:\n{}".format(e) + ) g_comb = get_globals(combine) g_agg = get_globals(aggregate) ds = DataSet() - ds._dataSet = self._dataSet.aggregateByKey(comb_code, comb_code_pickled, - agg_code, agg_code_pickled, - cloudpickle.dumps(initial_value), key_columns, - g_comb, g_agg) + ds._dataSet = self._dataSet.aggregateByKey( + comb_code, + comb_code_pickled, + agg_code, + agg_code_pickled, + cloudpickle.dumps(initial_value), + key_columns, + g_comb, + g_agg, + ) return ds @property diff --git a/tuplex/python/tuplex/distributed.py b/tuplex/python/tuplex/distributed.py index 4246ef7a5..096bf56a3 100644 --- a/tuplex/python/tuplex/distributed.py +++ b/tuplex/python/tuplex/distributed.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,51 +7,49 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 11/4/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# try: import boto3 import botocore.exceptions -except Exception as e: +except Exception: # ignore here, because boto3 is optional pass - #raise Exception('To use distributed version, please install boto3') + # raise Exception('To use distributed version, please install boto3') -import logging -import tempfile -import logging -import os import base64 import datetime -import socket -import json +import logging +import os import sys import threading import time # Tuplex specific imports -from tuplex.utils.common import in_jupyter_notebook, in_google_colab, is_in_interactive_mode, current_user, host_name +from tuplex.utils.common import current_user, host_name def current_iam_user(): - iam = boto3.resource('iam') + iam = boto3.resource("iam") user = iam.CurrentUser() return user.user_name.lower() def default_lambda_name(): - return 'tuplex-lambda-runner' + return "tuplex-lambda-runner" def default_lambda_role(): - return 'tuplex-lambda-role' + return "tuplex-lambda-role" def default_bucket_name(): - return 'tuplex-' + current_iam_user() + return "tuplex-" + current_iam_user() + def default_scratch_dir(): - return default_bucket_name() + '/scratch' + return default_bucket_name() + "/scratch" + def current_region(): session = boto3.session.Session() @@ -59,41 +57,52 @@ def current_region(): if region is None: # could do fancier auto-detect here... - return 'us-east-1' + return "us-east-1" return region + def check_credentials(aws_access_key_id=None, aws_secret_access_key=None): kwargs = {} if isinstance(aws_access_key_id, str): - kwargs['aws_access_key_id'] = aws_access_key_id + kwargs["aws_access_key_id"] = aws_access_key_id if isinstance(aws_secret_access_key, str): - kwargs['aws_secret_access_key'] = aws_secret_access_key - client = boto3.client('s3', **kwargs) + kwargs["aws_secret_access_key"] = aws_secret_access_key + client = boto3.client("s3", **kwargs) try: client.list_buckets() except botocore.exceptions.NoCredentialsError as e: - logging.error('Could not connect to AWS, Details: {}. 
To configure AWS credentials please confer the guide under https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials'.format(e)) + logging.error( + "Could not connect to AWS, Details: {}. To configure AWS credentials please confer the guide under https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials".format( + e + ) + ) return False return True + def ensure_s3_bucket(s3_client, bucket_name, region): - bucket_names = list(map(lambda b: b['Name'], s3_client.list_buckets()['Buckets'])) + bucket_names = list(map(lambda b: b["Name"], s3_client.list_buckets()["Buckets"])) if bucket_name not in bucket_names: - logging.info('Bucket {} not found, creating (private bucket) in {} ...'.format(bucket_name, region)) + logging.info( + "Bucket {} not found, creating (private bucket) in {} ...".format( + bucket_name, region + ) + ) # bug in boto3: if region == current_region(): s3_client.create_bucket(Bucket=bucket_name) - logging.info('Bucket {} created in {}'.format(bucket_name, region)) + logging.info("Bucket {} created in {}".format(bucket_name, region)) else: - location = {'LocationConstraint': region.strip()} - s3_client.create_bucket(Bucket=bucket_name, - CreateBucketConfiguration=location) - logging.info('Bucket {} created in {}'.format(bucket_name, region)) + location = {"LocationConstraint": region.strip()} + s3_client.create_bucket( + Bucket=bucket_name, CreateBucketConfiguration=location + ) + logging.info("Bucket {} created in {}".format(bucket_name, region)) else: - logging.info('Found bucket {}'.format(bucket_name)) + logging.info("Found bucket {}".format(bucket_name)) def create_lambda_role(iam_client, lambda_role): @@ -102,39 +111,59 @@ def create_lambda_role(iam_client, lambda_role): lambda_access_to_s3 = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["s3:*MultipartUpload*","s3:Get*","s3:ListBucket","s3:Put*"],"Resource":"*"}]}' lambda_invoke_others = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["lambda:InvokeFunction","lambda:InvokeAsync"],"Resource":"*"}]}' - iam_client.create_role(RoleName=lambda_role, - AssumeRolePolicyDocument=trust_policy, - Description='Auto-created Role for Tuplex AWS Lambda runner') - iam_client.attach_role_policy(RoleName=lambda_role, - PolicyArn='arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole') - iam_client.put_role_policy(RoleName=lambda_role, PolicyName='InvokeOtherlambdas', - PolicyDocument=lambda_invoke_others) - iam_client.put_role_policy(RoleName=lambda_role, PolicyName='LambdaAccessForS3', PolicyDocument=lambda_access_to_s3) - logging.info('Created Tuplex AWS Lambda runner role ({})'.format(lambda_role)) - - # check it exists + iam_client.create_role( + RoleName=lambda_role, + AssumeRolePolicyDocument=trust_policy, + Description="Auto-created Role for Tuplex AWS Lambda runner", + ) + iam_client.attach_role_policy( + RoleName=lambda_role, + PolicyArn="arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole", + ) + iam_client.put_role_policy( + RoleName=lambda_role, + PolicyName="InvokeOtherlambdas", + PolicyDocument=lambda_invoke_others, + ) + iam_client.put_role_policy( + RoleName=lambda_role, + PolicyName="LambdaAccessForS3", + PolicyDocument=lambda_access_to_s3, + ) + logging.info("Created Tuplex AWS Lambda runner role ({})".format(lambda_role)) + + # Check that role exists. 
try: - response = iam_client.get_role(RoleName=lambda_role) - except: - raise Exception('Failed to create AWS Lambda Role') + iam_client.get_role(RoleName=lambda_role) + except botocore.exceptions.ClientError: + raise Exception("Failed to create AWS Lambda Role.") def remove_lambda_role(iam_client, lambda_role): # detach policies... try: - iam_client.detach_role_policy(RoleName=lambda_role, - PolicyArn='arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole') + iam_client.detach_role_policy( + RoleName=lambda_role, + PolicyArn="arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole", + ) except Exception as e: logging.error( - 'Error while detaching policy AWSLambdaBasicExecutionRole, Tuplex setup corrupted? Details: {}'.format(e)) + "Error while detaching policy AWSLambdaBasicExecutionRole, Tuplex setup corrupted? Details: {}".format( + e + ) + ) - policy_names = iam_client.list_role_policies(RoleName=lambda_role)['PolicyNames'] + policy_names = iam_client.list_role_policies(RoleName=lambda_role)["PolicyNames"] for name in policy_names: try: iam_client.delete_role_policy(RoleName=lambda_role, PolicyName=name) except Exception as e: - logging.error('Error while detaching policy {}, Tuplex setup corrupted? Details: {}'.format(name, e)) + logging.error( + "Error while detaching policy {}, Tuplex setup corrupted? Details: {}".format( + name, e + ) + ) # delete role... iam_client.delete_role(RoleName=lambda_role) @@ -143,16 +172,18 @@ def remove_lambda_role(iam_client, lambda_role): def setup_lambda_role(iam_client, lambda_role, region, overwrite): try: response = iam_client.get_role(RoleName=lambda_role) - logging.info('Found Lambda role from {}'.format(response['Role']['CreateDate'])) + logging.info("Found Lambda role from {}".format(response["Role"]["CreateDate"])) # throw dummy exception to force overwrite if overwrite: remove_lambda_role(iam_client, lambda_role) - logging.info('Overwriting existing role {}'.format(lambda_role)) + logging.info("Overwriting existing role {}".format(lambda_role)) create_lambda_role(iam_client, lambda_role) - except iam_client.exceptions.NoSuchEntityException as e: - logging.info('Role {} was not found in {}, creating ...'.format(lambda_role, region)) + except iam_client.exceptions.NoSuchEntityException: + logging.info( + "Role {} was not found in {}, creating ...".format(lambda_role, region) + ) create_lambda_role(iam_client, lambda_role) @@ -166,7 +197,6 @@ def sizeof_fmt(num, suffix="B"): class ProgressPercentage(object): - def __init__(self, filename): self._filename = filename self._size = float(os.path.getsize(filename)) @@ -179,23 +209,37 @@ def __call__(self, bytes_amount): self._seen_so_far += bytes_amount percentage = (self._seen_so_far / self._size) * 100 sys.stdout.write( - "\r%s %s / %s (%.2f%%)" % ( - self._filename, sizeof_fmt(self._seen_so_far), sizeof_fmt(self._size), - percentage)) + "\r%s %s / %s (%.2f%%)" + % ( + self._filename, + sizeof_fmt(self._seen_so_far), + sizeof_fmt(self._size), + percentage, + ) + ) sys.stdout.flush() def s3_split_uri(uri): - assert '/' in uri, 'at least one / is required!' - uri = uri.replace('s3://', '') + assert "/" in uri, "at least one / is required!" 
+ uri = uri.replace("s3://", "") - bucket = uri[:uri.find('/')] - key = uri[uri.find('/') + 1:] + bucket = uri[: uri.find("/")] + key = uri[uri.find("/") + 1 :] return bucket, key -def upload_lambda(iam_client, lambda_client, lambda_function_name, lambda_role, - lambda_zip_file, overwrite=False, s3_client=None, s3_scratch_space=None, quiet=False): +def upload_lambda( + iam_client, + lambda_client, + lambda_function_name, + lambda_role, + lambda_zip_file, + overwrite=False, + s3_client=None, + s3_scratch_space=None, + quiet=False, +): # AWS only allows 50MB to be uploaded directly via request. Else, requires S3 upload. ZIP_UPLOAD_LIMIT_SIZE = 50000000 @@ -204,124 +248,149 @@ def upload_lambda(iam_client, lambda_client, lambda_function_name, lambda_role, # for runtime, choose https://docs.aws.amazon.com/lambda/latest/dg/lambda-runtimes.html RUNTIME = "provided.al2" HANDLER = "tplxlam" # this is how the executable is called... - ARCHITECTURES = ['x86_64'] DEFAULT_MEMORY_SIZE = 1536 DEFAULT_TIMEOUT = 30 # 30s timeout if not os.path.isfile(lambda_zip_file): - raise Exception('Could not find local lambda zip file {}'.format(lambda_zip_file)) + raise Exception( + "Could not find local lambda zip file {}".format(lambda_zip_file) + ) file_size = os.stat(lambda_zip_file).st_size # if file size is smaller than limit, check how large the base64 encoded version is... CODE = None if file_size < ZIP_UPLOAD_LIMIT_SIZE: - logging.info('Encoding Lambda as base64 ({})'.format(sizeof_fmt(file_size))) - with open(lambda_zip_file, 'rb') as fp: + logging.info("Encoding Lambda as base64 ({})".format(sizeof_fmt(file_size))) + with open(lambda_zip_file, "rb") as fp: CODE = fp.read() CODE = base64.b64encode(CODE) b64_file_size = len(CODE) + 1 - logging.info('File size as base64 is {}'.format(sizeof_fmt(b64_file_size))) + logging.info("File size as base64 is {}".format(sizeof_fmt(b64_file_size))) else: b64_file_size = ZIP_UPLOAD_LIMIT_SIZE + 42 # to not trigger below if # get ARN of lambda role response = iam_client.get_role(RoleName=lambda_role) - lambda_role_arn = response['Role']['Arn'] + lambda_role_arn = response["Role"]["Arn"] # check if Lambda function already exists, if overwrite delete! 
-    l_response = lambda_client.list_functions(FunctionVersion='ALL')
-    functions = list(filter(lambda f: f['FunctionName'] == lambda_function_name, l_response['Functions']))
+    l_response = lambda_client.list_functions(FunctionVersion="ALL")
+    functions = list(
+        filter(
+            lambda f: f["FunctionName"] == lambda_function_name, l_response["Functions"]
+        )
+    )
 
     if len(functions) > 0:
         if len(functions) != 1:
-            logging.warning('Found multiple functions with name {}, deleting them all.'.format(lambda_function_name))
+            logging.warning(
+                "Found multiple functions with name {}, deleting them all.".format(
+                    lambda_function_name
+                )
+            )
         if not overwrite:
             raise Exception(
-                'Found existing Lambda function {}, specify overwrite=True to replace'.format(lambda_function_name))
+                "Found existing Lambda function {}, specify overwrite=True to replace".format(
+                    lambda_function_name
+                )
+            )
 
         for f in functions:
-            lambda_client.delete_function(FunctionName=f['FunctionName'])
-            logging.info('Removed existing function {} (Runtime={}, MemorySize={}) from {}'.format(f['FunctionName'],
-                                                                                                   f['Runtime'],
-                                                                                                   f['MemorySize'],
-                                                                                                   f['LastModified']))
+            lambda_client.delete_function(FunctionName=f["FunctionName"])
+            logging.info(
+                "Removed existing function {} (Runtime={}, MemorySize={}) from {}".format(
+                    f["FunctionName"], f["Runtime"], f["MemorySize"], f["LastModified"]
+                )
+            )
 
-    logging.info('Assigning role {} to runner'.format(lambda_role_arn))
+    logging.info("Assigning role {} to runner".format(lambda_role_arn))
 
     user = current_user()
     host = host_name()
-    DEPLOY_MESSAGE = "Auto-deployed Tuplex Lambda Runner function." \
-                     " Uploaded by {} from {} on {}".format(user, host, datetime.datetime.now())
+    DEPLOY_MESSAGE = (
+        "Auto-deployed Tuplex Lambda Runner function."
+        " Uploaded by {} from {} on {}".format(user, host, datetime.datetime.now())
+    )
 
     if b64_file_size < ZIP_UPLOAD_LIMIT_SIZE:
-        logging.info('Found packaged lambda ({})'.format(sizeof_fmt(file_size)))
+        logging.info("Found packaged lambda ({})".format(sizeof_fmt(file_size)))
 
-        logging.info('Loading local zipped lambda...')
+        logging.info("Loading local zipped lambda...")
 
-        logging.info('Uploading Lambda to AWS ({})'.format(sizeof_fmt(file_size)))
+        logging.info("Uploading Lambda to AWS ({})".format(sizeof_fmt(file_size)))
 
         try:
            # upload directly, we use Custom
-            response = lambda_client.create_function(FunctionName=lambda_function_name,
-                                                     Runtime=RUNTIME,
-                                                     Handler=HANDLER,
-                                                     Role=lambda_role_arn,
-                                                     Code={'ZipFile': CODE},
-                                                     Description=DEPLOY_MESSAGE,
-                                                     PackageType='Zip',
-                                                     MemorySize=DEFAULT_MEMORY_SIZE,
-                                                     Timeout=DEFAULT_TIMEOUT)
+            response = lambda_client.create_function(
+                FunctionName=lambda_function_name,
+                Runtime=RUNTIME,
+                Handler=HANDLER,
+                Role=lambda_role_arn,
+                Code={"ZipFile": CODE},
+                Description=DEPLOY_MESSAGE,
+                PackageType="Zip",
+                MemorySize=DEFAULT_MEMORY_SIZE,
+                Timeout=DEFAULT_TIMEOUT,
+            )
         except Exception as e:
-            logging.error('Failed with: {}'.format(type(e)))
-            logging.error('Details: {}'.format(str(e)[:2048]))
+            logging.error("Failed with: {}".format(type(e)))
+            logging.error("Details: {}".format(str(e)[:2048]))
             raise e
     else:
         if s3_client is None or s3_scratch_space is None:
-            raise Exception("Local packaged lambda to large to upload directly, " \
-                            "need S3. Please specify S3 client + scratch space")
-        logging.info("Lambda function is larger than current limit ({}) AWS allows, " \
-                     " deploying via S3...".format(sizeof_fmt(ZIP_UPLOAD_LIMIT_SIZE)))
+            raise Exception(
+                "Local packaged lambda too large to upload directly, "
+                "need S3. Please specify S3 client + scratch space"
+            )
+        logging.info(
+            "Lambda function is larger than the current limit ({}) AWS allows, "
+            "deploying via S3...".format(sizeof_fmt(ZIP_UPLOAD_LIMIT_SIZE))
+        )
 
         # upload to s3 temporarily
         s3_bucket, s3_key = s3_split_uri(s3_scratch_space)
 
         # scratch space, so naming doesn't matter
-        TEMP_NAME = 'lambda-deploy.zip'
-        s3_key_obj = s3_key + '/' + TEMP_NAME
-        s3_target_uri = 's3://' + s3_bucket + '/' + s3_key + '/' + TEMP_NAME
+        TEMP_NAME = "lambda-deploy.zip"
+        s3_key_obj = s3_key + "/" + TEMP_NAME
+        s3_target_uri = "s3://" + s3_bucket + "/" + s3_key + "/" + TEMP_NAME
 
         callback = ProgressPercentage(lambda_zip_file) if not quiet else None
         s3_client.upload_file(lambda_zip_file, s3_bucket, s3_key_obj, Callback=callback)
 
-        logging.info('Deploying Lambda from S3 ({})'.format(s3_target_uri))
+        logging.info("Deploying Lambda from S3 ({})".format(s3_target_uri))
 
         try:
            # upload directly, we use Custom
-            response = lambda_client.create_function(FunctionName=lambda_function_name,
-                                                     Runtime=RUNTIME,
-                                                     Handler=HANDLER,
-                                                     Role=lambda_role_arn,
-                                                     Code={'S3Bucket': s3_bucket, 'S3Key': s3_key_obj},
-                                                     Description=DEPLOY_MESSAGE,
-                                                     PackageType='Zip',
-                                                     MemorySize=DEFAULT_MEMORY_SIZE,
-                                                     Timeout=DEFAULT_TIMEOUT)
+            response = lambda_client.create_function(
+                FunctionName=lambda_function_name,
+                Runtime=RUNTIME,
+                Handler=HANDLER,
+                Role=lambda_role_arn,
+                Code={"S3Bucket": s3_bucket, "S3Key": s3_key_obj},
+                Description=DEPLOY_MESSAGE,
+                PackageType="Zip",
+                MemorySize=DEFAULT_MEMORY_SIZE,
+                Timeout=DEFAULT_TIMEOUT,
+            )
         except Exception as e:
-            logging.error('Failed with: {}'.format(type(e)))
-            logging.error('Details: {}'.format(str(e)[:2048]))
+            logging.error("Failed with: {}".format(type(e)))
+            logging.error("Details: {}".format(str(e)[:2048]))
 
             # delete S3 file from scratch
             s3_client.delete_object(Bucket=s3_bucket, Key=s3_key_obj)
-            logging.info('Removed {} from S3'.format(s3_target_uri))
+            logging.info("Removed {} from S3".format(s3_target_uri))
             raise e
 
         # delete S3 file from scratch
         s3_client.delete_object(Bucket=s3_bucket, Key=s3_key_obj)
-        logging.info('Removed {} from S3'.format(s3_target_uri))
+        logging.info("Removed {} from S3".format(s3_target_uri))
 
     # print out deployment details
-    logging.info('Lambda function {} deployed (MemorySize={}MB, Timeout={}).'.format(response['FunctionName'],
-                                                                                     response['MemorySize'],
-                                                                                     response['Timeout']))
+    logging.info(
+        "Lambda function {} deployed (MemorySize={}MB, Timeout={}).".format(
+            response["FunctionName"], response["MemorySize"], response["Timeout"]
+        )
+    )
 
     # return lambda response
     return response
 
@@ -337,24 +406,26 @@ def find_lambda_package():
     this_directory = os.path.abspath(os.path.dirname(__file__))
 
     # check if folder other exists & file tplxlam.zip in it!
-    candidate_path = os.path.join(this_directory, 'other', 'tplxlam.zip')
+    candidate_path = os.path.join(this_directory, "other", "tplxlam.zip")
 
     if os.path.isfile(candidate_path):
-        logging.info('Found Lambda runner package in {}'.format(candidate_path))
+        logging.info("Found Lambda runner package in {}".format(candidate_path))
         return candidate_path
     return None
 
 
-def setup_aws(aws_access_key=None, aws_secret_key= None,
-              overwrite=True,
-              iam_user=None,
-              lambda_name=None,
-              lambda_role=None,
-              lambda_file=None,
-              region=None,
-              s3_scratch_uri=None,
-              quiet=False
-              ):
+def setup_aws(
+    aws_access_key=None,
+    aws_secret_key=None,
+    overwrite=True,
+    iam_user=None,
+    lambda_name=None,
+    lambda_role=None,
+    lambda_file=None,
+    region=None,
+    s3_scratch_uri=None,
+    quiet=False,
+):
     start_time = time.time()
 
     # detect defaults. Important to do this here, because don't want to always invoke boto3/botocore
@@ -372,19 +443,21 @@ def setup_aws(aws_access_key=None, aws_secret_key= None,
         s3_scratch_uri = default_scratch_dir()
 
     if lambda_file is None:
-        raise Exception('Must specify a lambda runner to upload, i.e. set ' \
-                        'parameter lambda_file=. Please check the REAMDE.md to ' \
-                        ' read about instructions on how to build the lambda runner or visit ' \
-                        'the project website to download prebuilt runners.')
+        raise Exception(
+            "Must specify a lambda runner to upload, i.e. set "
+            "parameter lambda_file=. Please check the README.md to "
+            "read the instructions on how to build the lambda runner or visit "
+            "the project website to download prebuilt runners."
+        )
 
-    assert lambda_file is not None, 'must specify file to upload'
+    assert lambda_file is not None, "must specify file to upload"
 
     # check credentials are existing on machine --> raises exception in case
-    logging.info('Validating AWS credentials')
+    logging.info("Validating AWS credentials")
     check_credentials(aws_access_key, aws_access_key)
 
-    logging.info('Setting up AWS Lambda backend for IAM user {}'.format(iam_user))
-    logging.info('Configuring backend in zone: {}'.format(region))
+    logging.info("Setting up AWS Lambda backend for IAM user {}".format(iam_user))
+    logging.info("Configuring backend in region: {}".format(region))
 
     # check if iam user is found?
# --> skip for now, later properly authenticate using assume_role as described in @@ -392,13 +465,15 @@ def setup_aws(aws_access_key=None, aws_secret_key= None, # create all required client objects for setup # key credentials for clients - client_kwargs = {'aws_access_key_id': aws_access_key, - 'aws_secret_access_key': aws_secret_key, - 'region_name': region} + client_kwargs = { + "aws_access_key_id": aws_access_key, + "aws_secret_access_key": aws_secret_key, + "region_name": region, + } - iam_client = boto3.client('iam', **client_kwargs) - s3_client = boto3.client('s3', **client_kwargs) - lambda_client = boto3.client('lambda', **client_kwargs) + iam_client = boto3.client("iam", **client_kwargs) + s3_client = boto3.client("s3", **client_kwargs) + lambda_client = boto3.client("lambda", **client_kwargs) # Step 1: ensure S3 scratch space exists s3_bucket, s3_key = s3_split_uri(s3_scratch_uri) @@ -408,8 +483,18 @@ def setup_aws(aws_access_key=None, aws_secret_key= None, setup_lambda_role(iam_client, lambda_role, region, overwrite) # Step 3: upload/create Lambda - upload_lambda(iam_client, lambda_client, lambda_name, lambda_role, lambda_file, overwrite, s3_client, s3_scratch_uri, quiet) + upload_lambda( + iam_client, + lambda_client, + lambda_name, + lambda_role, + lambda_file, + overwrite, + s3_client, + s3_scratch_uri, + quiet, + ) # done, print if quiet was not set to False if not quiet: - print('\nCompleted lambda setup in {:.2f}s'.format(time.time() - start_time)) + print("\nCompleted lambda setup in {:.2f}s".format(time.time() - start_time)) diff --git a/tuplex/python/tuplex/exceptions.py b/tuplex/python/tuplex/exceptions.py index ae6abd992..0c50fa997 100644 --- a/tuplex/python/tuplex/exceptions.py +++ b/tuplex/python/tuplex/exceptions.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,7 +7,8 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# + def classToExceptionCode(cls): """ @@ -19,62 +20,63 @@ def classToExceptionCode(cls): """ - lookup = {BaseException : 100, - Exception : 101, - ArithmeticError : 102, - BufferError : 103, - LookupError : 104, - AssertionError : 105, - AttributeError : 106, - EOFError : 107, - GeneratorExit : 108, - ImportError : 109, - ModuleNotFoundError : 110, - IndexError : 111, - KeyError : 112, - KeyboardInterrupt : 113, - MemoryError : 114, - NameError : 115, - NotImplementedError : 116, - OSError : 117, - OverflowError : 118, - RecursionError : 119, - ReferenceError : 120, - RuntimeError : 121, - StopIteration : 122, - StopAsyncIteration : 123, - SyntaxError : 124, - IndentationError : 125, - TabError : 126, - SystemError : 127, - SystemExit : 128, - TypeError : 129, - UnboundLocalError : 130, - UnicodeError : 131, - UnicodeEncodeError : 132, - UnicodeDecodeError : 133, - UnicodeTranslateError : 134, - ValueError : 135, - ZeroDivisionError : 136, - EnvironmentError : 137, - IOError : 138, - BlockingIOError : 139, - ChildProcessError : 140, - ConnectionError : 141, - 
BrokenPipeError : 142, - ConnectionAbortedError : 143, - ConnectionRefusedError : 144, - FileExistsError : 145, - FileNotFoundError : 146, - InterruptedError : 147, - IsADirectoryError : 148, - NotADirectoryError : 149, - PermissionError : 150, - ProcessLookupError : 151, - TimeoutError : 152 - } + lookup = { + BaseException: 100, + Exception: 101, + ArithmeticError: 102, + BufferError: 103, + LookupError: 104, + AssertionError: 105, + AttributeError: 106, + EOFError: 107, + GeneratorExit: 108, + ImportError: 109, + ModuleNotFoundError: 110, + IndexError: 111, + KeyError: 112, + KeyboardInterrupt: 113, + MemoryError: 114, + NameError: 115, + NotImplementedError: 116, + OSError: 117, + OverflowError: 118, + RecursionError: 119, + ReferenceError: 120, + RuntimeError: 121, + StopIteration: 122, + StopAsyncIteration: 123, + SyntaxError: 124, + IndentationError: 125, + TabError: 126, + SystemError: 127, + SystemExit: 128, + TypeError: 129, + UnboundLocalError: 130, + UnicodeError: 131, + UnicodeEncodeError: 132, + UnicodeDecodeError: 133, + UnicodeTranslateError: 134, + ValueError: 135, + ZeroDivisionError: 136, + EnvironmentError: 137, + IOError: 138, + BlockingIOError: 139, + ChildProcessError: 140, + ConnectionError: 141, + BrokenPipeError: 142, + ConnectionAbortedError: 143, + ConnectionRefusedError: 144, + FileExistsError: 145, + FileNotFoundError: 146, + InterruptedError: 147, + IsADirectoryError: 148, + NotADirectoryError: 149, + PermissionError: 150, + ProcessLookupError: 151, + TimeoutError: 152, + } try: return lookup[cls] - except: - return \ No newline at end of file + except KeyError: + return None diff --git a/tuplex/python/tuplex/libexec/__init__.py b/tuplex/python/tuplex/libexec/__init__.py index f768b97bc..3ff8a069c 100644 --- a/tuplex/python/tuplex/libexec/__init__.py +++ b/tuplex/python/tuplex/libexec/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,4 +7,4 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# \ No newline at end of file +# ----------------------------------------------------------------------------------------------------------------------# diff --git a/tuplex/python/tuplex/libexec/_tuplex.py b/tuplex/python/tuplex/libexec/_tuplex.py index f768b97bc..3ff8a069c 100644 --- a/tuplex/python/tuplex/libexec/_tuplex.py +++ b/tuplex/python/tuplex/libexec/_tuplex.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,4 +7,4 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# \ No newline at end of file +# ----------------------------------------------------------------------------------------------------------------------# 
diff --git a/tuplex/python/tuplex/metrics.py b/tuplex/python/tuplex/metrics.py index 19903032f..0bcf3fd34 100644 --- a/tuplex/python/tuplex/metrics.py +++ b/tuplex/python/tuplex/metrics.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,18 +7,23 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# import logging import typing + try: - from .libexec.tuplex import _Context - from .libexec.tuplex import _Metrics + # Module import needed to initialize capture, should revisit. + from .libexec.tuplex import ( + _Context, # noqa: F401 + _Metrics, + ) except ModuleNotFoundError as e: logging.error("need to compiled Tuplex first, details: {}".format(e)) _Metrics = typing.Any import json + class Metrics: """ Stores a reference to the metrics associated with a diff --git a/tuplex/python/tuplex/repl/__init__.py b/tuplex/python/tuplex/repl/__init__.py index 058b111ca..ef1fb8e83 100644 --- a/tuplex/python/tuplex/repl/__init__.py +++ b/tuplex/python/tuplex/repl/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,27 +7,33 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# import os import sys -from tuplex.utils.common import is_in_interactive_mode, in_jupyter_notebook, in_google_colab +from tuplex.utils.common import ( + in_google_colab, + in_jupyter_notebook, + is_in_interactive_mode, +) + try: from tuplex.utils.version import __version__ -except: - __version__ = 'dev' +except (ImportError, NameError): + __version__ = "dev" + def TuplexBanner(): - banner = '''Welcome to\n + banner = """Welcome to\n _____ _ |_ _| _ _ __ | | _____ __ | || | | | '_ \| |/ _ \ \/ / | || |_| | |_) | | __/> < |_| \__,_| .__/|_|\___/_/\_\\ {} |_| - '''.format(__version__) - banner += '\nusing Python {} on {}'.format(sys.version, sys.platform) + """.format(__version__) + banner += "\nusing Python {} on {}".format(sys.version, sys.platform) return banner @@ -36,14 +42,17 @@ def TuplexBanner(): if is_in_interactive_mode() and not in_jupyter_notebook() and not in_google_colab(): from tuplex.utils.interactive_shell import TuplexShell - os.system('clear') - from tuplex.context import Context + os.system("clear") + + # Module import needed to initialize defaults, should revisit. 
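+        # The import is kept for its side effects; "Context" is also the only
+        # name filtered into the interactive shell locals below.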
+ from tuplex.context import Context # noqa: F401 + _locals = locals() - _locals = {key: _locals[key] for key in _locals if key in ['Context']} + _locals = {key: _locals[key] for key in _locals if key in ["Context"]} shell = TuplexShell() shell.init(locals=_locals) - shell.interact(banner=TuplexBanner() + '\n Interactive Shell mode') + shell.interact(banner=TuplexBanner() + "\n Interactive Shell mode") else: - print(TuplexBanner()) \ No newline at end of file + print(TuplexBanner()) diff --git a/tuplex/python/tuplex/utils/__init__.py b/tuplex/python/tuplex/utils/__init__.py index f768b97bc..3ff8a069c 100644 --- a/tuplex/python/tuplex/utils/__init__.py +++ b/tuplex/python/tuplex/utils/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,4 +7,4 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# \ No newline at end of file +# ----------------------------------------------------------------------------------------------------------------------# diff --git a/tuplex/python/tuplex/utils/common.py b/tuplex/python/tuplex/utils/common.py index ea9dcf51e..e0708fc3d 100644 --- a/tuplex/python/tuplex/utils/common.py +++ b/tuplex/python/tuplex/utils/common.py @@ -9,33 +9,26 @@ # License: Apache 2.0 # # ----------------------------------------------------------------------------------------------------------------------# import atexit -import sys import collections - import collections.abc -import pathlib -import signal - -import yaml -import sys -from datetime import datetime - import json -import urllib.request +import logging import os +import pathlib +import re +import shutil import signal -import atexit import socket -import shutil -import psutil import subprocess -import logging -import iso8601 -import re +import sys import tempfile import time -import shlex -import pathlib +import urllib.request +from datetime import datetime + +import iso8601 +import psutil +import yaml try: import pwd @@ -46,8 +39,8 @@ try: from tuplex.utils.version import __version__ -except: - __version__ = 'dev' +except ImportError: + __version__ = "dev" def cmd_exists(cmd): @@ -71,11 +64,14 @@ def is_shared_lib(path): """ # use file command - assert cmd_exists('file') + assert cmd_exists("file") - res = subprocess.check_output(['file', '--mime-type', path]) + res = subprocess.check_output(["file", "--mime-type", path]) mime_type = res.split()[-1].decode() - return mime_type == 'application/x-sharedlib' or mime_type == 'application/x-application' + return ( + mime_type == "application/x-sharedlib" + or mime_type == "application/x-application" + ) def current_timestamp(): @@ -105,7 +101,7 @@ def host_name(): Returns: some hostname as string """ - if socket.gethostname().find('.') >= 0: + if socket.gethostname().find(".") >= 0: return socket.gethostname() else: return socket.gethostbyaddr(socket.gethostname())[0] @@ -122,9 +118,10 @@ def post_json(url, data): """ - params = json.dumps(data).encode('utf8') - req = urllib.request.Request(url, data=params, - headers={'content-type': 'application/json'}) + params = json.dumps(data).encode("utf8") + req = 
urllib.request.Request( + url, data=params, headers={"content-type": "application/json"} + ) response = urllib.request.urlopen(req) return json.loads(response.read()) @@ -139,7 +136,7 @@ def get_json(url, timeout=10): python dictionary of decoded json """ - req = urllib.request.Request(url, headers={'content-type': 'application/json'}) + req = urllib.request.Request(url, headers={"content-type": "application/json"}) response = urllib.request.urlopen(req, timeout=timeout) return json.loads(response.read()) @@ -156,9 +153,9 @@ def in_jupyter_notebook(): try: # get_ipython won't be defined in standard python interpreter shell = get_ipython().__class__.__name__ - if shell == 'ZMQInteractiveShell': + if shell == "ZMQInteractiveShell": return True # Jupyter notebook or qtconsole - elif shell == 'TerminalInteractiveShell': + elif shell == "TerminalInteractiveShell": return False # Terminal running IPython else: return False # Other type (?) @@ -172,23 +169,14 @@ def in_google_colab(): Returns: True if Tuplex is running in Google Colab """ - found_colab_package = False - try: - import google.colab - found_colab_package = True - except: - pass shell_name_matching = False try: - shell_name_matching = 'google.colab' in str(get_ipython()) - except: + shell_name_matching = "google.colab" in str(get_ipython()) + except NameError: pass - if found_colab_package or shell_name_matching: - return True - else: - return False + return shell_name_matching def is_in_interactive_mode(): @@ -198,10 +186,10 @@ def is_in_interactive_mode(): """ - return bool(getattr(sys, 'ps1', sys.flags.interactive)) + return bool(getattr(sys, "ps1", sys.flags.interactive)) -def flatten_dict(d, sep='.', parent_key=''): +def flatten_dict(d, sep=".", parent_key=""): """ flattens a nested dictionary into a flat dictionary by concatenating keys with the separator. Args: d (dict): The dictionary to flatten @@ -222,7 +210,7 @@ def flatten_dict(d, sep='.', parent_key=''): return dict(items) -def unflatten_dict(dictionary, sep='.'): +def unflatten_dict(dictionary, sep="."): """ unflattens a dictionary into a nested dictionary according to sep Args: @@ -265,15 +253,15 @@ def beautify_nesting(d): else: return d - assert isinstance(file_path, str), 'file_path must be instance of str' + assert isinstance(file_path, str), "file_path must be instance of str" - with open(file_path, 'w') as f: - f.write('# Tuplex configuration file\n') - f.write('# created {} UTC\n'.format(datetime.utcnow())) + with open(file_path, "w") as f: + f.write("# Tuplex configuration file\n") + f.write("# created {} UTC\n".format(datetime.utcnow())) out = yaml.dump(beautify_nesting(unflatten_dict(conf))) # pyyaml prints { } around single item dicts. Remove by hand - out = out.replace('{', '').replace('}', '') + out = out.replace("{", "").replace("}", "") f.write(out) @@ -308,17 +296,17 @@ def parse_string(item): return item # do not use bool(...) to convert! 
- if item.lower() == 'true': + if item.lower() == "true": return True - if item.lower() == 'false': + if item.lower() == "false": return False try: return int(item) - except: + except ValueError: pass try: return float(item) - except: + except ValueError: pass return item @@ -352,9 +340,9 @@ def to_nested_dict(obj): resultDict[key] = val return resultDict - assert isinstance(file_path, str), 'file_path must be instance of str' + assert isinstance(file_path, str), "file_path must be instance of str" d = dict() - with open(file_path, 'r') as f: + with open(file_path, "r") as f: confs = list(yaml.safe_load_all(f)) for conf in confs: d.update(to_nested_dict(conf)) @@ -369,7 +357,7 @@ def stringify_dict(d): Returns: dictionary with keys and vals as strs """ - assert isinstance(d, dict), 'd must be a dictionary' + assert isinstance(d, dict), "d must be a dictionary" return {str(key): str(val) for key, val in d.items()} @@ -428,11 +416,13 @@ def logging_callback(level, time_info, logger_name, msg): # fix pathname/lineno if pathname is None: - pathname = '' + pathname = "" if lineno is None: lineno = 0 - log_record = logging.LogRecord(logger_name, level, pathname, lineno, msg, None, None) + log_record = logging.LogRecord( + logger_name, level, pathname, lineno, msg, None, None + ) log_record.created = ct log_record.msecs = (ct - int(ct)) * 1000 log_record.relativeCreated = log_record.created - logging._startTime @@ -462,13 +452,13 @@ def auto_shutdown_all(): for entry in __exit_handlers__: try: name, func, args, msg = entry - logging.debug('Attempting to shutdown {}...'.format(name)) + logging.debug("Attempting to shutdown {}...".format(name)) if msg: logging.info(msg) func(args) - logging.info('Shutdown {} successfully'.format(name)) - except Exception as e: - logging.error('Failed to shutdown {}'.format(name)) + logging.info("Shutdown {} successfully".format(name)) + except Exception: + logging.error("Failed to shutdown {}".format(name)) __exit_handlers__ = [] @@ -500,7 +490,7 @@ def is_process_running(name): return False -def mongodb_uri(mongodb_url, mongodb_port, db_name='tuplex-history'): +def mongodb_uri(mongodb_url, mongodb_port, db_name="tuplex-history"): """ constructs a fully qualified MongoDB URI Args: @@ -511,10 +501,12 @@ def mongodb_uri(mongodb_url, mongodb_port, db_name='tuplex-history'): Returns: string representing MongoDB URI """ - return 'mongodb://{}:{}/{}'.format(mongodb_url, mongodb_port, db_name) + return "mongodb://{}:{}/{}".format(mongodb_url, mongodb_port, db_name) -def check_mongodb_connection(mongodb_url, mongodb_port, db_name='tuplex-history', timeout=10.0): +def check_mongodb_connection( + mongodb_url, mongodb_port, db_name="tuplex-history", timeout=10.0 +): """ connects to a MongoDB database instance, raises exception if connection fails Args: @@ -530,36 +522,46 @@ def check_mongodb_connection(mongodb_url, mongodb_port, db_name='tuplex-history' # check whether one can connect to MongoDB from pymongo import MongoClient - from pymongo.errors import ServerSelectionTimeoutError start_time = time.time() connect_successful = False - logging.debug('Attempting to contact MongoDB under {}'.format(uri)) + logging.debug("Attempting to contact MongoDB under {}".format(uri)) connect_try = 1 while abs(time.time() - start_time) < timeout: - logging.debug('MongoDB connection try {}...'.format(connect_try)) + logging.debug("MongoDB connection try {}...".format(connect_try)) try: # set client connection to super low timeouts so the wait is not too long. 
-            client = MongoClient(uri, serverSelectionTimeoutMS=100, connectTimeoutMS=1000)
-            info = client.server_info() # force a call to mongodb, alternative is client.admin.command('ismaster')
+            client = MongoClient(
+                uri, serverSelectionTimeoutMS=100, connectTimeoutMS=1000
+            )
+            client.server_info()  # force a call to mongodb, alternative is client.admin.command('ismaster')
             connect_successful = True
         except Exception as e:
-            logging.debug('Connection try {} produced {} exception {}'.format(connect_try, type(e), str(e)))
+            logging.debug(
+                "Connection try {} produced {} exception {}".format(
+                    connect_try, type(e), str(e)
+                )
+            )
 
         if connect_successful:
             timeout = 0
             break
 
         time.sleep(0.05)  # sleep for 50ms
-        logging.debug('Contacting MongoDB under {}... -- {:.2f}s of poll time left'.format(uri, timeout - (
-                time.time() - start_time)))
+        logging.debug(
+            "Contacting MongoDB under {}... -- {:.2f}s of poll time left".format(
+                uri, timeout - (time.time() - start_time)
+            )
+        )
         connect_try += 1
 
     if connect_successful is False:
-        raise Exception('Could not connect to MongoDB, check network connection. (ping must be < 100ms)')
+        raise Exception(
+            "Could not connect to MongoDB, check network connection. (ping must be < 100ms)"
+        )
 
-    logging.debug('Connection test to MongoDB succeeded')
+    logging.debug("Connection test to MongoDB succeeded")
 
 
 def shutdown_process_via_kill(pid):
@@ -571,11 +573,17 @@ def shutdown_process_via_kill(pid):
     Returns: None
 
     """
-    logging.debug('Shutting down process PID={}'.format(pid))
+    logging.debug("Shutting down process PID={}".format(pid))
     os.kill(pid, signal.SIGKILL)
 
 
-def find_or_start_mongodb(mongodb_url, mongodb_port, mongodb_datapath, mongodb_logpath, db_name='tuplex-history'):
+def find_or_start_mongodb(
+    mongodb_url,
+    mongodb_port,
+    mongodb_datapath,
+    mongodb_logpath,
+    db_name="tuplex-history",
+):
     """ attempts to connect to a MongoDB database. If no running local MongoDB is found, will auto-start a mongodb
     database. Raises exception when fails.
@@ -591,17 +599,19 @@ def find_or_start_mongodb(mongodb_url, mongodb_port, mongodb_datapath, mongodb_l
 
     # is it localhost?
-    if 'localhost' in mongodb_url:
-        logging.debug('Using local MongoDB instance')
+    if "localhost" in mongodb_url:
+        logging.debug("Using local MongoDB instance")
 
         # first check whether mongod is on path
-        if not cmd_exists('mongod'):
-            raise Exception('MongoDB (mongod) not found on PATH. In order to use Tuplex\'s WebUI, you need MongoDB'
-                            ' installed or point the framework to a running MongoDB instance')
+        if not cmd_exists("mongod"):
+            raise Exception(
+                "MongoDB (mongod) not found on PATH. In order to use Tuplex's WebUI, you need MongoDB"
+                " installed or point the framework to a running MongoDB instance"
+            )
 
         # is mongod running on local machine?
- if is_process_running('mongod'): - logging.debug('Found locally running MongoDB daemon process') + if is_process_running("mongod"): + logging.debug("Found locally running MongoDB daemon process") # process is running, try to connect check_mongodb_connection(mongodb_url, mongodb_port, db_name) @@ -614,11 +624,23 @@ def find_or_start_mongodb(mongodb_url, mongodb_port, mongodb_datapath, mongodb_l # startup via mongod --fork --logpath /var/log/mongodb/mongod.log --port 1234 --dbpath try: - cmd = ['mongod', '--fork', '--logpath', str(mongodb_logpath), '--port', str(mongodb_port), '--dbpath', - str(mongodb_datapath)] - - logging.debug('starting MongoDB daemon process via {}'.format(' '.join(cmd))) - process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + cmd = [ + "mongod", + "--fork", + "--logpath", + str(mongodb_logpath), + "--port", + str(mongodb_port), + "--dbpath", + str(mongodb_datapath), + ] + + logging.debug( + "starting MongoDB daemon process via {}".format(" ".join(cmd)) + ) + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) short_timeout = 2.5 max_mongodb_timeout = 10 # maximum timeout is 10s @@ -628,50 +650,70 @@ def find_or_start_mongodb(mongodb_url, mongodb_port, mongodb_datapath, mongodb_l except subprocess.TimeoutExpired: # try now with more time (up to max) logging.info( - "Could not start MongoDB daemon process in {}s, trying with timeout={}s".format(short_timeout, - max_mongodb_timeout)) - p_stdout, p_stderr = process.communicate(timeout=max_mongodb_timeout) + "Could not start MongoDB daemon process in {}s, trying with timeout={}s".format( + short_timeout, max_mongodb_timeout + ) + ) + p_stdout, p_stderr = process.communicate( + timeout=max_mongodb_timeout + ) # decode p_stdout = p_stdout.decode() p_stderr = p_stderr.decode() if len(p_stderr.strip()) > 0: - raise Exception('mongod produced following errors: {}'.format(p_stderr)) + raise Exception( + "mongod produced following errors: {}".format(p_stderr) + ) # find mongod pid - m = re.search(r'forked process: (\d+)', p_stdout) - assert m is not None, 'Could not find Child process ID when starting MongoDB' + m = re.search(r"forked process: (\d+)", p_stdout) + assert m is not None, ( + "Could not find Child process ID when starting MongoDB" + ) mongo_pid = int(m[1]) - logging.debug('MongoDB Daemon PID={}'.format(mongo_pid)) + logging.debug("MongoDB Daemon PID={}".format(mongo_pid)) # add a new shutdown func for mongod - register_auto_shutdown('mongod', shutdown_process_via_kill, mongo_pid) + register_auto_shutdown("mongod", shutdown_process_via_kill, mongo_pid) except Exception as e: - logging.error('Failed to start MongoDB daemon. Details: {}'.format(str(e))) + logging.error( + "Failed to start MongoDB daemon. 
Details: {}".format(str(e)) + ) # print out first 10 and last 10 lines of mongodb log if exists n_to_print = 15 mongodb_logpath = str(mongodb_logpath) if os.path.isfile(mongodb_logpath): - with open(mongodb_logpath, 'r') as fp_mongo: - lines = list(map(lambda line: line.strip(), fp_mongo.readlines())) - shortened_log = '' + with open(mongodb_logpath, "r") as fp_mongo: + lines = list( + map(lambda line: line.strip(), fp_mongo.readlines()) + ) + shortened_log = "" if len(lines) > 2 * n_to_print: - shortened_log = '\n'.join(lines[:n_to_print]) + '...\n' + '\n'.join(lines[-n_to_print:]) + shortened_log = ( + "\n".join(lines[:n_to_print]) + + "...\n" + + "\n".join(lines[-n_to_print:]) + ) else: - shortened_log = '\n'.join(lines) - logging.error('MongoDB daemon log:\n{}'.format(shortened_log)) + shortened_log = "\n".join(lines) + logging.error("MongoDB daemon log:\n{}".format(shortened_log)) else: - logging.error('Could not find MongoDB log under {}. Permission error?'.format(mongodb_logpath)) + logging.error( + "Could not find MongoDB log under {}. Permission error?".format( + mongodb_logpath + ) + ) raise e logging.debug("Attempting to connect to freshly started MongoDB daemon...") check_mongodb_connection(mongodb_url, mongodb_port, db_name) else: # remote MongoDB - logging.debug('Connecting to remote MongoDB instance') + logging.debug("Connecting to remote MongoDB instance") check_mongodb_connection(mongodb_url, mongodb_port, db_name) @@ -686,12 +728,15 @@ def log_gunicorn_errors(logpath): """ # parse log, check whether there's any line where [ERROR] is contined - with open(logpath, 'r') as fp: + with open(logpath, "r") as fp: lines = fp.readlines() - indices = map(lambda t: t[1], filter(lambda t: '[ERROR]' in t[0], zip(lines, range(len(lines))))) + indices = map( + lambda t: t[1], + filter(lambda t: "[ERROR]" in t[0], zip(lines, range(len(lines)))), + ) if indices: first_idx = min(indices) - logging.error('Gunicorn error log:\n {}'.format(''.join(lines[first_idx:]))) + logging.error("Gunicorn error log:\n {}".format("".join(lines[first_idx:]))) def find_or_start_webui(mongo_uri, hostname, port, web_logfile): @@ -706,27 +751,34 @@ def find_or_start_webui(mongo_uri, hostname, port, web_logfile): Returns: None, raises exceptions on failure """ - version_endpoint = '/api/version' # use this to connect and trigger WebUI connection + version_endpoint = ( + "/api/version" # use this to connect and trigger WebUI connection + ) - if not hostname.startswith('http://') and not hostname.startswith('https://'): - hostname = 'http://' + str(hostname) + if not hostname.startswith("http://") and not hostname.startswith("https://"): + hostname = "http://" + str(hostname) - base_uri = '{}:{}'.format(hostname, port) + base_uri = "{}:{}".format(hostname, port) version_info = None try: version_info = get_json(base_uri + version_endpoint) - except Exception as err: - logging.debug("Couldn't connect to {}, starting WebUI...".format(base_uri + version_endpoint)) + except Exception: + logging.debug( + "Couldn't connect to {}, starting WebUI...".format( + base_uri + version_endpoint + ) + ) if version_info is not None: # check version compatibility return version_info else: # start WebUI up! - if not cmd_exists('gunicorn'): + if not cmd_exists("gunicorn"): raise Exception( - 'Tuplex uses per default gunicorn with eventlet to run the WebUI. Please install via `pip3 install "gunicorn[eventlet]"` or add to PATH') + 'Tuplex uses per default gunicorn with eventlet to run the WebUI. 
Please install via `pip3 install "gunicorn[eventlet]"` or add to PATH' + ) # command for this is: # env MONGO_URI=$MONGO_URI gunicorn --daemon --worker-class eventlet --log-file $GUNICORN_LOGFILE -b $HOST:$PORT thserver:app @@ -734,41 +786,52 @@ def find_or_start_webui(mongo_uri, hostname, port, web_logfile): # directory needs to be the one where the history server is located in! # ==> from structure of file we can infer that dir_path = os.path.dirname(os.path.realpath(__file__)) - assert dir_path.endswith(os.path.join('tuplex', 'utils')), 'folder structure changed. Need to fix.' + assert dir_path.endswith(os.path.join("tuplex", "utils")), ( + "folder structure changed. Need to fix." + ) # get tuplex base dir tuplex_basedir = pathlib.Path(dir_path).parent # two options: Could be dev install or site-packages install, therefore check two folders - if not os.path.isdir(os.path.join(tuplex_basedir, 'historyserver', 'thserver')): + if not os.path.isdir(os.path.join(tuplex_basedir, "historyserver", "thserver")): # dev install or somehow different folder structure? # --> try to find root tuplex folder containing historyserver folder! path = pathlib.Path(tuplex_basedir) while path.parent != path: # check in path - if 'tuplex' in os.listdir(path) and 'historyserver' in os.listdir(os.path.join(path, 'tuplex')): - tuplex_basedir = os.path.join(str(path), 'tuplex') - logging.debug('Detected Tuplex rootfolder (dev) to be {}'.format(tuplex_basedir)) + if "tuplex" in os.listdir(path) and "historyserver" in os.listdir( + os.path.join(path, "tuplex") + ): + tuplex_basedir = os.path.join(str(path), "tuplex") + logging.debug( + "Detected Tuplex rootfolder (dev) to be {}".format( + tuplex_basedir + ) + ) break path = path.parent # check dir historyserver/thserver exists! - assert os.path.isdir(os.path.join(tuplex_basedir, 'historyserver', - 'thserver')), 'could not find Tuplex WebUI WebApp in {}'.format( - tuplex_basedir) - assert os.path.isfile(os.path.join(tuplex_basedir, 'historyserver', 'thserver', - '__init__.py')), 'could not find Tuplex WebUI __init__.py file in thserver folder' + assert os.path.isdir( + os.path.join(tuplex_basedir, "historyserver", "thserver") + ), "could not find Tuplex WebUI WebApp in {}".format(tuplex_basedir) + assert os.path.isfile( + os.path.join(tuplex_basedir, "historyserver", "thserver", "__init__.py") + ), "could not find Tuplex WebUI __init__.py file in thserver folder" # history server dir to use to start gunicorn - ui_basedir = os.path.join(tuplex_basedir, 'historyserver') - logging.debug('Launching gunicorn from {}'.format(ui_basedir)) + ui_basedir = os.path.join(tuplex_basedir, "historyserver") + logging.debug("Launching gunicorn from {}".format(ui_basedir)) # create temp PID file to get process ID to shutdown auto-started WebUI PID_FILE = tempfile.NamedTemporaryFile(delete=False).name ui_env = os.environ - ui_env['MONGO_URI'] = mongo_uri - gunicorn_host = '{}:{}'.format(hostname.replace('http://', '').replace('https://', ''), port) + ui_env["MONGO_URI"] = mongo_uri + gunicorn_host = "{}:{}".format( + hostname.replace("http://", "").replace("https://", ""), port + ) # need to convert everything to absolute paths (b.c. 
gunicorn fails else) web_logfile = os.path.abspath(web_logfile) @@ -778,14 +841,33 @@ def find_or_start_webui(mongo_uri, hostname, port, web_logfile): wl_path = pathlib.Path(web_logfile).parent os.makedirs(str(wl_path), exist_ok=True) except Exception as e: - logging.error("ensuring parent dir of {} exists, failed with {}".format(web_logfile, e)) - - cmd = ['gunicorn', '--daemon', '--worker-class', 'eventlet', '--chdir', ui_basedir, '--pid', PID_FILE, - '--log-file', web_logfile, '-b', gunicorn_host, 'thserver:app'] - - logging.debug('Starting gunicorn with command: {}'.format(' '.join(cmd))) - - process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=ui_env) + logging.error( + "ensuring parent dir of {} exists, failed with {}".format( + web_logfile, e + ) + ) + + cmd = [ + "gunicorn", + "--daemon", + "--worker-class", + "eventlet", + "--chdir", + ui_basedir, + "--pid", + PID_FILE, + "--log-file", + web_logfile, + "-b", + gunicorn_host, + "thserver:app", + ] + + logging.debug("Starting gunicorn with command: {}".format(" ".join(cmd))) + + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=ui_env + ) # set a timeout of 2 seconds to keep everything interactive p_stdout, p_stderr = process.communicate(timeout=2) @@ -794,9 +876,9 @@ def find_or_start_webui(mongo_uri, hostname, port, web_logfile): p_stderr = p_stderr.decode() if len(p_stderr.strip()) > 0: - raise Exception('gunicorn produced following errors: {}'.format(p_stderr)) + raise Exception("gunicorn produced following errors: {}".format(p_stderr)) - logging.info('Gunicorn locally started...') + logging.info("Gunicorn locally started...") # find out process id of gunicorn ui_pid = None @@ -809,23 +891,40 @@ def find_or_start_webui(mongo_uri, hostname, port, web_logfile): time.sleep(0.05) # sleep for 50ms else: break - logging.debug('Polling for Gunicorn PID... -- {:.2f}s of poll time left'.format( - TIME_LIMIT - (time.time() - start_time))) + logging.debug( + "Polling for Gunicorn PID... -- {:.2f}s of poll time left".format( + TIME_LIMIT - (time.time() - start_time) + ) + ) ui_pid = None try: # Read PID file - with open(PID_FILE, 'r') as fp: + with open(PID_FILE, "r") as fp: ui_pid = int(fp.read()) except Exception as e: logging.debug("failed to retrieve PID for WebUI, details: {}".format(e)) - non_daemon_log = 'timeout - no log' + non_daemon_log = "timeout - no log" # something went wrong with starting gunicorn. 
Try to capture some meaningful output and abort try: - cmd = ['gunicorn', '--worker-class', 'eventlet', '--chdir', ui_basedir, '--pid', PID_FILE, - '--log-file', '-', '-b', gunicorn_host, 'thserver:app'] - process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=ui_env) + cmd = [ + "gunicorn", + "--worker-class", + "eventlet", + "--chdir", + ui_basedir, + "--pid", + PID_FILE, + "--log-file", + "-", + "-b", + gunicorn_host, + "thserver:app", + ] + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=ui_env + ) # set a timeout of 5 seconds to keep everything interactive p_stdout, p_stderr = process.communicate(timeout=5) @@ -833,36 +932,45 @@ def find_or_start_webui(mongo_uri, hostname, port, web_logfile): p_stdout = p_stdout.decode() p_stderr = p_stderr.decode() - non_daemon_log = p_stdout + '\n' + p_stderr + non_daemon_log = p_stdout + "\n" + p_stderr except subprocess.TimeoutExpired: pass - logging.error('Gunicorn process log:\n' + non_daemon_log) - raise Exception("Failed to start gunicorn daemon, non-daemon run yielded:\n{}".format(non_daemon_log)) + logging.error("Gunicorn process log:\n" + non_daemon_log) + raise Exception( + "Failed to start gunicorn daemon, non-daemon run yielded:\n{}".format( + non_daemon_log + ) + ) - assert ui_pid is not None, 'Invalid PID for WebUI' - logging.info('Gunicorn PID={}'.format(ui_pid)) + assert ui_pid is not None, "Invalid PID for WebUI" + logging.info("Gunicorn PID={}".format(ui_pid)) # register daemon shutdown - logging.debug('Adding auto-shutdown of process with PID={} (WebUI)'.format(ui_pid)) + logging.debug( + "Adding auto-shutdown of process with PID={} (WebUI)".format(ui_pid) + ) def shutdown_gunicorn(pid): - pids_to_kill = [] # iterate over all gunicorn processes and kill them all for proc in psutil.process_iter(): try: # Get process name & pid from process object. - process_name = proc.name() - process_id = proc.pid - - sep_line = '|'.join(proc.cmdline()).lower() - if 'gunicorn' in sep_line: - + sep_line = "|".join(proc.cmdline()).lower() + if "gunicorn" in sep_line: # check whether that gunicorn instance matches what has been started - if 'thserver:app' in proc.cmdline() and gunicorn_host in proc.cmdline() and PID_FILE in proc.cmdline(): + if ( + "thserver:app" in proc.cmdline() + and gunicorn_host in proc.cmdline() + and PID_FILE in proc.cmdline() + ): pids_to_kill.append(proc.pid) - except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + except ( + psutil.NoSuchProcess, + psutil.AccessDenied, + psutil.ZombieProcess, + ): pass # kill all gunicorn processes @@ -870,14 +978,14 @@ def shutdown_gunicorn(pid): os.kill(pid, signal.SIGQUIT) os.kill(pid, signal.SIGKILL) os.kill(pid, signal.SIGTERM) - logging.debug('Shutdown gunicorn worker with PID={}'.format(pid)) - logging.debug('Shutdown gunicorn with PID={}'.format(pid)) + logging.debug("Shutdown gunicorn worker with PID={}".format(pid)) + logging.debug("Shutdown gunicorn with PID={}".format(pid)) - register_auto_shutdown('gunicorn', shutdown_gunicorn, ui_pid) + register_auto_shutdown("gunicorn", shutdown_gunicorn, ui_pid) version_info = get_json(base_uri + version_endpoint) if version_info is None: - raise Exception('Could not retrieve version info from WebUI') + raise Exception("Could not retrieve version info from WebUI") # perform checks (same MongoDB URI? Same Version?) 
return version_info @@ -901,42 +1009,57 @@ def ensure_webui(options): # {"tuplex.webui.mongodb.port", "27017"}, # {"tuplex.webui.mongodb.path", temp_mongodb_path} - assert options['tuplex.webui.enable'] is True, 'only call ensure webui when webui option is true' - - mongodb_url = options['tuplex.webui.mongodb.url'] - mongodb_port = options['tuplex.webui.mongodb.port'] - mongodb_datapath = os.path.join(options['tuplex.scratchDir'], 'webui', 'data') - mongodb_logpath = os.path.join(options['tuplex.scratchDir'], 'webui', 'logs', 'mongod.log') - gunicorn_logpath = os.path.join(options['tuplex.scratchDir'], 'webui', 'logs', 'gunicorn.log') - webui_url = options['tuplex.webui.url'] - webui_port = options['tuplex.webui.port'] + assert options["tuplex.webui.enable"] is True, ( + "only call ensure webui when webui option is true" + ) + + mongodb_url = options["tuplex.webui.mongodb.url"] + mongodb_port = options["tuplex.webui.mongodb.port"] + mongodb_datapath = os.path.join(options["tuplex.scratchDir"], "webui", "data") + mongodb_logpath = os.path.join( + options["tuplex.scratchDir"], "webui", "logs", "mongod.log" + ) + gunicorn_logpath = os.path.join( + options["tuplex.scratchDir"], "webui", "logs", "gunicorn.log" + ) + webui_url = options["tuplex.webui.url"] + webui_port = options["tuplex.webui.port"] try: - logging.debug('finding MongoDB...') - find_or_start_mongodb(mongodb_url, mongodb_port, mongodb_datapath, mongodb_logpath) + logging.debug("finding MongoDB...") + find_or_start_mongodb( + mongodb_url, mongodb_port, mongodb_datapath, mongodb_logpath + ) mongo_uri = mongodb_uri(mongodb_url, mongodb_port) - logging.debug('finding WebUI..') + logging.debug("finding WebUI..") # now it's time to do the same thing for the WebUI (and also check it's version v.s. the current one!) - version_info = find_or_start_webui(mongo_uri, webui_url, webui_port, gunicorn_logpath) + version_info = find_or_start_webui( + mongo_uri, webui_url, webui_port, gunicorn_logpath + ) - logging.debug('WebUI services found or started!') + logging.debug("WebUI services found or started!") # check that version of WebUI and Tuplex version match # exclude dev versions, i.e. silence warning there. - if 'dev' not in __version__ and version_info['version'] != __version__: - logging.warning('Version of Tuplex WebUI ({}) and Tuplex ({}) do not match.'.format(version_info['version'], - __version__)) + if "dev" not in __version__ and version_info["version"] != __version__: + logging.warning( + "Version of Tuplex WebUI ({}) and Tuplex ({}) do not match.".format( + version_info["version"], __version__ + ) + ) # all good, print out link so user can access WebUI easily - webui_uri = webui_url + ':' + str(webui_port) - if not webui_uri.startswith('http'): - webui_uri = 'http://' + webui_uri - print('Tuplex WebUI can be accessed under {}'.format(webui_uri)) + webui_uri = webui_url + ":" + str(webui_port) + if not webui_uri.startswith("http"): + webui_uri = "http://" + webui_uri + print("Tuplex WebUI can be accessed under {}".format(webui_uri)) except Exception as e: - logging.error('Failed to start or connect to Tuplex WebUI. Details: {}'.format(e)) + logging.error( + "Failed to start or connect to Tuplex WebUI. 
Details: {}".format(e) + ) # log gunicorn errors for local startup - if os.path.isfile(gunicorn_logpath) and 'localhost' == webui_url: + if os.path.isfile(gunicorn_logpath) and "localhost" == webui_url: log_gunicorn_errors(gunicorn_logpath) diff --git a/tuplex/python/tuplex/utils/errors.py b/tuplex/python/tuplex/utils/errors.py index a05d2f2c6..315ca7317 100644 --- a/tuplex/python/tuplex/utils/errors.py +++ b/tuplex/python/tuplex/utils/errors.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,10 +7,12 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# + class TuplexException(Exception): """ Base class for exceptions across the fraemwork """ - pass \ No newline at end of file + + pass diff --git a/tuplex/python/tuplex/utils/framework.py b/tuplex/python/tuplex/utils/framework.py index d5d36d225..34a3ff649 100644 --- a/tuplex/python/tuplex/utils/framework.py +++ b/tuplex/python/tuplex/utils/framework.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,13 +7,16 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 8/3/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # this file contains Framework specific exceptions class TuplexException(Exception): """Base Exception class on which all Tuplex Framework specific exceptions are based""" + pass + class UDFCodeExtractionError(TuplexException): """thrown when UDF code extraction/reflection failed""" - pass \ No newline at end of file + + pass diff --git a/tuplex/python/tuplex/utils/globs.py b/tuplex/python/tuplex/utils/globs.py index 9fba0e9ed..f938e5035 100644 --- a/tuplex/python/tuplex/utils/globs.py +++ b/tuplex/python/tuplex/utils/globs.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,40 +7,36 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# -import types -import inspect -import re -import ast -import weakref import dis 
-import opcode -import types import itertools +import opcode import sys +import types +import weakref + # ALWAYS import cloudpickle before dill, b.c. of https://github.com/uqfoundation/dill/issues/383 from cloudpickle.cloudpickle import _get_cell_contents -import dill # from cloudpickle # ---------------- _extract_code_globals_cache = weakref.WeakKeyDictionary() # relevant opcodes -STORE_GLOBAL = opcode.opmap['STORE_GLOBAL'] -DELETE_GLOBAL = opcode.opmap['DELETE_GLOBAL'] -LOAD_GLOBAL = opcode.opmap['LOAD_GLOBAL'] +STORE_GLOBAL = opcode.opmap["STORE_GLOBAL"] +DELETE_GLOBAL = opcode.opmap["DELETE_GLOBAL"] +LOAD_GLOBAL = opcode.opmap["LOAD_GLOBAL"] GLOBAL_OPS = (STORE_GLOBAL, DELETE_GLOBAL, LOAD_GLOBAL) HAVE_ARGUMENT = dis.HAVE_ARGUMENT EXTENDED_ARG = dis.EXTENDED_ARG + def _extract_code_globals(co): """ Find all globals names read or written to by codeblock co """ out_names = _extract_code_globals_cache.get(co) if out_names is None: - names = co.co_names out_names = {opargval: None for opi, opargval in _walk_global_ops(co)} # Declaring a function inside another one using the "def ..." @@ -57,6 +53,8 @@ def _extract_code_globals(co): _extract_code_globals_cache[co] = out_names return out_names + + def _find_imported_submodules(code, top_level_dependencies): """ Find currently imported submodules used by a function. @@ -85,10 +83,13 @@ def func(): subimports = [] # check if any known dependency is an imported package for x in top_level_dependencies: - if (isinstance(x, types.ModuleType) and - hasattr(x, '__package__') and x.__package__): + if ( + isinstance(x, types.ModuleType) + and hasattr(x, "__package__") + and x.__package__ + ): # check if the package has any currently loaded sub-imports - prefix = x.__name__ + '.' + prefix = x.__name__ + "." # A concurrent thread could mutate sys.modules, # make sure we iterate over a copy to avoid exceptions for name in list(sys.modules): @@ -96,7 +97,7 @@ def func(): # sys.modules. if name is not None and name.startswith(prefix): # check whether the function can address the sub-module - tokens = set(name[len(prefix):].split('.')) + tokens = set(name[len(prefix) :].split(".")) if not tokens - set(code.co_names): subimports.append(sys.modules[name]) return subimports @@ -132,12 +133,12 @@ def _function_getstate(func): } f_globals_ref = _extract_code_globals(func.__code__) - f_globals = {k: func.__globals__[k] for k in f_globals_ref if k in - func.__globals__} + f_globals = {k: func.__globals__[k] for k in f_globals_ref if k in func.__globals__} closure_values = ( list(map(_get_cell_contents, func.__closure__)) - if func.__closure__ is not None else () + if func.__closure__ is not None + else () ) # Extract currently-imported submodules used by func. Storing these modules @@ -145,29 +146,33 @@ def _function_getstate(func): # trigger the side effect of importing these modules at unpickling time # (which is necessary for func to work correctly once depickled) slotstate["_cloudpickle_submodules"] = _find_imported_submodules( - func.__code__, itertools.chain(f_globals.values(), closure_values)) + func.__code__, itertools.chain(f_globals.values(), closure_values) + ) slotstate["__globals__"] = f_globals - - # add free vars to slotstate by decoding closure - try: - slotstate['__freevars__'] = {name: closure_values[i] for i, name in enumerate(func.__code__.co_freevars)} - except: - slotstate['__freevars__'] = {} + # Add free vars to slotstate by decoding closure. 
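+    # Note: co_freevars and __closure__ cells are index-aligned in CPython,
+    # so pairing names with cell contents via enumerate() is safe here.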
+ slotstate["__freevars__"] = { + name: closure_values[i] for i, name in enumerate(func.__code__.co_freevars) + } state = func.__dict__ return state, slotstate + + # -------------------- # end from cloudpickle + def get_globals(func): _, d = _function_getstate(func) - func_globals = d['__globals__'] - func_freevars = d['__freevars__'] + func_globals = d["__globals__"] + func_freevars = d["__freevars__"] # unify free vars with globals if len(set(func_globals.keys()).intersection(set(func_freevars.keys()))) != 0: - raise Exception('internal error, overlap between globals and freevars, should not occur.') + raise Exception( + "internal error, overlap between globals and freevars, should not occur." + ) # add free vars to global dict to have everything in one dict. func_globals.update(func_freevars) diff --git a/tuplex/python/tuplex/utils/interactive_shell.py b/tuplex/python/tuplex/utils/interactive_shell.py index 4d432b4c4..56a929b02 100644 --- a/tuplex/python/tuplex/utils/interactive_shell.py +++ b/tuplex/python/tuplex/utils/interactive_shell.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,16 +7,19 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# from __future__ import unicode_literals +import logging import os -import sys import re -import logging +import sys from code import InteractiveConsole +from types import FunctionType, LambdaType + from prompt_toolkit.history import InMemoryHistory + # old version: 1.0 # from prompt_toolkit.layout.lexers import PygmentsLexer # from prompt_toolkit.styles import style_from_pygments @@ -29,10 +32,10 @@ from prompt_toolkit.styles.pygments import style_from_pygments_cls from pygments.lexers import Python3Lexer from pygments.styles import get_style_by_name + +from tuplex.utils.globs import get_globals from tuplex.utils.jedi_completer import JediCompleter from tuplex.utils.source_vault import SourceVault -from types import LambdaType, FunctionType -from tuplex.utils.globs import get_globals # this is a helper to allow for tuplex.Context syntax @@ -46,27 +49,31 @@ def __init__(self, context_cls): def Context(self): return self._context_cls + # Interactive shell # check https://github.com/python/cpython/blob/master/Lib/code.py for overwriting this class class TuplexShell(InteractiveConsole): - # use BORG design pattern to make class singleton alike __shared_state = {} def __init__(self): self.__dict__ = self.__shared_state - def init(self, locals=None, filename="", histfile=os.path.expanduser("~/.console_history")): - + def init( + self, + locals=None, + filename="", + histfile=os.path.expanduser("~/.console_history"), + ): # add dummy helper for context - if locals is not None and 'Context' in locals.keys(): - locals['tuplex'] = TuplexModuleHelper(locals['Context']) + if locals is not None and "Context" in locals.keys(): + locals["tuplex"] = TuplexModuleHelper(locals["Context"]) self.initialized = True self.filename = "console-0" self.lineno = 0 
InteractiveConsole.__init__(self, locals, self.filename) - self._lastLine = '' + self._lastLine = "" self.historyDict = {} def push(self, line): @@ -81,7 +88,7 @@ def push(self, line): value is 1 if more input is required, 0 if the line was dealt with in some way (this is the same as runsource()). """ - assert self.initialized, 'must call init on TuplexShell object first' + assert self.initialized, "must call init on TuplexShell object first" self.buffer.append(line) source = "\n".join(self.buffer) @@ -99,18 +106,17 @@ def push(self, line): self.historyDict[self.filename] = self.buffer.copy() # new filename - self.filename = 'console-{}'.format(self.lineno) + self.filename = "console-{}".format(self.lineno) self.resetbuffer() return more - def get_lambda_source(self, f): # Won't this work for functions as well? - assert self.initialized, 'must call init on TuplexShell object first' + assert self.initialized, "must call init on TuplexShell object first" - assert isinstance(f, LambdaType), 'object needs to be a lambda object' + assert isinstance(f, LambdaType), "object needs to be a lambda object" vault = SourceVault() @@ -118,40 +124,48 @@ def get_lambda_source(self, f): f_globs = get_globals(f) f_filename = f.__code__.co_filename f_lineno = f.__code__.co_firstlineno - f_colno = f.__code__.co_firstcolno if hasattr(f.__code__, 'co_firstcolno') else None + f_colno = ( + f.__code__.co_firstcolno if hasattr(f.__code__, "co_firstcolno") else None + ) # get source from history # Note: because firstlineno is 1-indexed, add a dummy line so everything works. - src_info = (['dummy'] + self.historyDict[f_filename], 0) + src_info = (["dummy"] + self.historyDict[f_filename], 0) vault.extractAndPutAllLambdas(src_info, f_filename, f_lineno, f_colno, f_globs) return vault.get(f, f_filename, f_lineno, f_colno, f_globs) def get_function_source(self, f): + assert self.initialized, "must call init on TuplexShell object first" - assert self.initialized, 'must call init on TuplexShell object first' + assert isinstance(f, FunctionType) and f.__code__.co_name != "", ( + "object needs to be a function (non-lambda) object" + ) - assert isinstance(f, - FunctionType) and f.__code__.co_name != '', 'object needs to be a function (non-lambda) object' - - # fetch all data - f_globs = get_globals(f) + # Fetch all data: f_filename = f.__code__.co_filename - f_lineno = f.__code__.co_firstlineno - f_colno = f.__code__.co_firstcolno if hasattr(f.__code__, 'co_firstcolno') else None + + # # TODO: Include lineno/colno information in AST. + # f_globs = get_globals(f) + # f_lineno = f.__code__.co_firstlineno + # f_colno = ( + # f.__code__.co_firstcolno if hasattr(f.__code__, "co_firstcolno") else None + # ) # retrieve func source from historyDict lines = self.historyDict[f_filename] # check whether def is found in here - source = '\n'.join(lines).strip() + source = "\n".join(lines).strip() function_name = f.__code__.co_name regex = r"def\s*{}\(.*\)\s*:[\t ]*\n".format(function_name) prog = re.compile(regex) if not prog.search(source): - logging.error('Could not find function "{}" in source'.format(function_name)) + logging.error( + 'Could not find function "{}" in source'.format(function_name) + ) return None return source @@ -178,7 +192,7 @@ def interact(self, banner=None, exitmsg=None): style_trafo = None # check if env TUPLEX_COLORSCHEME is set, then pygments style may be used. 
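# A sketch of the lookup this enables, assuming a valid pygments style name
# (e.g. "monokai") is exported before launching the shell:
#
#   export TUPLEX_COLORSCHEME=monokai
#
# the name is then resolved via the helpers imported at the top of this module,
# roughly:
#
#   style = style_from_pygments_cls(get_style_by_name(scheme))
#
# where get_style_by_name raises pygments.util.ClassNotFound for unknown names.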
- scheme = os.environ.get('TUPLEX_COLORSCHEME', None) + scheme = os.environ.get("TUPLEX_COLORSCHEME", None) if scheme: # define here style for python prompt toolkit @@ -199,9 +213,10 @@ def interact(self, banner=None, exitmsg=None): sys.ps2 = "... " cprt = 'Type "help", "copyright", "credits" or "license" for more information.' if banner is None: - self.write("Python %s on %s\n%s\n(%s)\n" % - (sys.version, sys.platform, cprt, - self.__class__.__name__)) + self.write( + "Python %s on %s\n%s\n(%s)\n" + % (sys.version, sys.platform, cprt, self.__class__.__name__) + ) elif banner: self.write("%s\n" % str(banner)) more = 0 @@ -214,18 +229,22 @@ def interact(self, banner=None, exitmsg=None): try: # use prompt toolkit here for more stylish input & tab completion # raw python prompt - #line = self.raw_input(prompt) - + # line = self.raw_input(prompt) # look here http://python-prompt-toolkit.readthedocs.io/en/stable/pages/asking_for_input.html#hello-world # on how to style the prompt better # use patch_stdout=True to output stuff above prompt - line = ptprompt(prompt, lexer=PygmentsLexer(Python3Lexer), style=style, - style_transformation=style_trafo, history=history, - completer=JediCompleter(lambda: self.locals), - complete_style=CompleteStyle.READLINE_LIKE, - complete_while_typing=False) + line = ptprompt( + prompt, + lexer=PygmentsLexer(Python3Lexer), + style=style, + style_transformation=style_trafo, + history=history, + completer=JediCompleter(lambda: self.locals), + complete_style=CompleteStyle.READLINE_LIKE, + complete_while_typing=False, + ) except EOFError: self.write("\n") @@ -237,6 +256,6 @@ def interact(self, banner=None, exitmsg=None): self.resetbuffer() more = 0 if exitmsg is None: - self.write('now exiting %s...\n' % self.__class__.__name__) - elif exitmsg != '': - self.write('%s\n' % exitmsg) \ No newline at end of file + self.write("now exiting %s...\n" % self.__class__.__name__) + elif exitmsg != "": + self.write("%s\n" % exitmsg) diff --git a/tuplex/python/tuplex/utils/jedi_completer.py b/tuplex/python/tuplex/utils/jedi_completer.py index f5f1bb517..deecf8517 100644 --- a/tuplex/python/tuplex/utils/jedi_completer.py +++ b/tuplex/python/tuplex/utils/jedi_completer.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,18 +7,16 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# +from jedi import Interpreter, settings from prompt_toolkit.completion import Completer, Completion -import jedi -from jedi import Interpreter -from jedi import settings + class JediCompleter(Completer): """REPL Completer using jedi""" def __init__(self, get_locals): - # per default jedi is case insensitive, however we want it to be case sensitive settings.case_insensitive_completion = False @@ -30,20 +28,20 @@ def get_completions(self, document, complete_event): # Jedi API changed, reflect this here completions = [] - if hasattr(interpreter, 'completions'): + if hasattr(interpreter, "completions"): completions = 
interpreter.completions() - elif hasattr(interpreter, 'complete'): + elif hasattr(interpreter, "complete"): completions = interpreter.complete() else: - raise Exception('Unknown Jedi API, please update or install older version (0.18)') + raise Exception( + "Unknown Jedi API, please update or install older version (0.18)" + ) for completion in completions: - - if completion.name_with_symbols.startswith('_'): + if completion.name_with_symbols.startswith("_"): continue - if len(document.text) > len(completion.name_with_symbols) - len(completion.complete): - last_char = document.text[len(completion.complete) - len(completion.name_with_symbols) - 1] - else: - last_char = None - yield Completion(completion.name_with_symbols, len(completion.complete) - len(completion.name_with_symbols)) \ No newline at end of file + yield Completion( + completion.name_with_symbols, + len(completion.complete) - len(completion.name_with_symbols), + ) diff --git a/tuplex/python/tuplex/utils/jupyter.py b/tuplex/python/tuplex/utils/jupyter.py index f2651ab88..40fa34f70 100644 --- a/tuplex/python/tuplex/utils/jupyter.py +++ b/tuplex/python/tuplex/utils/jupyter.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,45 +7,61 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# import json import os.path import re -import ipykernel import urllib.request from urllib.parse import urljoin + +import ipykernel from notebook.notebookapp import list_running_servers + def get_jupyter_notebook_info(): """ retrieve infos about the currently running jupyter notebook if possible Returns: dict with several info attributes. If info for current notebook could not be retrieved, returns empty dict """ + def get(url): - req = urllib.request.Request(url, headers={'content-type': 'application/json'}) + req = urllib.request.Request(url, headers={"content-type": "application/json"}) response = urllib.request.urlopen(req) return json.loads(response.read()) - kernel_id = re.search('kernel-(.*).json', - ipykernel.connect.get_connection_file()).group(1) + kernel_id = re.search( + "kernel-(.*).json", ipykernel.connect.get_connection_file() + ).group(1) servers = list_running_servers() for ss in servers: # there may be a 403 from jupyter... 
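# (A 403 here usually means the server enforces token auth and the token passed
# below is missing or stale; such servers are skipped further down instead of
# aborting the whole probe.)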
try: - notebook_infos = get(urljoin(ss['url'], 'api/sessions?token={}'.format(ss.get('token', '')))) + notebook_infos = get( + urljoin(ss["url"], "api/sessions?token={}".format(ss.get("token", ""))) + ) # search for match for ninfo in notebook_infos: - if ninfo['kernel']['id'] == kernel_id: - return {'kernelID' : kernel_id, 'notebookID' : ninfo['id'], - 'kernelName' : ninfo['kernel']['name'], - 'path' : os.path.join(ss['notebook_dir'], ninfo['notebook']['path']), - 'url' : urljoin(ss['url'],'notebooks/{}?token={}'.format(ninfo['path'], ss.get('token', '')))} + if ninfo["kernel"]["id"] == kernel_id: + return { + "kernelID": kernel_id, + "notebookID": ninfo["id"], + "kernelName": ninfo["kernel"]["name"], + "path": os.path.join( + ss["notebook_dir"], ninfo["notebook"]["path"] + ), + "url": urljoin( + ss["url"], + "notebooks/{}?token={}".format( + ninfo["path"], ss.get("token", "") + ), + ), + } except urllib.error.HTTPError as e: # ignore 403s (i.e. no allowed access) - if e.getcode() != 403: + if e.getcode() != 403: raise e - return {} \ No newline at end of file + return {} diff --git a/tuplex/python/tuplex/utils/reflection.py b/tuplex/python/tuplex/utils/reflection.py index fd4e6a295..258da0b27 100644 --- a/tuplex/python/tuplex/utils/reflection.py +++ b/tuplex/python/tuplex/utils/reflection.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,36 +7,35 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# -import types +import ast import inspect import re +import types + # ALWAYS import cloudpickle before dill, b.c. of https://github.com/uqfoundation/dill/issues/383 -import cloudpickle import dill -import ast -import weakref -import dis -import opcode -import types -import itertools -import sys +from tuplex.utils.common import ( + in_google_colab, + in_jupyter_notebook, + is_in_interactive_mode, +) from tuplex.utils.errors import TuplexException from tuplex.utils.globs import get_globals from tuplex.utils.source_vault import SourceVault, supports_lambda_closure -from tuplex.utils.common import in_jupyter_notebook, in_google_colab, is_in_interactive_mode # only export get_source function, rest shall be private. -__all__ = ['get_source', 'get_globals', 'supports_lambda_closure'] +__all__ = ["get_source", "get_globals", "supports_lambda_closure"] + def get_jupyter_raw_code(function_name): - # ignore here unresolved reference - history_manager = get_ipython().history_manager + # Ignore here unresolved reference, get_ipython() works in jupyter notebook. 
+ history_manager = get_ipython().history_manager # noqa: F821 hist = history_manager.get_range() regex = r"def\s*{}\(.*\)\s*:[\t ]*\n".format(function_name) - signature = 'hist = history_manager.get_range()' + signature = "hist = history_manager.get_range()" prog = re.compile(regex) matched_cells = [] @@ -47,7 +46,7 @@ def get_jupyter_raw_code(function_name): if signature in inline: continue - if 'get_function_code' in inline: + if "get_function_code" in inline: continue if prog.search(test_str): @@ -55,6 +54,7 @@ def get_jupyter_raw_code(function_name): return matched_cells[-1][2] + def extractFunctionByName(code, func_name, return_linenos=False): class FunctionVisitor(ast.NodeVisitor): def __init__(self): @@ -62,50 +62,51 @@ def __init__(self): self.funcInfo = [] def visit_FunctionDef(self, node): - print(self.lastStmtLineno) self.generic_visit(node) print(self.lastStmtLineno) def visit(self, node): funcStartLineno = -1 - if hasattr(node, 'lineno'): + if hasattr(node, "lineno"): self.lastStmtLineno = node.lineno if isinstance(node, ast.FunctionDef): funcStartLineno = node.lineno self.generic_visit(node) if isinstance(node, ast.FunctionDef): - self.funcInfo.append({'name': node.name, - 'start': funcStartLineno - 1, - 'end': self.lastStmtLineno - 1}) + self.funcInfo.append( + { + "name": node.name, + "start": funcStartLineno - 1, + "end": self.lastStmtLineno - 1, + } + ) root = ast.parse(code) fv = FunctionVisitor() fv.visit(root) # find function with name - candidates = filter(lambda x: x['name'] == func_name, fv.funcInfo) + candidates = filter(lambda x: x["name"] == func_name, fv.funcInfo) def indent(s): - return len(s) - len(s.lstrip(' \t')) + return len(s) - len(s.lstrip(" \t")) - lines = code.split('\n') + lines = code.split("\n") # find out level - candidates = map(lambda x: {**x, 'level': indent(lines[x['start']])}, candidates) + candidates = map(lambda x: {**x, "level": indent(lines[x["start"]])}, candidates) - info = sorted(candidates, key=lambda x: x['level'])[0] + info = sorted(candidates, key=lambda x: x["level"])[0] - func_code = '\n'.join(lines[info['start']:info['end'] + 1]) + func_code = "\n".join(lines[info["start"] : info["end"] + 1]) if return_linenos: - return func_code, info['start'], info['end'] + return func_code, info["start"], info["end"] else: return func_code def extract_function_code(function_name, raw_code): - - # remove greedily up to num_tabs and num_spaces def remove_tabs_and_spaces(line, num_tabs, num_spaces): t = 0 @@ -113,15 +114,15 @@ def remove_tabs_and_spaces(line, num_tabs, num_spaces): pos = 0 while pos < len(line): c = line[pos] - if c == ' ': + if c == " ": s += 1 - elif c == '\t': + elif c == "\t": t += 1 else: break pos += 1 - return ' ' * max(s - num_spaces, 0) + '\t' * max(t - num_tabs, 0) + line[pos:] + return " " * max(s - num_spaces, 0) + "\t" * max(t - num_tabs, 0) + line[pos:] # remove leading spaces / tabs assert len(raw_code) >= 1 @@ -133,19 +134,21 @@ def remove_tabs_and_spaces(line, num_tabs, num_spaces): start_idx = match.start() first_line = raw_code[start_idx:] - first_line_num_tabs = len(first_line) - len(first_line.lstrip('\t')) - first_line_num_spaces = len(first_line) - len(first_line.lstrip(' ')) + first_line_num_tabs = len(first_line) - len(first_line.lstrip("\t")) + first_line_num_spaces = len(first_line) - len(first_line.lstrip(" ")) - - func_lines = [remove_tabs_and_spaces(line, first_line_num_tabs, first_line_num_spaces) \ - for line in raw_code[start_idx:].split('\n')] + func_lines = [ + remove_tabs_and_spaces(line, 
first_line_num_tabs, first_line_num_spaces) + for line in raw_code[start_idx:].split("\n") + ] # greedily remove for each line tabs/spaces - out = '\n'.join(func_lines) + out = "\n".join(func_lines) return extractFunctionByName(out, function_name) + def get_function_code(f): - """ jupyter notebook, retrieve function history """ + """jupyter notebook, retrieve function history""" assert isinstance(f, types.FunctionType) function_name = f.__code__.co_name assert isinstance(function_name, str) @@ -171,19 +174,22 @@ def get_function_code(f): vault = SourceVault() + def get_source(f): - """ Jupyter notebook code reflection """ + """Jupyter notebook code reflection""" if isinstance(f, types.FunctionType): - # lambda function? # use inspect module # need to clean out lambda... - if f.__name__ == '': + if f.__name__ == "": # interpreter in interactive mode or not? # beware jupyter notebook also returns true for interactive mode! - if is_in_interactive_mode() and not in_jupyter_notebook() and not in_google_colab(): - + if ( + is_in_interactive_mode() + and not in_jupyter_notebook() + and not in_google_colab() + ): # import here, avoids also trouble with jupyter notebooks from tuplex.utils.interactive_shell import TuplexShell @@ -201,26 +207,29 @@ def get_source(f): f_globs = get_globals(f) f_filename = f.__code__.co_filename f_lineno = f.__code__.co_firstlineno - f_colno = f.__code__.co_firstcolno if hasattr(f.__code__, 'co_firstcolno') else None + f_colno = ( + f.__code__.co_firstcolno + if hasattr(f.__code__, "co_firstcolno") + else None + ) # special case: some unknown jupyter magic has been used... - if (in_jupyter_notebook() or in_google_colab()) and (f_filename == '' or f_filename == ''): - raise TuplexException('%%time magic not supported for Tuplex code') + if (in_jupyter_notebook() or in_google_colab()) and ( + f_filename == "" or f_filename == "" + ): + raise TuplexException("%%time magic not supported for Tuplex code") src_info = inspect.getsourcelines(f) - vault.extractAndPutAllLambdas(src_info, - f_filename, - f_lineno, - f_colno, - f_globs) + vault.extractAndPutAllLambdas( + src_info, f_filename, f_lineno, f_colno, f_globs + ) return vault.get(f, f_filename, f_lineno, f_colno, f_globs) else: # works always, because functions can be only defined on a single line! return get_function_code(f) else: - # TODO: for constants, create dummy source code, i.e. lambda x: 20 # when desired to retrieve a constant or so! 
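# E.g., hypothetically, get_source(20) would then synthesize and return the
# string "lambda x: 20", letting constants flow through the same code path as
# real UDFs; until that exists, an empty string is returned.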
- return '' \ No newline at end of file + return "" diff --git a/tuplex/python/tuplex/utils/source_vault.py b/tuplex/python/tuplex/utils/source_vault.py index 7a6aabdeb..6e25e8d60 100644 --- a/tuplex/python/tuplex/utils/source_vault.py +++ b/tuplex/python/tuplex/utils/source_vault.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,14 +7,16 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# import ast -import astor +import logging import os import sys -from types import LambdaType, CodeType -import logging +from types import CodeType, LambdaType + +import astor + def supports_lambda_closure(): """ @@ -24,8 +26,9 @@ def supports_lambda_closure(): Returns: True if operated with patched interpreter, False otherwise """ - f = lambda x: x * x # dummy function - return hasattr(f.__code__, 'co_firstcolno') + # Check with a dummy function. + f = lambda x: x * x # noqa: E731 + return hasattr(f.__code__, "co_firstcolno") def extract_all_lambdas(tree): @@ -43,7 +46,7 @@ def visit_Lambda(self, node): # extract for lambda incl. default values # annotations are not possible with the current syntax... def args_for_lambda_ast(lam): - return [getattr(n, 'arg') for n in lam.args.args] + return [getattr(n, "arg") for n in lam.args.args] def gen_code_for_lambda(lam): @@ -55,12 +58,12 @@ def gen_code_for_lambda(lam): # astor generates here lambda : # but we want lambda: if 0 == len(lam.args.args): - assert 'lambda :' in s - s = s.replace('lambda :', 'lambda:') + assert "lambda :" in s + s = s.replace("lambda :", "lambda:") return s.strip()[1:-1] except Exception as e: - logging.debug('gen_code_for_lambda via astor failed with {}'.format(e)) + logging.debug("gen_code_for_lambda via astor failed with {}".format(e)) # python3.9+ has ast.unparse if sys.version_info.major >= 3 and sys.version_info.minor >= 9: @@ -70,9 +73,11 @@ def gen_code_for_lambda(lam): s = ast.unparse(lam) return s except Exception as e: - logging.debug('gen_code_for_lambda via ast (python3.9+) failed with {}'.format(e)) + logging.debug( + "gen_code_for_lambda via ast (python3.9+) failed with {}".format(e) + ) - return '' + return "" def hash_code_object(code): @@ -80,16 +85,16 @@ def hash_code_object(code): # need to hash contents # for this use bytecode, varnames & constants # the list comprehension constant shows up as a code object in itself, so we have to recursively hash the constants - ret = code.co_code + bytes(str(code.co_varnames), 'utf8') + b'(' + ret = code.co_code + bytes(str(code.co_varnames), "utf8") + b"(" for c in code.co_consts: if isinstance(c, CodeType): ret += hash_code_object(c) - elif isinstance(c, str) and c.endswith('..'): + elif isinstance(c, str) and c.endswith(".."): continue else: - ret += bytes(str(c), 'utf8') - ret += b',' - return ret + b')' + ret += bytes(str(c), "utf8") + ret += b"," + return ret + b")" # join lines and remove stupid \\n @@ -103,12 +108,12 @@ def remove_line_breaks(source_lines): joined source without \ 
line breaks """ - source = '' + source = "" last_line_had_break = False for line in source_lines: this_line_had_break = False - if line.endswith('\\\n'): - line = line[:-len('\\\n')] + if line.endswith("\\\n"): + line = line[: -len("\\\n")] this_line_had_break = True # remove leading whitespace if last line had break @@ -141,7 +146,7 @@ def __init__(self): # assert isinstance(obj, LambdaType), 'object needs to be a lambda object' # return self.lambdaDict[hash_code_object(obj.__code__)] def get(self, ftor, filename, lineno, colno, globs): - assert isinstance(ftor, LambdaType), 'object needs to be a lambda object' + assert isinstance(ftor, LambdaType), "object needs to be a lambda object" # perform multiway lookup for code if filename and lineno: @@ -154,26 +159,30 @@ def get(self, ftor, filename, lineno, colno, globs): # if i.e. a call is placed within a loop. if len(entries) == 1: - return entries[0]['code'] + return entries[0]["code"] else: # patched interpreter? - if hasattr(ftor.__code__, 'co_firstcolno'): - raise Exception('patched interpreter not yet implemented') + if hasattr(ftor.__code__, "co_firstcolno"): + raise Exception("patched interpreter not yet implemented") else: # multiple lambda entries. Can only search for lambda IFF no globs if len(globs) != 0: - raise KeyError("Multiple lambdas found in {}:+{}, can't extract source code for " - "lambda expression. Please either patch the interpreter or write at " - "most a single lambda using global variables " - "per line.".format(os.path.basename(filename), lineno)) + raise KeyError( + "Multiple lambdas found in {}:+{}, can't extract source code for " + "lambda expression. Please either patch the interpreter or write at " + "most a single lambda using global variables " + "per line.".format(os.path.basename(filename), lineno) + ) # search for entry with matching hash of codeobject! codeobj_hash = hash_code_object(ftor.__code__) for entry in entries: - if entry['code_hash'] == codeobj_hash: - return entry['code'] - raise KeyError('Multiple lambdas found, but failed to retrieve code for this lambda expression.') + if entry["code_hash"] == codeobj_hash: + return entry["code"] + raise KeyError( + "Multiple lambdas found, but failed to retrieve code for this lambda expression." + ) else: - raise KeyError('could not find lambda function') + raise KeyError("could not find lambda function") def extractAndPutAllLambdas(self, src_info, filename, lineno, colno, globals): """ @@ -184,32 +193,34 @@ def extractAndPutAllLambdas(self, src_info, filename, lineno, colno, globals): lines, start_lineno = src_info - assert lineno >= start_lineno, 'line numbers sound off. please fix!' - f_lines = lines[lineno - start_lineno:] + assert lineno >= start_lineno, "line numbers sound off. please fix!" + f_lines = lines[lineno - start_lineno :] take_only_first_lambda = False # are there two lambda's defined in this line? # if so, in unpatched interpreter raise exception! - lam_count_in_target_line = f_lines[0].count('lambda') + lam_count_in_target_line = f_lines[0].count("lambda") if lam_count_in_target_line != 1: if lam_count_in_target_line == 0: - raise Exception('internal extract error, no lambda in source lines?') + raise Exception("internal extract error, no lambda in source lines?") if len(globals) != 0 and not supports_lambda_closure(): - raise Exception('Found {} lambda expressions in {}:{}. 
Please patch your interpreter or ' - 'reformat so Tuplex can extract the source code.'.format(lam_count_in_target_line, - os.path.basename(filename), - lineno)) + raise Exception( + "Found {} lambda expressions in {}:{}. Please patch your interpreter or " + "reformat so Tuplex can extract the source code.".format( + lam_count_in_target_line, os.path.basename(filename), lineno + ) + ) else: if supports_lambda_closure(): - assert colno, 'colno has to be valid' + assert colno, "colno has to be valid" # simply cut off based on col no! f_lines[0] = f_lines[0][colno:] take_only_first_lambda = True # if the first line contains only one lambda, simply the first lambda is taken. # else, multiple lambdas per - if f_lines[0].count('lambda') <= 1: + if f_lines[0].count("lambda") <= 1: take_only_first_lambda = True # get the line corresponding to the object @@ -221,22 +232,21 @@ def extractAndPutAllLambdas(self, src_info, filename, lineno, colno, globals): # special case for line breaks (this is a bad HACK! However, don't want to write own AST parser again in python) try: tree = ast.parse(source.lstrip()) - except SyntaxError as se: + except SyntaxError: # we could have a lambda that is broken because of \ at the end of lines # i.e. the source object is something like '\t\t.filter(lambda x: x * x)' # search till first lambda keyword - source = source[source.find('lambda'):] + source = source[source.find("lambda") :] try: # now another exception may be raised, i.e. when parsing fails tree = ast.parse(source.strip()) except SyntaxError as se2: - # try to parse partially till where syntax error occured. - source_lines = source.split('\n') - lines = source_lines[:se2.lineno] - lines[se2.lineno - 1] = lines[se2.lineno - 1][:se2.offset - 1] - source = '\n'.join(lines) + source_lines = source.split("\n") + lines = source_lines[: se2.lineno] + lines[se2.lineno - 1] = lines[se2.lineno - 1][: se2.offset - 1] + source = "\n".join(lines) tree = ast.parse(source.strip()) Lams = extract_all_lambdas(tree) @@ -253,7 +263,7 @@ def extractAndPutAllLambdas(self, src_info, filename, lineno, colno, globals): code = gen_code_for_lambda(lam) if 0 == len(code): - raise Exception('Couldn\'t generate code again for lambda function.') + raise Exception("Couldn't generate code again for lambda function.") # Note: can get colno from ast! colno = lam.col_offset + len(source) - len(source.lstrip()) @@ -261,23 +271,33 @@ def extractAndPutAllLambdas(self, src_info, filename, lineno, colno, globals): # however, to simplify code, use astor. key = (filename, lineno) - codeobj = compile(code, '', 'eval') + codeobj = compile(code, "", "eval") # hash evaluated code object's code codeobj_hash = hash_code_object(eval(codeobj).__code__) - entry = {'code': code, 'code_hash': codeobj_hash, - 'globals': globals, 'colno': colno} + entry = { + "code": code, + "code_hash": codeobj_hash, + "globals": globals, + "colno": colno, + } if key in self.lambdaFileDict.keys(): # when declaration is placed within a loop, and e.g. globals are updated things might change. # in particular, the code + code_hash stay the same, yet the source code changes - existing_entries = self.lambdaFileDict[key] # how many can there be? assume 1 at most! + existing_entries = self.lambdaFileDict[ + key + ] # how many can there be? assume 1 at most! 
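# (Re-registration happens e.g. when the lambda's line sits inside a loop and
# is extracted once per iteration: code and code_hash stay identical, only the
# captured globals snapshot may differ between entries.)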
updated_existing = False for i, existing_entry in enumerate(existing_entries): - if existing_entry['code'] == entry['code'] and \ - existing_entry['code_hash'] == entry['code_hash'] and \ - existing_entry['colno'] == entry['colno']: - self.lambdaFileDict[key][i] = entry # update entry in existing file/lineno dict + if ( + existing_entry["code"] == entry["code"] + and existing_entry["code_hash"] == entry["code_hash"] + and existing_entry["colno"] == entry["colno"] + ): + self.lambdaFileDict[key][i] = ( + entry # update entry in existing file/lineno dict + ) updated_existing = True if not updated_existing: # add new entry @@ -287,31 +307,39 @@ def extractAndPutAllLambdas(self, src_info, filename, lineno, colno, globals): else: # check that there are no globals when extracting function! if colno is None and len(globals) != 0: - raise Exception('Found more than one lambda expression on {}:+{}. Either use ' - 'a patched interpreter, which supports __code__.co_firstcolno for lambda ' - 'expressions or make sure to have at most one lambda expression ' - 'on this line'.format(os.path.basename(filename), lineno)) + raise Exception( + "Found more than one lambda expression on {}:+{}. Either use " + "a patched interpreter, which supports __code__.co_firstcolno for lambda " + "expressions or make sure to have at most one lambda expression " + "on this line".format(os.path.basename(filename), lineno) + ) for lam in Lams: code = gen_code_for_lambda(lam) if 0 == len(code): - raise Exception('Couldn\'t generate code again for lambda function.') + raise Exception("Couldn't generate code again for lambda function.") lam_colno = lam.col_offset + len(source) - len(source.lstrip()) # => could also extract code from the string then via col_offsets etc.s # however, to simplify code, use astor. key = (filename, lineno) - codeobj = compile(code, '', 'eval') + codeobj = compile(code, "", "eval") # hash evaluated code object's code codeobj_hash = hash_code_object(eval(codeobj).__code__) if colno is None: # interpreter not patched - assert len(globals) == 0, 'this path should only be taken if there are no globs' + assert len(globals) == 0, ( + "this path should only be taken if there are no globs" + ) # can't associate globals clearly - entry = {'code': code, 'code_hash': codeobj_hash, - 'globals': {}, 'colno': lam_colno} + entry = { + "code": code, + "code_hash": codeobj_hash, + "globals": {}, + "colno": lam_colno, + } if key in self.lambdaFileDict.keys(): self.lambdaFileDict[key].append(entry) @@ -319,8 +347,12 @@ def extractAndPutAllLambdas(self, src_info, filename, lineno, colno, globals): self.lambdaFileDict[key] = [entry] else: # simply add the lambda with colno & co. 
- entry = {'code': code, 'code_hash': codeobj_hash, - 'globals': globals, 'colno': colno} + entry = { + "code": code, + "code_hash": codeobj_hash, + "globals": globals, + "colno": colno, + } if key in self.lambdaFileDict.keys(): self.lambdaFileDict[key].append(entry) diff --git a/tuplex/python/tuplex/utils/tracebacks.py b/tuplex/python/tuplex/utils/tracebacks.py index 480ca2d4c..eb5ba3aed 100644 --- a/tuplex/python/tuplex/utils/tracebacks.py +++ b/tuplex/python/tuplex/utils/tracebacks.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# # # # Tuplex: Blazing Fast Python Data Science # # # @@ -7,14 +7,16 @@ # (c) 2017 - 2021, Tuplex team # # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # -#----------------------------------------------------------------------------------------------------------------------# +# ----------------------------------------------------------------------------------------------------------------------# -import traceback import linecache import re +import traceback + from .reflection import get_source -__all__ = ['traceback_from_udf'] +__all__ = ["traceback_from_udf"] + def format_traceback(tb, function_name): """ @@ -28,15 +30,13 @@ def format_traceback(tb, function_name): """ fnames = set() - out = '' + out = "" for frame, lineno in traceback.walk_tb(tb): co = frame.f_code filename = co.co_filename - name = co.co_name fnames.add(filename) linecache.lazycache(filename, frame.f_globals) - f_locals = frame.f_locals line = linecache.getline(filename, lineno).strip() # @Todo: maybe this is faster possible when strip is ignored, by counting tabs or so @@ -47,7 +47,10 @@ def format_traceback(tb, function_name): # need here open match for line breaks in function definition. # note the use of ^ to make sure docstrings are not matched wrongly regex = r"^[\t ]*def\s*{}\(.*".format(function_name) - while not re.match(regex, linecache.getline(filename, start_lineno).strip()) and start_lineno > 0: + while ( + not re.match(regex, linecache.getline(filename, start_lineno).strip()) + and start_lineno > 0 + ): start_lineno -= 1 # get line where function def starts via # linecache.getline(filename, start_lineno).strip() @@ -55,13 +58,14 @@ def format_traceback(tb, function_name): # UI is currently formatted with line numbering starting at 1 lineno_correction = -start_lineno + 1 - out += 'line {}, in {}:'.format(lineno + lineno_correction, function_name) - out += '\n\t{}'.format(line) + out += "line {}, in {}:".format(lineno + lineno_correction, function_name) + out += "\n\t{}".format(line) for filename in fnames: linecache.checkcache(filename) return out + # get traceback from sample def traceback_from_udf(udf, x): """ @@ -80,21 +84,25 @@ def traceback_from_udf(udf, x): try: udf(x) except Exception as e: - assert e.__traceback__.tb_next # make sure no exception within this function was raised + assert ( + e.__traceback__.tb_next + ) # make sure no exception within this function was raised etype_name = type(e).__name__ e_msg = e.__str__() - formatted_tb = '' + formatted_tb = "" # case (1): lambda function --> simply use get_source module - if udf.__name__ == '': + if udf.__name__ == "": # Lambda expressions in python consist of one line only. 
simply iterate code here - formatted_tb = 'line 1, in :\n\t' + get_source(udf) # use reflection module + formatted_tb = "line 1, in :\n\t" + get_source( + udf + ) # use reflection module # case (2) function defined via def else: # print out traceback (with relative line numbers!) formatted_tb = format_traceback(e.__traceback__.tb_next, fname) # return traceback and add exception type + its message - return formatted_tb + '\n\n{}: {}'.format(etype_name, e_msg) - return '' \ No newline at end of file + return formatted_tb + "\n\n{}: {}".format(etype_name, e_msg) + return "" diff --git a/tuplex/python/tuplex/utils/version.py b/tuplex/python/tuplex/utils/version.py index 8a14b5846..40995e4a1 100644 --- a/tuplex/python/tuplex/utils/version.py +++ b/tuplex/python/tuplex/utils/version.py @@ -1,2 +1,2 @@ # (c) L.Spiegelberg 2017 - 2025 -__version__="0.3.7" \ No newline at end of file +__version__ = "0.3.7" From 7ee4840a19f9ad0c779823dcaa0f8d7196ba9e9b Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Tue, 11 Mar 2025 00:15:42 -0700 Subject: [PATCH 5/8] Add type hints (#153) Adds type hints and enforcement via ruff precommit to core APIs. --- .pre-commit-config.yaml | 2 +- pyproject.toml | 4 - ruff.toml | 9 ++ tuplex/python/tuplex/__init__.py | 8 +- tuplex/python/tuplex/context.py | 47 +++++---- tuplex/python/tuplex/dataset.py | 99 +++++++++++-------- tuplex/python/tuplex/distributed.py | 90 +++++++++-------- tuplex/python/tuplex/exceptions.py | 3 +- tuplex/python/tuplex/metrics.py | 4 +- tuplex/python/tuplex/repl/__init__.py | 7 +- tuplex/python/tuplex/utils/common.py | 98 ++++++++++-------- tuplex/python/tuplex/utils/globs.py | 12 ++- .../python/tuplex/utils/interactive_shell.py | 27 ++--- tuplex/python/tuplex/utils/jedi_completer.py | 12 ++- tuplex/python/tuplex/utils/jupyter.py | 4 +- tuplex/python/tuplex/utils/reflection.py | 34 ++++--- tuplex/python/tuplex/utils/source_vault.py | 46 +++++---- tuplex/python/tuplex/utils/tracebacks.py | 12 ++- 18 files changed, 305 insertions(+), 213 deletions(-) create mode 100644 ruff.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0d6b6778a..9dac19234 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ repos: # Run the linter. - id: ruff files: ^tuplex/python/tuplex.*\.py$ - args: [ "--fix", "--select", "I" ] + args: [ "--fix", "--config", "ruff.toml"] types_or: [ python, pyi ] # Run the formatter. - id: ruff-format diff --git a/pyproject.toml b/pyproject.toml index dc7fe4af5..aefc4e5dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,3 @@ requires = [ "requests" ] build-backend = "setuptools.build_meta" - - -[tool.ruff] -include = ["pyproject.toml", "tuplex/python/tuplex/**/*.py"] diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 000000000..9bc42fb7c --- /dev/null +++ b/ruff.toml @@ -0,0 +1,9 @@ +#"--select", "I", "--select", "F" +[lint] +# Add "B", "Q" for flake8 checks. +select = ["I", "E4", "E7", "E9", "F", "CPY001", "T201", "T203", "ANN001", "ANN002", "ANN003", "ANN201", "ANN202", "ANN204", "ANN205", "ANN206"] +ignore = [] + +# Allow fix for all enabled rules (when `--fix`) is provided. 
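# Rule-code legend for the `select` list above, per Ruff's documented prefixes:
# "I" = isort import ordering, "E4"/"E7"/"E9" = pycodestyle import/statement/
# runtime errors, "F" = Pyflakes, "CPY001" = missing copyright notice,
# "T201"/"T203" = stray print()/pprint() calls, and the "ANN*" codes = missing
# type annotations on arguments, *args/**kwargs, and return values.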
+fixable = ["ALL"] +unfixable = [] \ No newline at end of file diff --git a/tuplex/python/tuplex/__init__.py b/tuplex/python/tuplex/__init__.py index ad8d14b5e..20aa0a4c1 100644 --- a/tuplex/python/tuplex/__init__.py +++ b/tuplex/python/tuplex/__init__.py @@ -10,6 +10,7 @@ # ----------------------------------------------------------------------------------------------------------------------# import logging +from typing import Optional, Union # expose aws setup for better convenience import tuplex.distributed @@ -23,7 +24,12 @@ # for convenience create a dummy function to return a default-configured Lambda context -def LambdaContext(conf=None, name=None, s3_scratch_dir=None, **kwargs): +def LambdaContext( + conf: Union[None, str, dict] = None, + name: Optional[str] = None, + s3_scratch_dir: Optional[str] = None, + **kwargs: dict, +) -> Context: import uuid if s3_scratch_dir is None: diff --git a/tuplex/python/tuplex/context.py b/tuplex/python/tuplex/context.py index 46763b72d..04e8d2e0c 100644 --- a/tuplex/python/tuplex/context.py +++ b/tuplex/python/tuplex/context.py @@ -21,6 +21,7 @@ import os import sys import uuid +from typing import Any, List, Optional, Tuple, Union from tuplex.utils.common import ( current_user, @@ -45,7 +46,9 @@ class Context: - def __init__(self, conf=None, name="", **kwargs): + def __init__( + self, conf: Union[None, str, dict] = None, name: str = "", **kwargs: dict + ) -> None: r"""creates new Context object, the main entry point for all operations with the Tuplex big data framework Args: @@ -235,7 +238,13 @@ def __init__(self, conf=None, name="", **kwargs): self.metrics = Metrics(python_metrics) assert self.metrics - def parallelize(self, value_list, columns=None, schema=None, auto_unpack=True): + def parallelize( + self, + value_list: List[Any], + columns: Optional[List[str]] = None, + schema: Optional[Union[Tuple, List]] = None, + auto_unpack: bool = True, + ) -> "DataSet": """passes data to the Tuplex framework. Must be a list of primitive objects (e.g. of type bool, int, float, str) or a list of (nested) tuples of these types. @@ -273,14 +282,14 @@ def parallelize(self, value_list, columns=None, schema=None, auto_unpack=True): def csv( self, - pattern, - columns=None, - header=None, - delimiter=None, - quotechar='"', - null_values=[""], - type_hints={}, - ): + pattern: str, + columns: Optional[List[str]] = None, + header: Optional[bool] = None, + delimiter: Optional[str] = None, + quotechar: str = '"', + null_values: List[str] = [""], + type_hints: dict = {}, + ) -> "DataSet": """reads csv (comma separated values) files. This function may either be provided with parameters that help to determine the delimiter, whether a header present or what kind of quote char is used. Overall, CSV parsing is done according to the RFC-4180 standard @@ -350,11 +359,11 @@ def csv( ) return ds - def text(self, pattern, null_values=None): + def text(self, pattern: str, null_values: Optional[List[str]] = None) -> "DataSet": """reads text files. Args: pattern (str): a file glob pattern, e.g. /data/file.csv or /data/\*.csv or /\*/\*csv - null_values (List[str]): a list of string to interpret as None. When empty list or None, empty lines will be the empty string '' + null_values (List[str]): a list of strings to interpret as None. 
When empty list or None, empty lines will be the empty string '' Returns: tuplex.dataset.DataSet: A Tuplex Dataset object that allows further ETL operations """ @@ -372,7 +381,7 @@ def text(self, pattern, null_values=None): ds._dataSet = self._context.text(pattern, null_values) return ds - def orc(self, pattern, columns=None): + def orc(self, pattern: str, columns: Optional[List[str]] = None) -> "DataSet": """reads orc files. Args: pattern (str): a file glob pattern, e.g. /data/file.csv or /data/\*.csv or /\*/\*csv @@ -390,7 +399,7 @@ def orc(self, pattern, columns=None): ds._dataSet = self._context.orc(pattern, columns) return ds - def options(self, nested=False): + def options(self, nested: bool = False) -> dict: """retrieves all framework parameters as dictionary Args: @@ -411,7 +420,7 @@ def options(self, nested=False): else: return opt - def optionsToYAML(self, file_path="config.yaml"): + def optionsToYAML(self, file_path: str = "config.yaml") -> None: """saves options as yaml file to (local) filepath Args: @@ -420,7 +429,7 @@ def optionsToYAML(self, file_path="config.yaml"): save_conf_yaml(self.options(), file_path) - def ls(self, pattern): + def ls(self, pattern: str) -> List[str]: """ return a list of strings of all files found matching the pattern. The same pattern can be supplied to read inputs. Args: @@ -433,7 +442,7 @@ def ls(self, pattern): assert self._context return self._context.ls(pattern) - def cp(self, pattern, target_uri): + def cp(self, pattern: str, target_uri: str) -> None: """ copies all files matching the pattern to a target uri. If more than one file is found, a folder is created containing all the files relative to the longest shared path prefix. @@ -448,7 +457,7 @@ def cp(self, pattern, target_uri): assert self._context return self._context.cp(pattern, target_uri) - def rm(self, pattern): + def rm(self, pattern: str) -> None: """ removes all files matching the pattern Args: @@ -463,7 +472,7 @@ def rm(self, pattern): return self._context.rm(pattern) @property - def uiWebURL(self): + def uiWebURL(self) -> str: """ retrieve URL of webUI if running Returns: diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 27e0d37a7..e86441146 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -10,6 +10,7 @@ # ----------------------------------------------------------------------------------------------------------------------# import logging +from typing import Any, Callable, List, Optional, Sequence, Tuple, TypeVar, Union import cloudpickle @@ -29,10 +30,10 @@ class DataSet: - def __init__(self): + def __init__(self) -> None: self._dataSet: _DataSet = None - def unique(self): + def unique(self) -> "DataSet": """removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement. Returns: tuplex.dataset.Dataset: A Tuplex Dataset object that allows further ETL operations. @@ -45,7 +46,7 @@ def unique(self): ds._dataSet = self._dataSet.unique() return ds - def map(self, ftor): + def map(self, ftor: Callable) -> "DataSet": """ performs a map operation using the provided udf function over the dataset and returns a dataset for further processing. @@ -79,7 +80,7 @@ def map(self, ftor): ds._dataSet = self._dataSet.map(code, cloudpickle.dumps(ftor), g) return ds - def filter(self, ftor): + def filter(self, ftor: Callable) -> "DataSet": """ performs a map operation using the provided udf function over the dataset and returns a dataset for further processing. 
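The annotated signatures above read naturally at the call site; a minimal end-to-end sketch (assuming a default-configured Context running locally):

    from tuplex import Context

    c = Context()
    res = (
        c.parallelize([1, 2, 3, 4])
        .map(lambda x: x * x)
        .filter(lambda x: x > 4)
        .collect()
    )
    # res == [9, 16]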
@@ -109,7 +110,7 @@ def filter(self, ftor): ds._dataSet = self._dataSet.filter(code, cloudpickle.dumps(ftor), g) return ds - def collect(self): + def collect(self) -> List[Any]: """action that generates a physical plan, processes data and collects result then as list of tuples. Returns: @@ -121,7 +122,7 @@ def collect(self): ) return self._dataSet.collect() - def take(self, nrows=5): + def take(self, nrows: int = 5) -> List[Any]: """action that generates a physical plan, processes data and collects the top results then as list of tuples. Args: @@ -140,7 +141,7 @@ def take(self, nrows=5): return self._dataSet.take(nrows) - def show(self, nrows=None): + def show(self, nrows: Optional[int] = None) -> None: """action that generates a physical plan, processes data and prints results as nicely formatted ASCII table to stdout. @@ -158,7 +159,7 @@ def show(self, nrows=None): self._dataSet.show(nrows) - def resolve(self, eclass, ftor): + def resolve(self, eclass: TypeVar, ftor: Callable) -> "DataSet": """Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. Args: @@ -197,7 +198,7 @@ def resolve(self, eclass, ftor): ds._dataSet = self._dataSet.resolve(ec, code, cloudpickle.dumps(ftor), g) return ds - def withColumn(self, column, ftor): + def withColumn(self, column: str, ftor: Callable) -> "DataSet": """appends a new column to the dataset by calling ftor over existing tuples Args: @@ -227,7 +228,7 @@ def withColumn(self, column, ftor): ds._dataSet = self._dataSet.withColumn(column, code, cloudpickle.dumps(ftor), g) return ds - def mapColumn(self, column, ftor): + def mapColumn(self, column: Union[int, str], ftor: Callable) -> "DataSet": """maps directly one column. UDF takes as argument directly the value of the specified column and will overwrite that column with the result. If you need access to multiple columns, use withColumn instead. If the column name already exists, it will be overwritten. @@ -258,7 +259,7 @@ def mapColumn(self, column, ftor): ds._dataSet = self._dataSet.mapColumn(column, code, cloudpickle.dumps(ftor), g) return ds - def selectColumns(self, columns): + def selectColumns(self, columns: List[Union[str, int]]) -> "DataSet": """selects a subset of columns as defined through columns which is a list or a single column Args: @@ -289,7 +290,7 @@ def selectColumns(self, columns): ds._dataSet = self._dataSet.selectColumns(columns) return ds - def renameColumn(self, key, newColumnName): + def renameColumn(self, key: str, newColumnName: str) -> "DataSet": """rename a column in dataset Args: key: str|int, old column name or (0-indexed) position. @@ -315,7 +316,7 @@ def renameColumn(self, key, newColumnName): raise TypeError("key must be int or str") return ds - def ignore(self, eclass): + def ignore(self, eclass: TypeVar) -> "DataSet": """ignores exceptions of type eclass caused by previous operator Args: @@ -342,7 +343,7 @@ def ignore(self, eclass): ds._dataSet = self._dataSet.ignore(ec) return ds - def cache(self, store_specialized=True): + def cache(self, store_specialized: bool = True) -> "DataSet": """materializes rows in main-memory for reuse with several pipelines. 
Can be also used to benchmark certain pipeline costs Args: @@ -361,7 +362,7 @@ def cache(self, store_specialized=True): return ds @property - def columns(self): + def columns(self) -> List[str]: """retrieve names of columns if assigned Returns: @@ -371,7 +372,7 @@ def columns(self): return cols if len(cols) > 0 else None @property - def types(self): + def types(self) -> List[TypeVar]: """output schema as list of type objects of the dataset. If the dataset has an error, None is returned. Returns: @@ -381,8 +382,13 @@ def types(self): return types def join( - self, dsRight, leftKeyColumn, rightKeyColumn, prefixes=None, suffixes=None - ): + self, + dsRight: "DataSet", + leftKeyColumn: str, + rightKeyColumn: str, + prefixes: Union[None, Tuple[str, str], List[str]] = None, + suffixes: Union[None, Tuple[str, str], List[str]] = None, + ) -> "DataSet": """ (inner) join with other dataset Args: @@ -434,8 +440,13 @@ def join( return ds def leftJoin( - self, dsRight, leftKeyColumn, rightKeyColumn, prefixes=None, suffixes=None - ): + self, + dsRight: "DataSet", + leftKeyColumn: str, + rightKeyColumn: str, + prefixes: Union[None, Tuple[str, str], List[str]] = None, + suffixes: Union[None, Tuple[str, str], List[str]] = None, + ) -> "DataSet": """ left (outer) join with other dataset Args: @@ -488,18 +499,18 @@ def leftJoin( def tocsv( self, - path, - part_size=0, - num_rows=max_rows, - num_parts=0, - part_name_generator=None, - null_value=None, - header=True, - ): + path: str, + part_size: int = 0, + num_rows: int = max_rows, + num_parts: int = 0, + part_name_generator: Optional[Callable] = None, + null_value: Optional[Any] = None, + header: bool = True, + ) -> None: """ save dataset to one or more csv files. Triggers execution of pipeline. Args: path: path where to save files to - split_size: optional size in bytes for each part to not exceed. + part_size: optional size in bytes for each part to not exceed. num_rows: limit number of output rows num_parts: number of parts to split output into. The last part will be the smallest part_name_generator: optional name generator function to the output parts, receives an integer \ @@ -542,12 +553,12 @@ def tocsv( def toorc( self, - path, - part_size=0, - num_rows=max_rows, - num_parts=0, - part_name_generator=None, - ): + path: str, + part_size: int = 0, + num_rows: int = max_rows, + num_parts: int = 0, + part_name_generator: Callable = None, + ) -> None: """ save dataset to one or more orc files. Triggers execution of pipeline. Args: path: path where to save files to @@ -579,7 +590,9 @@ def toorc( self._dataSet.toorc(path, code, code_pickled, num_parts, part_size, num_rows) - def aggregate(self, combine, aggregate, initial_value): + def aggregate( + self, combine: Callable, aggregate: Callable, initial_value: Any + ) -> "Dataset": # noqa: F821 """ cf. aggregateByKey for details Args: @@ -628,14 +641,20 @@ def aggregate(self, combine, aggregate, initial_value): ) return ds - def aggregateByKey(self, combine, aggregate, initial_value, key_columns): + def aggregateByKey( + self, + combine: Callable, + aggregate: Callable, + initial_value: Any, + key_columns: Sequence[Union[int, str]], + ) -> "tuplex.Dataset": # noqa: F821 """ An experimental aggregateByKey function similar to aggregate. There are several scenarios that do not work with this function yet and its performance hasn't been properly optimized either. Data is grouped by the supplied key_columns. 
Then, for each group a new aggregate is initialized using the initial_value, which can be thought of as a neutral value. The aggregate function is then called for each element and the current aggregate structure. It is guaranteed that the combine function is called at least once per group by applying the initial_value to the aggregate. Args: - combine: a UDF to combine two aggregates (results of the aggregate function or the initial_value). E.g., cobmine = lambda agg1, agg2: agg1 + agg2. The initial value should be the neutral element. - aggregate: a UDF which produces a result by combining a value with the aggregate initialized by initial_value. E.g., aggreagte = lambda agg, value: agg + value sums up values. + combine: a UDF to combine two aggregates (results of the aggregate function or the initial_value). E.g., combine = lambda agg1, agg2: agg1 + agg2. The initial value should be the neutral element. + aggregate: a UDF which produces a result by combining a value with the aggregate initialized by initial_value. E.g., aggregate = lambda agg, value: agg + value sums up values. initial_value: a neutral initial value. key_columns: the columns to group the aggregate by, a sequence of a mix of strings or integers. If specified as a single string or number, aggregation is over a single column. Returns: @@ -685,7 +704,7 @@ def aggregateByKey(self, combine, aggregate, initial_value, key_columns): return ds @property - def exception_counts(self): + def exception_counts(self) -> dict: """ Returns: dictionary of exception class names with integer keys, i.e. the counts. Returns None diff --git a/tuplex/python/tuplex/distributed.py b/tuplex/python/tuplex/distributed.py index 096bf56a3..5e884d8f7 100644 --- a/tuplex/python/tuplex/distributed.py +++ b/tuplex/python/tuplex/distributed.py @@ -11,6 +11,8 @@ try: import boto3 + import botocore + import botocore.client import botocore.exceptions except Exception: # ignore here, because boto3 is optional @@ -24,34 +26,37 @@ import sys import threading import time +from typing import Optional, Tuple # Tuplex specific imports from tuplex.utils.common import current_user, host_name +_logger = logging.getLogger(__name__) -def current_iam_user(): + +def current_iam_user() -> str: iam = boto3.resource("iam") user = iam.CurrentUser() return user.user_name.lower() -def default_lambda_name(): +def default_lambda_name() -> str: return "tuplex-lambda-runner" -def default_lambda_role(): +def default_lambda_role() -> str: return "tuplex-lambda-role" -def default_bucket_name(): +def default_bucket_name() -> str: return "tuplex-" + current_iam_user() -def default_scratch_dir(): +def default_scratch_dir() -> str: return default_bucket_name() + "/scratch" -def current_region(): +def current_region() -> str: session = boto3.session.Session() region = session.region_name @@ -62,7 +67,9 @@ def current_region(): return region -def check_credentials(aws_access_key_id=None, aws_secret_access_key=None): +def check_credentials( + aws_access_key_id: Optional[str] = None, aws_secret_access_key: Optional[str] = None +) -> bool: kwargs = {} if isinstance(aws_access_key_id, str): kwargs["aws_access_key_id"] = aws_access_key_id @@ -81,7 +88,9 @@ def check_credentials(aws_access_key_id=None, aws_secret_access_key=None): return True -def ensure_s3_bucket(s3_client, bucket_name, region): +def ensure_s3_bucket( + s3_client: "botocore.client.S3", bucket_name: str, region: str +) -> None: bucket_names = list(map(lambda b: b["Name"], s3_client.list_buckets()["Buckets"])) if bucket_name not in 
bucket_names: @@ -105,7 +114,7 @@ def ensure_s3_bucket(s3_client, bucket_name, region): logging.info("Found bucket {}".format(bucket_name)) -def create_lambda_role(iam_client, lambda_role): +def create_lambda_role(iam_client: "botocore.client.IAM", lambda_role: str) -> None: # Roles required for AWS Lambdas trust_policy = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"lambda.amazonaws.com"},"Action":"sts:AssumeRole"}]}' lambda_access_to_s3 = '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["s3:*MultipartUpload*","s3:Get*","s3:ListBucket","s3:Put*"],"Resource":"*"}]}' @@ -139,7 +148,7 @@ def create_lambda_role(iam_client, lambda_role): raise Exception("Failed to create AWS Lambda Role.") -def remove_lambda_role(iam_client, lambda_role): +def remove_lambda_role(iam_client: "botocore.client.IAM", lambda_role: str) -> None: # detach policies... try: iam_client.detach_role_policy( @@ -165,11 +174,12 @@ def remove_lambda_role(iam_client, lambda_role): ) ) - # delete role... iam_client.delete_role(RoleName=lambda_role) -def setup_lambda_role(iam_client, lambda_role, region, overwrite): +def setup_lambda_role( + iam_client: "botocore.client.IAM", lambda_role: str, region: str, overwrite: bool +) -> None: try: response = iam_client.get_role(RoleName=lambda_role) logging.info("Found Lambda role from {}".format(response["Role"]["CreateDate"])) @@ -187,7 +197,7 @@ def setup_lambda_role(iam_client, lambda_role, region, overwrite): create_lambda_role(iam_client, lambda_role) -def sizeof_fmt(num, suffix="B"): +def sizeof_fmt(num: int, suffix: str = "B") -> str: # from https://stackoverflow.com/questions/1094841/get-human-readable-version-of-file-size for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]: if abs(num) < 1024.0: @@ -197,13 +207,13 @@ def sizeof_fmt(num, suffix="B"): class ProgressPercentage(object): - def __init__(self, filename): + def __init__(self, filename: str) -> None: self._filename = filename self._size = float(os.path.getsize(filename)) self._seen_so_far = 0 self._lock = threading.Lock() - def __call__(self, bytes_amount): + def __call__(self, bytes_amount: int) -> None: # To simplify, assume this is hooked up to a single filename with self._lock: self._seen_so_far += bytes_amount @@ -220,7 +230,7 @@ def __call__(self, bytes_amount): sys.stdout.flush() -def s3_split_uri(uri): +def s3_split_uri(uri: str) -> Tuple[str, str]: assert "/" in uri, "at least one / is required!" uri = uri.replace("s3://", "") @@ -230,16 +240,16 @@ def s3_split_uri(uri): def upload_lambda( - iam_client, - lambda_client, - lambda_function_name, - lambda_role, - lambda_zip_file, - overwrite=False, - s3_client=None, - s3_scratch_space=None, - quiet=False, -): + iam_client: Optional[str], + lambda_client: Optional[str], + lambda_function_name: Optional[str], + lambda_role: Optional[str], + lambda_zip_file: Optional[str], + overwrite: bool = False, + s3_client: "botocore.client.S3" = None, + s3_scratch_space: Optional[str] = None, + quiet: bool = False, +) -> dict: # AWS only allows 50MB to be uploaded directly via request. Else, requires S3 upload. 
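# Sketch of the fallback this limit guards (assuming s3_client/s3_scratch_space
# are provided): packages above the direct-upload limit are first copied to the
# S3 scratch space and the function is then created from
#   Code={'S3Bucket': bucket, 'S3Key': key}
# instead of
#   Code={'ZipFile': zip_bytes}
# (both are standard arguments to boto3's lambda_client.create_function).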
ZIP_UPLOAD_LIMIT_SIZE = 50000000 @@ -396,7 +406,7 @@ def upload_lambda( return response -def find_lambda_package(): +def find_lambda_package() -> Optional[str]: """ Check whether a compatible zip file in tuplex/other could be found for auto-upload Returns: None or path to lambda zip to upload @@ -415,17 +425,17 @@ def find_lambda_package(): def setup_aws( - aws_access_key=None, - aws_secret_key=None, - overwrite=True, - iam_user=None, - lambda_name=None, - lambda_role=None, - lambda_file=None, - region=None, - s3_scratch_uri=None, - quiet=False, -): + aws_access_key: Optional[str] = None, + aws_secret_key: Optional[str] = None, + overwrite: bool = True, + iam_user: Optional[str] = None, + lambda_name: Optional[str] = None, + lambda_role: Optional[str] = None, + lambda_file: Optional[str] = None, + region: Optional[str] = None, + s3_scratch_uri: Optional[str] = None, + quiet: bool = False, +) -> None: start_time = time.time() # detect defaults. Important to do this here, because don't want to always invoke boto3/botocore @@ -497,4 +507,6 @@ def setup_aws( # done, print if quiet was not set to False if not quiet: - print("\nCompleted lambda setup in {:.2f}s".format(time.time() - start_time)) + _logger.info( + "\nCompleted lambda setup in {:.2f}s".format(time.time() - start_time) + ) diff --git a/tuplex/python/tuplex/exceptions.py b/tuplex/python/tuplex/exceptions.py index 0c50fa997..534c10465 100644 --- a/tuplex/python/tuplex/exceptions.py +++ b/tuplex/python/tuplex/exceptions.py @@ -8,9 +8,10 @@ # Created by Leonhard Spiegelberg first on 1/1/2021 # # License: Apache 2.0 # # ----------------------------------------------------------------------------------------------------------------------# +from typing import Type -def classToExceptionCode(cls): +def classToExceptionCode(cls: Type) -> int: """ return C++ enum exception code for class Args: diff --git a/tuplex/python/tuplex/metrics.py b/tuplex/python/tuplex/metrics.py index 0bcf3fd34..776ee7e46 100644 --- a/tuplex/python/tuplex/metrics.py +++ b/tuplex/python/tuplex/metrics.py @@ -30,7 +30,7 @@ class Metrics: context object.
""" - def __init__(self, metrics: _Metrics): + def __init__(self, metrics: _Metrics) -> None: """ Creates a Metrics object by using the context object to set its metric parameter and store the resulting @@ -101,7 +101,7 @@ def as_json(self) -> str: assert self._metrics return self._metrics.getJSONString() - def as_dict(self): + def as_dict(self) -> dict: """ all measurements in nested dictionary Returns: diff --git a/tuplex/python/tuplex/repl/__init__.py b/tuplex/python/tuplex/repl/__init__.py index ef1fb8e83..243ca4e20 100644 --- a/tuplex/python/tuplex/repl/__init__.py +++ b/tuplex/python/tuplex/repl/__init__.py @@ -9,6 +9,7 @@ # License: Apache 2.0 # # ----------------------------------------------------------------------------------------------------------------------# +import logging import os import sys @@ -23,8 +24,10 @@ except (ImportError, NameError): __version__ = "dev" +_logger = logging.getLogger(__name__) -def TuplexBanner(): + +def TuplexBanner() -> str: banner = """Welcome to\n _____ _ |_ _| _ _ __ | | _____ __ @@ -55,4 +58,4 @@ def TuplexBanner(): shell.init(locals=_locals) shell.interact(banner=TuplexBanner() + "\n Interactive Shell mode") else: - print(TuplexBanner()) + _logger.info(TuplexBanner()) diff --git a/tuplex/python/tuplex/utils/common.py b/tuplex/python/tuplex/utils/common.py index e0708fc3d..32d6fb47f 100644 --- a/tuplex/python/tuplex/utils/common.py +++ b/tuplex/python/tuplex/utils/common.py @@ -25,6 +25,7 @@ import time import urllib.request from datetime import datetime +from typing import Callable import iso8601 import psutil @@ -42,8 +43,12 @@ except ImportError: __version__ = "dev" +from typing import Any, Optional, Union -def cmd_exists(cmd): +_logger = logging.getLogger(__name__) + + +def cmd_exists(cmd: str) -> bool: """ checks whether command `cmd` exists or not Args: @@ -55,7 +60,7 @@ def cmd_exists(cmd): return shutil.which(cmd) is not None -def is_shared_lib(path): +def is_shared_lib(path: str) -> bool: """ Args: path: str path to a file @@ -74,7 +79,7 @@ def is_shared_lib(path): ) -def current_timestamp(): +def current_timestamp() -> str: """ get current time as isoformatted string Returns: isoformatted current time (utc) @@ -83,7 +88,7 @@ def current_timestamp(): return str(datetime.now().isoformat()) -def current_user(): +def current_user() -> str: """ retrieve current user name Returns: username as string @@ -95,7 +100,7 @@ def current_user(): return getpass.getuser() -def host_name(): +def host_name() -> str: """ retrieve host name to identify machine Returns: some hostname as string @@ -107,7 +112,7 @@ def host_name(): return socket.gethostbyaddr(socket.gethostname())[0] -def post_json(url, data): +def post_json(url: str, data: dict) -> dict: """ perform a post request to a REST endpoint with JSON Args: @@ -126,12 +131,12 @@ def post_json(url, data): return json.loads(response.read()) -def get_json(url, timeout=10): +def get_json(url: str, timeout: float = 10) -> dict: """ perform a GET request to given URL Args: url: hostname & port - + timeout: timeout in s Returns: python dictionary of decoded json """ @@ -141,7 +146,7 @@ def get_json(url, timeout=10): return json.loads(response.read()) -def in_jupyter_notebook(): +def in_jupyter_notebook() -> bool: """check whether frameworks runs in jupyter notebook. 
Returns: ``True`` if the module is running in IPython kernel, @@ -163,7 +168,7 @@ def in_jupyter_notebook(): return False # Probably standard Python interpreter -def in_google_colab(): +def in_google_colab() -> bool: """ check whether framework runs in Google Colab environment Returns: @@ -179,7 +184,7 @@ def in_google_colab(): return shell_name_matching -def is_in_interactive_mode(): +def is_in_interactive_mode() -> bool: """checks whether the module is loaded in an interactive shell session or not Returns: True when in interactive mode. Note that Jupyter notebook also returns True here. @@ -189,7 +194,7 @@ def is_in_interactive_mode(): return bool(getattr(sys, "ps1", sys.flags.interactive)) -def flatten_dict(d, sep=".", parent_key=""): +def flatten_dict(d: dict, sep: str = ".", parent_key: str = "") -> dict: """ flattens a nested dictionary into a flat dictionary by concatenating keys with the separator. Args: d (dict): The dictionary to flatten @@ -210,7 +215,7 @@ def flatten_dict(d, sep=".", parent_key=""): return dict(items) -def unflatten_dict(dictionary, sep="."): +def unflatten_dict(dictionary: dict, sep: str = ".") -> dict: """ unflattens a dictionary into a nested dictionary according to sep Args: @@ -236,7 +241,7 @@ def unflatten_dict(dictionary, sep="."): return resultDict -def save_conf_yaml(conf, file_path): +def save_conf_yaml(conf: dict, file_path: str) -> None: """saves a dictionary holding the configuration options to Tuplex Yaml format. \ Dict can be either flattened or not. @@ -245,7 +250,7 @@ def save_conf_yaml(conf, file_path): file_path: """ - def beautify_nesting(d): + def beautify_nesting(d: Union[dict, Any]) -> Any: # i.e. make lists out of dicts if isinstance(d, dict): items = d.items() @@ -265,7 +270,7 @@ def beautify_nesting(d): f.write(out) -def pythonize_options(options): +def pythonize_options(options: dict) -> dict: """ convert string based options into python objects/types Args: @@ -275,7 +280,7 @@ def pythonize_options(options): dict with python types """ - def parse_string(item): + def parse_string(item: str) -> Any: """ check what kind of variable string represents and convert accordingly Args: @@ -313,7 +318,7 @@ def parse_string(item): return {k: parse_string(v) for k, v in options.items()} -def load_conf_yaml(file_path): +def load_conf_yaml(file_path: str) -> dict: """loads yaml file and converts contents to nested dictionary Args: @@ -322,7 +327,7 @@ def load_conf_yaml(file_path): """ # helper function to get correct nesting from yaml file! 
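A quick round-trip sketch of the flatten_dict/unflatten_dict pair annotated above, with illustrative option names (the to_nested_dict YAML helper continues right below):

    conf = {"tuplex": {"webui": {"enable": True}, "driverMemory": "1GB"}}
    flat = flatten_dict(conf)
    # {'tuplex.webui.enable': True, 'tuplex.driverMemory': '1GB'}
    assert unflatten_dict(flat) == conf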
- def to_nested_dict(obj): + def to_nested_dict(obj: Any) -> dict: resultDict = dict() if isinstance(obj, list): for item in obj: @@ -349,7 +354,7 @@ def to_nested_dict(obj): return to_nested_dict(d) -def stringify_dict(d): +def stringify_dict(d: dict) -> dict: """convert keys and vals into strings Args: d (dict): dictionary @@ -361,7 +366,7 @@ def stringify_dict(d): return {str(key): str(val) for key, val in d.items()} -def registerLoggingCallback(callback): +def registerLoggingCallback(callback: Callable) -> None: """ register a custom logging callback function with tuplex Args: @@ -373,7 +378,7 @@ def registerLoggingCallback(callback): from ..libexec.tuplex import registerLoggingCallback as ccRegister # create a wrapper to capture exceptions properly and avoid crashing - def wrapper(level, time_info, logger_name, msg): + def wrapper(level: int, time_info: str, logger_name: str, msg: str) -> None: args = (level, time_info, logger_name, msg) try: @@ -384,7 +389,7 @@ def wrapper(level, time_info, logger_name, msg): ccRegister(wrapper) -def logging_callback(level, time_info, logger_name, msg): +def logging_callback(level: int, time_info: str, logger_name: str, msg: str) -> None: """ this is a callback function which can be used to redirect C++ logging to python logging. :param level: logging level as integer, for values cf. PythonCommon.h @@ -441,7 +446,7 @@ def logging_callback(level, time_info, logger_name, msg): # register at exit function to take care of exit handlers -def auto_shutdown_all(): +def auto_shutdown_all() -> None: """ helper function to automatially shutdown whatever is in the global exit handler array. Resets global variable. Returns: @@ -456,13 +461,15 @@ def auto_shutdown_all(): if msg: logging.info(msg) func(args) - logging.info("Shutdown {} successfully".format(name)) + logging.debug("Shutdown {} successfully".format(name)) except Exception: logging.error("Failed to shutdown {}".format(name)) __exit_handlers__ = [] -def register_auto_shutdown(name, func, args, msg=None): +def register_auto_shutdown( + name: str, func: Callable, args: tuple, msg: Optional[str] = None +) -> None: global __exit_handlers__ __exit_handlers__.append((name, func, args, msg)) @@ -470,7 +477,7 @@ def register_auto_shutdown(name, func, args, msg=None): atexit.register(auto_shutdown_all) -def is_process_running(name): +def is_process_running(name: str) -> bool: """ helper function to check if a process is running on the local machine Args: @@ -490,7 +497,9 @@ def is_process_running(name): return False -def mongodb_uri(mongodb_url, mongodb_port, db_name="tuplex-history"): +def mongodb_uri( + mongodb_url: str, mongodb_port: int, db_name: str = "tuplex-history" +) -> str: """ constructs a fully qualified MongoDB URI Args: @@ -505,8 +514,11 @@ def mongodb_uri(mongodb_url, mongodb_port, db_name="tuplex-history"): def check_mongodb_connection( - mongodb_url, mongodb_port, db_name="tuplex-history", timeout=10.0 -): + mongodb_url: str, + mongodb_port: int, + db_name: str = "tuplex-history", + timeout: float = 10.0, +) -> None: """ connects to a MongoDB database instance, raises exception if connection fails Args: @@ -564,7 +576,7 @@ def check_mongodb_connection( logging.debug("Connection test to MongoDB succeeded") -def shutdown_process_via_kill(pid): +def shutdown_process_via_kill(pid: int) -> None: """ issues a KILL signals to a process with pid Args: @@ -578,12 +590,12 @@ def shutdown_process_via_kill(pid): def find_or_start_mongodb( - mongodb_url, - mongodb_port, - mongodb_datapath, - 
mongodb_logpath, - db_name="tuplex-history", -): + mongodb_url: str, + mongodb_port: int, + mongodb_datapath: str, + mongodb_logpath: str, + db_name: str = "tuplex-history", +) -> None: """ attempts to connect to a MongoDB database. If no running local MongoDB is found, will auto-start a mongodb database. R aises exception when fails. @@ -717,7 +729,7 @@ def find_or_start_mongodb( check_mongodb_connection(mongodb_url, mongodb_port, db_name) -def log_gunicorn_errors(logpath): +def log_gunicorn_errors(logpath: str) -> None: """ uses logging module to print out gunicorn errors if something went wrong Args: @@ -739,7 +751,9 @@ def log_gunicorn_errors(logpath): logging.error("Gunicorn error log:\n {}".format("".join(lines[first_idx:]))) -def find_or_start_webui(mongo_uri, hostname, port, web_logfile): +def find_or_start_webui( + mongo_uri: str, hostname: str, port: int, web_logfile: str +) -> None: """ tries to connect to Tuplex WebUI. If local uri is specified, autostarts WebUI. Args: @@ -950,7 +964,7 @@ def find_or_start_webui(mongo_uri, hostname, port, web_logfile): "Adding auto-shutdown of process with PID={} (WebUI)".format(ui_pid) ) - def shutdown_gunicorn(pid): + def shutdown_gunicorn(pid: int) -> None: pids_to_kill = [] # iterate over all gunicorn processes and kill them all @@ -991,7 +1005,7 @@ def shutdown_gunicorn(pid): return version_info -def ensure_webui(options): +def ensure_webui(options: dict) -> None: """ Helper function to ensure WebUI/MongoDB is auto-started when webui is specified Args: @@ -1054,7 +1068,7 @@ def ensure_webui(options): webui_uri = webui_url + ":" + str(webui_port) if not webui_uri.startswith("http"): webui_uri = "http://" + webui_uri - print("Tuplex WebUI can be accessed under {}".format(webui_uri)) + _logger.info("Tuplex WebUI can be accessed under {}".format(webui_uri)) except Exception as e: logging.error( "Failed to start or connect to Tuplex WebUI. Details: {}".format(e) diff --git a/tuplex/python/tuplex/utils/globs.py b/tuplex/python/tuplex/utils/globs.py index f938e5035..83c31cb6d 100644 --- a/tuplex/python/tuplex/utils/globs.py +++ b/tuplex/python/tuplex/utils/globs.py @@ -15,6 +15,8 @@ import sys import types import weakref +from types import CodeType +from typing import Any, Callable, List, Tuple # ALWAYS import cloudpickle before dill, b.c. of https://github.com/uqfoundation/dill/issues/383 from cloudpickle.cloudpickle import _get_cell_contents @@ -31,7 +33,7 @@ EXTENDED_ARG = dis.EXTENDED_ARG -def _extract_code_globals(co): +def _extract_code_globals(co: CodeType) -> dict: """ Find all globals names read or written to by codeblock co """ @@ -55,7 +57,7 @@ def _extract_code_globals(co): return out_names -def _find_imported_submodules(code, top_level_dependencies): +def _find_imported_submodules(code: CodeType, top_level_dependencies: List[Any]) -> Any: """ Find currently imported submodules used by a function. Submodules used by a function need to be detected and referenced for the @@ -103,7 +105,7 @@ def func(): return subimports -def _walk_global_ops(code): +def _walk_global_ops(code: Any) -> Any: """ Yield (opcode, argument number) tuples for all global-referencing instructions in *code*. @@ -114,7 +116,7 @@ def _walk_global_ops(code): yield instr.arg, instr.argval -def _function_getstate(func): +def _function_getstate(func: Callable) -> Tuple[dict, dict]: # - Put func's dynamic attributes (stored in func.__dict__) in state. 
These # attributes will be restored at unpickling time using # f.__dict__.update(state) @@ -163,7 +165,7 @@ def _function_getstate(func): # end from cloudpickle -def get_globals(func): +def get_globals(func: Callable) -> dict: _, d = _function_getstate(func) func_globals = d["__globals__"] diff --git a/tuplex/python/tuplex/utils/interactive_shell.py b/tuplex/python/tuplex/utils/interactive_shell.py index 56a929b02..e45e9a5e9 100644 --- a/tuplex/python/tuplex/utils/interactive_shell.py +++ b/tuplex/python/tuplex/utils/interactive_shell.py @@ -17,6 +17,7 @@ import sys from code import InteractiveConsole from types import FunctionType, LambdaType +from typing import Callable, Optional from prompt_toolkit.history import InMemoryHistory @@ -42,11 +43,11 @@ # the idea is basically, we can't simply call 'import tuplex' because this would # lead to a circular import. Yet, for user convenience, simply exposing tuplex.Context should be sufficient! class TuplexModuleHelper: - def __init__(self, context_cls): + def __init__(self, context_cls: "Context") -> None: self._context_cls = context_cls @property - def Context(self): + def Context(self) -> "Context": return self._context_cls @@ -56,15 +57,15 @@ class TuplexShell(InteractiveConsole): # use BORG design pattern to make class singleton alike __shared_state = {} - def __init__(self): + def __init__(self) -> None: self.__dict__ = self.__shared_state def init( self, - locals=None, - filename="", - histfile=os.path.expanduser("~/.console_history"), - ): + locals: Optional[dict] = None, + filename: str = "", + histfile: str = os.path.expanduser("~/.console_history"), + ) -> None: # add dummy helper for context if locals is not None and "Context" in locals.keys(): locals["tuplex"] = TuplexModuleHelper(locals["Context"]) @@ -76,7 +77,7 @@ def init( self._lastLine = "" self.historyDict = {} - def push(self, line): + def push(self, line: str) -> bool: """Push a line to the interpreter. The line should not have a trailing newline; it may have internal newlines. The line is appended to a buffer and the @@ -111,7 +112,7 @@ def push(self, line): return more - def get_lambda_source(self, f): + def get_lambda_source(self, f: Callable) -> str: # Won't this work for functions as well? assert self.initialized, "must call init on TuplexShell object first" @@ -135,7 +136,7 @@ def get_lambda_source(self, f): vault.extractAndPutAllLambdas(src_info, f_filename, f_lineno, f_colno, f_globs) return vault.get(f, f_filename, f_lineno, f_colno, f_globs) - def get_function_source(self, f): + def get_function_source(self, f: Callable) -> str: assert self.initialized, "must call init on TuplexShell object first" assert isinstance(f, FunctionType) and f.__code__.co_name != "", ( @@ -166,13 +167,15 @@ def get_function_source(self, f): logging.error( 'Could not find function "{}" in source'.format(function_name) ) - return None + return "" return source # taken from Lib/code.py # overwritten to customize behaviour - def interact(self, banner=None, exitmsg=None): + def interact( + self, banner: Optional[str] = None, exitmsg: Optional[str] = None + ) -> None: """Closely emulate the interactive Python console. 
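The __shared_state assignment used by TuplexShell above is the Borg idiom in one line: every instance aliases the same attribute dictionary, so state behaves singleton-like while object identities stay distinct. A standalone sketch of the pattern (the interact docstring resumes below):

    class Borg:
        __shared_state = {}

        def __init__(self) -> None:
            # all instances share one __dict__, hence one state
            self.__dict__ = self.__shared_state

    a, b = Borg(), Borg()
    a.history = ["c = Context()"]
    assert b.history is a.history  # shared state
    assert a is not b              # but not a singleton object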
The optional banner argument specifies the banner to print before the first interaction; by default it prints a banner diff --git a/tuplex/python/tuplex/utils/jedi_completer.py b/tuplex/python/tuplex/utils/jedi_completer.py index deecf8517..613b5d7c9 100644 --- a/tuplex/python/tuplex/utils/jedi_completer.py +++ b/tuplex/python/tuplex/utils/jedi_completer.py @@ -9,20 +9,24 @@ # License: Apache 2.0 # # ----------------------------------------------------------------------------------------------------------------------# +from typing import Any, List + from jedi import Interpreter, settings -from prompt_toolkit.completion import Completer, Completion +from prompt_toolkit.completion import CompleteEvent, Completer, Completion +from prompt_toolkit.document import Document class JediCompleter(Completer): """REPL Completer using jedi""" - def __init__(self, get_locals): + def __init__(self, get_locals: Any) -> None: # per default jedi is case insensitive, however we want it to be case sensitive settings.case_insensitive_completion = False - self.get_locals = get_locals - def get_completions(self, document, complete_event): + def get_completions( + self, document: Document, complete_event: CompleteEvent + ) -> List[Completion]: _locals = self.get_locals() interpreter = Interpreter(document.text, [_locals]) diff --git a/tuplex/python/tuplex/utils/jupyter.py b/tuplex/python/tuplex/utils/jupyter.py index 40fa34f70..59272c0a3 100644 --- a/tuplex/python/tuplex/utils/jupyter.py +++ b/tuplex/python/tuplex/utils/jupyter.py @@ -19,14 +19,14 @@ from notebook.notebookapp import list_running_servers -def get_jupyter_notebook_info(): +def get_jupyter_notebook_info() -> dict: """ retrieve infos about the currently running jupyter notebook if possible Returns: dict with several info attributes. If info for current notebook could not be retrieved, returns empty dict """ - def get(url): + def get(url: str) -> dict: req = urllib.request.Request(url, headers={"content-type": "application/json"}) response = urllib.request.urlopen(req) return json.loads(response.read()) diff --git a/tuplex/python/tuplex/utils/reflection.py b/tuplex/python/tuplex/utils/reflection.py index 258da0b27..6397ab8f2 100644 --- a/tuplex/python/tuplex/utils/reflection.py +++ b/tuplex/python/tuplex/utils/reflection.py @@ -11,8 +11,10 @@ import ast import inspect +import logging import re import types +from typing import Callable, List, Tuple, Union # ALWAYS import cloudpickle before dill, b.c. of https://github.com/uqfoundation/dill/issues/383 import dill @@ -29,8 +31,10 @@ # only export get_source function, rest shall be private. __all__ = ["get_source", "get_globals", "supports_lambda_closure"] +_logger = logging.getLogger(__name__) -def get_jupyter_raw_code(function_name): + +def get_jupyter_raw_code(function_name: str) -> str: # Ignore here unresolved reference, get_ipython() works in jupyter notebook. 
history_manager = get_ipython().history_manager # noqa: F821 hist = history_manager.get_range() @@ -55,18 +59,20 @@ def get_jupyter_raw_code(function_name): return matched_cells[-1][2] -def extractFunctionByName(code, func_name, return_linenos=False): +def extractFunctionByName( + code: str, func_name: str, return_linenos: bool = False +) -> Union[str, Tuple[str, int, int]]: class FunctionVisitor(ast.NodeVisitor): - def __init__(self): - self.lastStmtLineno = 0 - self.funcInfo = [] + def __init__(self) -> None: + self.lastStmtLineno: int = 0 + self.funcInfo: List[dict] = [] - def visit_FunctionDef(self, node): - print(self.lastStmtLineno) + def visit_FunctionDef(self, node: ast.FunctionDef) -> None: + _logger.debug(self.lastStmtLineno) self.generic_visit(node) - print(self.lastStmtLineno) + _logger.debug(self.lastStmtLineno) - def visit(self, node): + def visit(self, node: ast.AST) -> None: funcStartLineno = -1 if hasattr(node, "lineno"): self.lastStmtLineno = node.lineno @@ -89,7 +95,7 @@ def visit(self, node): # find function with name candidates = filter(lambda x: x["name"] == func_name, fv.funcInfo) - def indent(s): + def indent(s: str) -> int: return len(s) - len(s.lstrip(" \t")) lines = code.split("\n") @@ -106,9 +112,9 @@ def indent(s): return func_code -def extract_function_code(function_name, raw_code): +def extract_function_code(function_name: str, raw_code: str) -> str: # remove greedily up to num_tabs and num_spaces - def remove_tabs_and_spaces(line, num_tabs, num_spaces): + def remove_tabs_and_spaces(line: str, num_tabs: int, num_spaces: int) -> str: t = 0 s = 0 pos = 0 @@ -147,7 +153,7 @@ def remove_tabs_and_spaces(line, num_tabs, num_spaces): return extractFunctionByName(out, function_name) -def get_function_code(f): +def get_function_code(f: Callable) -> str: """jupyter notebook, retrieve function history""" assert isinstance(f, types.FunctionType) function_name = f.__code__.co_name @@ -175,7 +181,7 @@ def get_function_code(f): vault = SourceVault() -def get_source(f): +def get_source(f: Callable) -> str: """Jupyter notebook code reflection""" if isinstance(f, types.FunctionType): diff --git a/tuplex/python/tuplex/utils/source_vault.py b/tuplex/python/tuplex/utils/source_vault.py index 6e25e8d60..f95e04c5b 100644 --- a/tuplex/python/tuplex/utils/source_vault.py +++ b/tuplex/python/tuplex/utils/source_vault.py @@ -14,11 +14,12 @@ import os import sys from types import CodeType, LambdaType +from typing import Callable, List, Optional, Tuple import astor -def supports_lambda_closure(): +def supports_lambda_closure() -> bool: """ source code of lambdas can't be extracted, because there's no column information available in code objects. This can be achieved by patching 4 lines in the cpython source code. @@ -31,11 +32,11 @@ def supports_lambda_closure(): return hasattr(f.__code__, "co_firstcolno") -def extract_all_lambdas(tree): +def extract_all_lambdas(tree: ast.AST) -> List[ast.Lambda]: lambdas = [] class Visitor(ast.NodeVisitor): - def visit_Lambda(self, node): + def visit_Lambda(self, node: ast.Lambda) -> None: lambdas.append(node) Visitor().visit(tree) @@ -45,11 +46,11 @@ def visit_Lambda(self, node): # extract for lambda incl. default values # annotations are not possible with the current syntax... 
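For intuition on the AST walk that extractFunctionByName performs above, a reduced sketch that only records def names and starting lines, without the visitor's end-of-body tracking or the indentation repair (the lambda extraction helpers continue below; my_udf is a made-up name):

    import ast

    code = "x = 1\ndef my_udf(a):\n    return a + 1\n"
    found = [
        (node.name, node.lineno)
        for node in ast.walk(ast.parse(code))
        if isinstance(node, ast.FunctionDef)
    ]
    print(found)  # [('my_udf', 2)]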
-def args_for_lambda_ast(lam): - return [getattr(n, "arg") for n in lam.args.args] +def args_for_lambda_ast(lam: Callable) -> List[str]: + return [n.arg for n in lam.args.args] -def gen_code_for_lambda(lam): +def gen_code_for_lambda(lam: Callable) -> str: # surround in try except if user provided malformed lambdas try: s = astor.to_source(lam) @@ -80,7 +81,7 @@ def gen_code_for_lambda(lam): return "" -def hash_code_object(code): +def hash_code_object(code: CodeType) -> bytes: # can't take the full object because this includes memory addresses # need to hash contents # for this use bytecode, varnames & constants @@ -97,8 +98,7 @@ def hash_code_object(code): return ret + b")" -# join lines and remove stupid \\n -def remove_line_breaks(source_lines): +def remove_line_breaks(source_lines: List[str]) -> str: """ expressions may be defined over multiple line using \ in python. This function removes this and joins lines. Args: @@ -130,22 +130,21 @@ class SourceVault: # borg pattern __shared_state = {} - def __init__(self): + def __init__(self) -> None: self.__dict__ = self.__shared_state self.lambdaDict = {} # new: lookup via filename, lineno and colno self.lambdaFileDict = {} - # def get(self, obj): - # """ - # returns source code for given object - # :param codeboj: - # :return: - # """ - # assert isinstance(obj, LambdaType), 'object needs to be a lambda object' - # return self.lambdaDict[hash_code_object(obj.__code__)] - def get(self, ftor, filename, lineno, colno, globs): + def get( + self, + ftor: Callable, + filename: str, + lineno: int, + colno: Optional[int], + globs: dict, + ) -> str: assert isinstance(ftor, LambdaType), "object needs to be a lambda object" # perform multiway lookup for code @@ -184,7 +183,14 @@ def get(self, ftor, filename, lineno, colno, globs): else: raise KeyError("could not find lambda function") - def extractAndPutAllLambdas(self, src_info, filename, lineno, colno, globals): + def extractAndPutAllLambdas( + self, + src_info: Tuple[List[str], int], + filename: str, + lineno: int, + colno: Optional[int], + globals: dict, + ) -> None: """ extracts the source code from all lambda functions and stores them in the source vault :param source: diff --git a/tuplex/python/tuplex/utils/tracebacks.py b/tuplex/python/tuplex/utils/tracebacks.py index eb5ba3aed..6b7e82789 100644 --- a/tuplex/python/tuplex/utils/tracebacks.py +++ b/tuplex/python/tuplex/utils/tracebacks.py @@ -12,21 +12,23 @@ import linecache import re import traceback +from types import TracebackType +from typing import Any, Callable from .reflection import get_source __all__ = ["traceback_from_udf"] -def format_traceback(tb, function_name): +def format_traceback(tb: TracebackType, function_name: str) -> str: """ helper function to format a traceback object with line numbers relative to function definition Args: - tb: - function_name: + tb: traceback object + function_name: name of function to add to traceback Returns: - + formatted traceback string """ fnames = set() @@ -67,7 +69,7 @@ def format_traceback(tb, function_name): # get traceback from sample -def traceback_from_udf(udf, x): +def traceback_from_udf(udf: Callable, x: Any) -> str: """ get a formatted traceback as string by executing a udf over a sample Args: From 0c42c18361b0c959532cd389153ed7f5073135d1 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Sat, 15 Mar 2025 20:36:22 -0700 Subject: [PATCH 6/8] Add check for pyarrow to avoid segfault when pyarrow is loaded in the process (#154) Issue https://github.com/aws/aws-sdk-cpp/issues/2699 happens 
when pyarrow and tuplex are used together. If you import tuplex first and then pyarrow, there is no problem. However, if you import pyarrow first and then tuplex, the bug is triggered in pyarrow, leading to a segfault of the process. While the primary goal is not to fix bugs in 3rd party libraries, pyarrow's prevalence warrants issuing an error. Other: Adds AWS SDK shutdown on process exit. --- scripts/azure/install_azure_ci_reqs.sh | 2 +- scripts/docker/ci/install_tuplex_reqs.sh | 2 +- scripts/generate_scripts.py | 2 +- scripts/macos/install_antlr4_cpp_runtime.sh | 2 +- scripts/macos/install_aws-sdk-cpp.sh | 2 +- scripts/ubuntu2004/install_requirements.sh | 2 +- scripts/ubuntu2204/install_requirements.sh | 2 +- tuplex/CMakeLists.txt | 1 + tuplex/io/include/AWSCommon.h | 7 + tuplex/io/src/AWSCommon.cc | 6 + tuplex/python/CMakeLists.txt | 1 + tuplex/python/include/PythonCommon.h | 13 + tuplex/python/src/PythonBindings.cc | 22 + tuplex/python/src/PythonCommon.cc | 8 + tuplex/python/tuplex/context.py | 5 + tuplex/python/tuplex/utils/common.py | 30 ++ tuplex/python/tuplex/utils/dllist.py | 492 ++++++++++++++++++++ 17 files changed, 592 insertions(+), 7 deletions(-) create mode 100644 tuplex/python/tuplex/utils/dllist.py diff --git a/scripts/azure/install_azure_ci_reqs.sh b/scripts/azure/install_azure_ci_reqs.sh index ad9b9ba01..a455f1a34 100644 --- a/scripts/azure/install_azure_ci_reqs.sh +++ b/scripts/azure/install_azure_ci_reqs.sh @@ -114,7 +114,7 @@ mkdir -p ${WORKDIR}/antlr && cd ${WORKDIR}/antlr \ && make -j$(nproc) && make install mkdir -p ${WORKDIR}/aws && cd ${WORKDIR}/aws \ && git clone --recurse-submodules https://github.com/aws/aws-sdk-cpp.git \ -&& cd aws-sdk-cpp && git checkout tags/1.11.164 && mkdir build && cd build \ +&& cd aws-sdk-cpp && git checkout tags/1.11.524 && mkdir build && cd build \ && cmake -DCMAKE_BUILD_TYPE=Release -DUSE_OPENSSL=ON -DENABLE_TESTING=OFF -DENABLE_UNITY_BUILD=ON -DCPP_STANDARD=17 -DBUILD_SHARED_LIBS=OFF -DBUILD_ONLY="s3;core;lambda;transfer" -DCMAKE_INSTALL_PREFIX=${PREFIX} ..
\ && make -j$(nproc) \ && make install diff --git a/scripts/docker/ci/install_tuplex_reqs.sh b/scripts/docker/ci/install_tuplex_reqs.sh index 37dc0d2a9..c695b3ead 100644 --- a/scripts/docker/ci/install_tuplex_reqs.sh +++ b/scripts/docker/ci/install_tuplex_reqs.sh @@ -4,7 +4,7 @@ set -euxo pipefail # dependency versions -AWSSDK_CPP_VERSION=1.11.164 +AWSSDK_CPP_VERSION=1.11.524 ANTLR4_VERSION=4.13.1 YAML_CPP_VERSION=0.8.0 AWS_LAMBDA_CPP_VERSION=0.2.8 diff --git a/scripts/generate_scripts.py b/scripts/generate_scripts.py index 3fd45b684..46b0e3c32 100755 --- a/scripts/generate_scripts.py +++ b/scripts/generate_scripts.py @@ -23,7 +23,7 @@ def configure_versions(osname): VERSIONS['YAMLCPP_VERSION'] = '0.8.0' VERSIONS['CELERO_VERSION'] = '2.8.3' VERSIONS['ANTLR_VERSION'] = '4.13.1' - VERSIONS['AWSSDK_VERSION'] = '1.11.164' + VERSIONS['AWSSDK_VERSION'] = '1.11.524' VERSIONS['AWSLAMBDACPP_VERSION'] = '0.2.8' VERSIONS['PCRE2_VERSION'] = '10.42' VERSIONS['PROTOBUF_VERSION'] = '24.3' diff --git a/scripts/macos/install_antlr4_cpp_runtime.sh b/scripts/macos/install_antlr4_cpp_runtime.sh index 194ae5397..777a83495 100644 --- a/scripts/macos/install_antlr4_cpp_runtime.sh +++ b/scripts/macos/install_antlr4_cpp_runtime.sh @@ -58,6 +58,6 @@ git clone https://github.com/antlr/antlr4.git \ ls -l $PREFIX/include ls -l $PREFIX/lib -cp lib/libantlr4-runtime.dylib /Users/runner/work/tuplex/tuplex/libantlr4-runtime.dylib +cp $PREFIX/lib/libantlr4-runtime.dylib /Users/runner/work/tuplex/tuplex/libantlr4-runtime.dylib || echo "cp failed." exit 0 diff --git a/scripts/macos/install_aws-sdk-cpp.sh b/scripts/macos/install_aws-sdk-cpp.sh index 2e6aa7a29..bdf7fadd9 100755 --- a/scripts/macos/install_aws-sdk-cpp.sh +++ b/scripts/macos/install_aws-sdk-cpp.sh @@ -3,7 +3,7 @@ set -euxo pipefail PREFIX=${PREFIX:-/usr/local} -AWSSDK_CPP_VERSION=1.11.164 +AWSSDK_CPP_VERSION=1.11.524 # need at least 1.11.267 because of pyarrow bugs... # check if dir exists (i.e. restored from cache, then skip) if [ -d "${PREFIX}/include/aws" ]; then diff --git a/scripts/ubuntu2004/install_requirements.sh b/scripts/ubuntu2004/install_requirements.sh index 2f36e1b3a..3d3aab12a 100755 --- a/scripts/ubuntu2004/install_requirements.sh +++ b/scripts/ubuntu2004/install_requirements.sh @@ -137,7 +137,7 @@ mkdir -p ${WORKDIR}/protobuf && cd ${WORKDIR}/protobuf && git clone -b v24.3 htt echo ">> Installing AWS SDK" mkdir -p ${WORKDIR}/aws && cd ${WORKDIR}/aws \ && git clone --recurse-submodules https://github.com/aws/aws-sdk-cpp.git \ -&& cd aws-sdk-cpp && git checkout tags/1.11.164 && mkdir build && cd build \ +&& cd aws-sdk-cpp && git checkout tags/1.11.524 && mkdir build && cd build \ && cmake -DCMAKE_BUILD_TYPE=Release -DUSE_OPENSSL=ON -DENABLE_TESTING=OFF -DENABLE_UNITY_BUILD=ON -DCPP_STANDARD=17 -DBUILD_SHARED_LIBS=OFF -DBUILD_ONLY="s3;core;lambda;transfer" -DCMAKE_INSTALL_PREFIX=${PREFIX} .. 
\ && make -j$(nproc) \ && make install diff --git a/scripts/ubuntu2204/install_requirements.sh b/scripts/ubuntu2204/install_requirements.sh index 154c83a71..ab57ab87a 100755 --- a/scripts/ubuntu2204/install_requirements.sh +++ b/scripts/ubuntu2204/install_requirements.sh @@ -133,7 +133,7 @@ mkdir -p ${WORKDIR}/protobuf && cd ${WORKDIR}/protobuf && git clone -b v24.3 htt echo ">> Installing AWS SDK" mkdir -p ${WORKDIR}/aws && cd ${WORKDIR}/aws \ && git clone --recurse-submodules https://github.com/aws/aws-sdk-cpp.git \ -&& cd aws-sdk-cpp && git checkout tags/1.11.164 && mkdir build && cd build \ +&& cd aws-sdk-cpp && git checkout tags/1.11.524 && mkdir build && cd build \ && cmake -DCMAKE_BUILD_TYPE=Release -DUSE_OPENSSL=ON -DENABLE_TESTING=OFF -DENABLE_UNITY_BUILD=ON -DCPP_STANDARD=17 -DBUILD_SHARED_LIBS=OFF -DBUILD_ONLY="s3;core;lambda;transfer" -DCMAKE_INSTALL_PREFIX=${PREFIX} .. \ && make -j$(nproc) \ && make install diff --git a/tuplex/CMakeLists.txt b/tuplex/CMakeLists.txt index 1da90748b..dff4df040 100755 --- a/tuplex/CMakeLists.txt +++ b/tuplex/CMakeLists.txt @@ -325,6 +325,7 @@ if(BUILD_WITH_AWS) endif() endif() find_package(AWSSDK REQUIRED COMPONENTS s3 core lambda transfer) + message(STATUS "AWS SDK version: ${AWSSDK_VERSION}") message(STATUS "AWS libs: ${AWSSDK_LINK_LIBRARIES}") message(STATUS "AWS include dirs: ${AWSSDK_INCLUDE_DIR}") if(AWSSDK_FOUND) diff --git a/tuplex/io/include/AWSCommon.h b/tuplex/io/include/AWSCommon.h index 6d01f5b4f..a1c98ac68 100644 --- a/tuplex/io/include/AWSCommon.h +++ b/tuplex/io/include/AWSCommon.h @@ -60,6 +60,13 @@ namespace tuplex { * @return true/false. */ extern bool isValidAWSZone(const std::string& zone); + + /*! + * Use this function to suggest to Tuplex the state of the AWS SDK, e.g. if in the process the + * Aws sdk is already initialized in some form. 
+ * @param overrideAwssdkInitializedValue + */ + extern void setExternalAwssdk(bool overrideAwssdkInitializedValue); } // Amazon frequently changes the parameters of lambda functions, diff --git a/tuplex/io/src/AWSCommon.cc b/tuplex/io/src/AWSCommon.cc index 801b59d12..f8147c9ba 100644 --- a/tuplex/io/src/AWSCommon.cc +++ b/tuplex/io/src/AWSCommon.cc @@ -33,6 +33,12 @@ static std::string throw_if_missing_envvar(const std::string &name) { static bool isAWSInitialized = false; static Aws::SDKOptions aws_options; +namespace tuplex { + void setExternalAwssdk(bool overrideAwssdkInitializedValue) { + isAWSInitialized = overrideAwssdkInitializedValue; + } +} + // for Lambda, check: https://docs.aws.amazon.com/code-samples/latest/catalog/cpp-lambda-lambda_example.cpp.html // https://sdk.amazonaws.com/cpp/api/LATEST/class_aws_1_1_utils_1_1_logging_1_1_formatted_log_system.html diff --git a/tuplex/python/CMakeLists.txt b/tuplex/python/CMakeLists.txt index da912e8e9..abb2bcaf3 100644 --- a/tuplex/python/CMakeLists.txt +++ b/tuplex/python/CMakeLists.txt @@ -144,6 +144,7 @@ FILE(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/__init__.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/tracebacks.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/version.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/globs.py + ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/dllist.py DESTINATION ${PYTHON_DIST_DIR}/tuplex/utils) FILE(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tests/test_tuples.py diff --git a/tuplex/python/include/PythonCommon.h b/tuplex/python/include/PythonCommon.h index f34a4b832..205f2b63d 100644 --- a/tuplex/python/include/PythonCommon.h +++ b/tuplex/python/include/PythonCommon.h @@ -154,6 +154,19 @@ namespace tuplex { return py::cast(listObj); } + + extern py::object getPythonVersion(); + +#ifndef BUILD_WITH_AWS + // if not building with aws, define dummy function, else this function lives in AWSCommon.h/cc + /*! + * Use this function to suggest to Tuplex the state of the AWS SDK, e.g. if in the process the + * Aws sdk is already initialized in some form. + * @param overrideAwssdkInitializedValue + */ + inline void setExternalAwssdk(bool overrideAwssdkInitializedValue) { + } +#endif } #endif //TUPLEX_PYTHONCOMMON_H diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index 7909e5e8f..8100e6f6d 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -39,6 +39,24 @@ PYMODULE { m.attr("__version__") = "dev"; #endif + // Perform cleanup (e.g., AWS SDK shutdown if necessary to avoid an endless loop) + // Register a callback function that is invoked when the BaseClass object is collected + // cf.
https://pybind11.readthedocs.io/en/stable/advanced/misc.html + auto cleanup_callback = []() { + // perform cleanup here -- this function is called with the GIL held + // std::cout<<"Pybind11 clean up call here."<(m, "_DataSet") @@ -94,4 +112,8 @@ PYMODULE { m.def("registerLoggingCallback", &tuplex::registerPythonLoggingCallback); m.def("registerWithInterpreter", &python::registerWithInterpreter); + + m.def("getPythonVersion", &tuplex::getPythonVersion); + + m.def("setExternalAwssdk", &tuplex::setExternalAwssdk); } \ No newline at end of file diff --git a/tuplex/python/src/PythonCommon.cc b/tuplex/python/src/PythonCommon.cc index 6e064ff9f..a510ecf7c 100644 --- a/tuplex/python/src/PythonCommon.cc +++ b/tuplex/python/src/PythonCommon.cc @@ -20,6 +20,14 @@ backward::SignalHandling sh; #endif namespace tuplex { + + py::object getPythonVersion() { + std::stringstream ss; + ss< None: # log gunicorn errors for local startup if os.path.isfile(gunicorn_logpath) and "localhost" == webui_url: log_gunicorn_errors(gunicorn_logpath) + + +def pyarrow_aws_sdk_cpp_check() -> None: + """Help avoid a pyarrow issue (frequent, because pyarrow is shipped very often). + Call this function BEFORE initializing the _Context object from the tuplex C extension object.""" + # Newer PyArrow versions use a more recent version of the AWS SDK, which leads to pyarrow crashing + # other libraries under macOS. Warn here explicitly about this to avoid a segfault, and provide an error. + + if os.name == "posix" and sys.platform == "darwin": + loaded_shared_objects = dllist() + pyarrow_loaded = any("pyarrow/lib" in path for path in loaded_shared_objects) + + if pyarrow_loaded: + import pyarrow as pa + + pyarrow_version = [int(v) for v in pa.__version__.split(".")] + + # PyArrow has since v13 a bug that crashes other libraries due to bad use of the AWS SDK. + # cf. https://github.com/aws/aws-sdk-cpp/issues/2699 which has been merged, + # but whose solution has not been reflected in pyarrow yet. + # Display an actionable message to the user on what to do. + if pyarrow_version[0] >= 13: + raise RuntimeError( + f"PyArrow {pa.__version__} present in process and loaded or imported before tuplex." + " If you need to import/load pyarrow first, this works only with pyarrow versions < 13.0.0." + " If you must use pyarrow >= 13.0.0, import tuplex first and then load pyarrow. " + "Note that pyarrow < 13.0.0 is not compatible with numpy >= 2.0." + ) diff --git a/tuplex/python/tuplex/utils/dllist.py b/tuplex/python/tuplex/utils/dllist.py new file mode 100644 index 000000000..0978bdc79 --- /dev/null +++ b/tuplex/python/tuplex/utils/dllist.py @@ -0,0 +1,492 @@ +# This is code scheduled to be part of Python 3.14+ from https://github.com/python/cpython/pull/122946/files, ported back here for use to avoid the pyarrow bug +# where the AWS SDK is always initialized and thus may lead to issues with tuplex. + +import os +import shutil +import subprocess +import sys +from typing import Any, List, Optional, Union + +# find_library(name) returns the pathname of a library, or None. +if os.name == "nt": + + def _get_build_version() -> Union[None, int, float]: + """Return the version of MSVC that was used to build Python. + + For Python 2.3 and up, the version number is included in + sys.version. For earlier versions, assume the compiler is MSVC 6. + """ + # This function was copied from Lib/distutils/msvccompiler.py + prefix = "MSC v."
+ i = sys.version.find(prefix) + if i == -1: + return 6 + i = i + len(prefix) + s, rest = sys.version[i:].split(" ", 1) + majorVersion = int(s[:-2]) - 6 + if majorVersion >= 13: + majorVersion += 1 + minorVersion = int(s[2:3]) / 10.0 + # I don't think paths are affected by minor version in version 6 + if majorVersion == 6: + minorVersion = 0 + if majorVersion >= 6: + return majorVersion + minorVersion + # else we don't know what version of the compiler this is + return None + + def find_msvcrt() -> Optional[str]: + """Return the name of the VC runtime dll""" + version = _get_build_version() + if version is None: + # better be safe than sorry + return None + if version <= 6: + clibname = "msvcrt" + elif version <= 13: + clibname = "msvcr%d" % (version * 10) + else: + # CRT is no longer directly loadable. See issue23606 for the + # discussion about alternative approaches. + return None + + # If python was built with in debug mode + import importlib.machinery + + if "_d.pyd" in importlib.machinery.EXTENSION_SUFFIXES: + clibname += "d" + return clibname + ".dll" + + def find_library(name: str) -> Optional[str]: + if name in ("c", "m"): + return find_msvcrt() + # See MSDN for the REAL search order. + for directory in os.environ["PATH"].split(os.pathsep): + fname = os.path.join(directory, name) + if os.path.isfile(fname): + return fname + if fname.lower().endswith(".dll"): + continue + fname = fname + ".dll" + if os.path.isfile(fname): + return fname + return None + + # Listing loaded DLLs on Windows relies on the following APIs: + # https://learn.microsoft.com/windows/win32/api/psapi/nf-psapi-enumprocessmodules + # https://learn.microsoft.com/windows/win32/api/libloaderapi/nf-libloaderapi-getmodulefilenamew + import ctypes + from ctypes import wintypes + + _kernel32 = ctypes.WinDLL("kernel32", use_last_error=True) + _get_current_process = _kernel32["GetCurrentProcess"] + _get_current_process.restype = wintypes.HANDLE + + _k32_get_module_file_name = _kernel32["GetModuleFileNameW"] + _k32_get_module_file_name.restype = wintypes.DWORD + _k32_get_module_file_name.argtypes = ( + wintypes.HMODULE, + wintypes.LPWSTR, + wintypes.DWORD, + ) + + _psapi = ctypes.WinDLL("psapi", use_last_error=True) + _enum_process_modules = _psapi["EnumProcessModules"] + _enum_process_modules.restype = wintypes.BOOL + _enum_process_modules.argtypes = ( + wintypes.HANDLE, + ctypes.POINTER(wintypes.HMODULE), + wintypes.DWORD, + wintypes.LPDWORD, + ) + + def _get_module_filename(module: wintypes.HMODULE) -> Optional[str]: + name = (wintypes.WCHAR * 32767)() # UNICODE_STRING_MAX_CHARS + if _k32_get_module_file_name(module, name, len(name)): + return name.value + return None + + def _get_module_handles() -> List[Any]: + process = _get_current_process() + space_needed = wintypes.DWORD() + n = 1024 + while True: + modules = (wintypes.HMODULE * n)() + if not _enum_process_modules( + process, modules, ctypes.sizeof(modules), ctypes.byref(space_needed) + ): + err = ctypes.get_last_error() + msg = ctypes.FormatError(err).strip() + raise ctypes.WinError(err, f"EnumProcessModules failed: {msg}") + n = space_needed.value // ctypes.sizeof(wintypes.HMODULE) + if n <= len(modules): + return modules[:n] + + def dllist() -> List[str]: + """Return a list of loaded shared libraries in the current process.""" + modules = _get_module_handles() + libraries = [ + name for h in modules if (name := _get_module_filename(h)) is not None + ] + return libraries + +elif os.name == "posix" and sys.platform in {"darwin", "ios", "tvos", "watchos"}: + from 
ctypes.macholib.dyld import dyld_find as _dyld_find + + def find_library(name: str) -> Optional[str]: + possible = [ + "lib%s.dylib" % name, + "%s.dylib" % name, + "%s.framework/%s" % (name, name), + ] + for name in possible: + try: + return _dyld_find(name) + except ValueError: + continue + return None + + # Listing loaded libraries on Apple systems relies on the following API: + # https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/dyld.3.html + import ctypes + + _libc = ctypes.CDLL(find_library("c")) + _dyld_get_image_name = _libc["_dyld_get_image_name"] + _dyld_get_image_name.restype = ctypes.c_char_p + + def dllist() -> List[str]: + """Return a list of loaded shared libraries in the current process.""" + num_images = _libc._dyld_image_count() + libraries = [ + os.fsdecode(name) + for i in range(num_images) + if (name := _dyld_get_image_name(i)) is not None + ] + + return libraries + +elif sys.platform.startswith("aix"): + # AIX has two styles of storing shared libraries + # GNU auto_tools refer to these as svr4 and aix + # svr4 (System V Release 4) is a regular file, often with .so as suffix + # AIX style uses an archive (suffix .a) with members (e.g., shr.o, libssl.so) + # see issue#26439 and _aix.py for more details + + from ctypes._aix import find_library + +elif sys.platform == "android": + + def find_library(name: str) -> Optional[str]: + directory = "/system/lib" + if "64" in os.uname().machine: + directory += "64" + + fname = f"{directory}/lib{name}.so" + return fname if os.path.isfile(fname) else None + +elif os.name == "posix": + # Andreas Degert's find functions, using gcc, /sbin/ldconfig, objdump + import re + import tempfile + + def _is_elf(filename: str) -> bool: + "Return True if the given file is an ELF file" + elf_header = b"\x7fELF" + try: + with open(filename, "br") as thefile: + return thefile.read(4) == elf_header + except FileNotFoundError: + return False + + def _findLib_gcc(name: str) -> Optional[str]: + # Run GCC's linker with the -t (aka --trace) option and examine the + # library name it prints out. The GCC command will fail because we + # haven't supplied a proper program with main(), but that does not + # matter. + expr = os.fsencode(r"[^\(\)\s]*lib%s\.[^\(\)\s]*" % re.escape(name)) + + c_compiler = shutil.which("gcc") + if not c_compiler: + c_compiler = shutil.which("cc") + if not c_compiler: + # No C compiler available, give up + return None + + temp = tempfile.NamedTemporaryFile() + try: + args = [c_compiler, "-Wl,-t", "-o", temp.name, "-l" + name] + + env = dict(os.environ) + env["LC_ALL"] = "C" + env["LANG"] = "C" + try: + proc = subprocess.Popen( + args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env + ) + except OSError: # E.g. bad executable + return None + with proc: + trace = proc.stdout.read() + finally: + try: + temp.close() + except FileNotFoundError: + # Raised if the file was already removed, which is the normal + # behaviour of GCC if linking fails + pass + res = re.findall(expr, trace) + if not res: + return None + + for file in res: + # Check if the given file is an elf file: gcc can report + # some files that are linker scripts and not actual + # shared objects. 
See bpo-41976 for more details + if not _is_elf(file): + continue + return os.fsdecode(file) + + if sys.platform == "sunos5": + # use /usr/ccs/bin/dump on solaris + def _get_soname(f: Any) -> Optional[str]: + if not f: + return None + + try: + proc = subprocess.Popen( + ("/usr/ccs/bin/dump", "-Lpv", f), + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ) + except OSError: # E.g. command not found + return None + with proc: + data = proc.stdout.read() + res = re.search(rb"\[.*\]\sSONAME\s+([^\s]+)", data) + if not res: + return None + return os.fsdecode(res.group(1)) + else: + + def _get_soname(f: Any) -> Optional[str]: + # assuming GNU binutils / ELF + if not f: + return None + objdump = shutil.which("objdump") + if not objdump: + # objdump is not available, give up + return None + + try: + proc = subprocess.Popen( + (objdump, "-p", "-j", ".dynamic", f), + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ) + except OSError: # E.g. bad executable + return None + with proc: + dump = proc.stdout.read() + res = re.search(rb"\sSONAME\s+([^\s]+)", dump) + if not res: + return None + return os.fsdecode(res.group(1)) + + if sys.platform.startswith(("freebsd", "openbsd", "dragonfly")): + + def _num_version(libname: str) -> List[int]: + # "libxyz.so.MAJOR.MINOR" => [ MAJOR, MINOR ] + parts = libname.split(b".") + nums = [] + try: + while parts: + nums.insert(0, int(parts.pop())) + except ValueError: + pass + return nums or [sys.maxsize] + + def find_library(name: str) -> Optional[str]: + ename = re.escape(name) + expr = r":-l%s\.\S+ => \S*/(lib%s\.\S+)" % (ename, ename) + expr = os.fsencode(expr) + + try: + proc = subprocess.Popen( + ("/sbin/ldconfig", "-r"), + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ) + except OSError: # E.g. command not found + data = b"" + else: + with proc: + data = proc.stdout.read() + + res = re.findall(expr, data) + if not res: + return _get_soname(_findLib_gcc(name)) + res.sort(key=_num_version) + return os.fsdecode(res[-1]) + + elif sys.platform == "sunos5": + + def _findLib_crle(name: str, is64: bool) -> Optional[str]: + if not os.path.exists("/usr/bin/crle"): + return None + + env = dict(os.environ) + env["LC_ALL"] = "C" + + if is64: + args = ("/usr/bin/crle", "-64") + else: + args = ("/usr/bin/crle",) + + paths = None + try: + proc = subprocess.Popen( + args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, env=env + ) + except OSError: # E.g. 
bad executable + return None + with proc: + for line in proc.stdout: + line = line.strip() + if line.startswith(b"Default Library Path (ELF):"): + paths = os.fsdecode(line).split()[4] + + if not paths: + return None + + for dir in paths.split(":"): + libfile = os.path.join(dir, "lib%s.so" % name) + if os.path.exists(libfile): + return libfile + + return None + + def find_library(name: str, is64: bool = False) -> Optional[str]: + return _get_soname(_findLib_crle(name, is64) or _findLib_gcc(name)) + + else: + + def _findSoname_ldconfig(name: str) -> Optional[str]: + import struct + + if struct.calcsize("l") == 4: + machine = os.uname().machine + "-32" + else: + machine = os.uname().machine + "-64" + mach_map = { + "x86_64-64": "libc6,x86-64", + "ppc64-64": "libc6,64bit", + "sparc64-64": "libc6,64bit", + "s390x-64": "libc6,64bit", + "ia64-64": "libc6,IA-64", + } + abi_type = mach_map.get(machine, "libc6") + + # XXX assuming GLIBC's ldconfig (with option -p) + regex = r"\s+(lib%s\.[^\s]+)\s+\(%s" + regex = os.fsencode(regex % (re.escape(name), abi_type)) + try: + with subprocess.Popen( + ["/sbin/ldconfig", "-p"], + stdin=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + stdout=subprocess.PIPE, + env={"LC_ALL": "C", "LANG": "C"}, + ) as p: + res = re.search(regex, p.stdout.read()) + if res: + return os.fsdecode(res.group(1)) + except OSError: + pass + + def _findLib_ld(name: str) -> Optional[str]: + # See issue #9998 for why this is needed + expr = r"[^\(\)\s]*lib%s\.[^\(\)\s]*" % re.escape(name) + cmd = ["ld", "-t"] + libpath = os.environ.get("LD_LIBRARY_PATH") + if libpath: + for d in libpath.split(":"): + cmd.extend(["-L", d]) + cmd.extend(["-o", os.devnull, "-l%s" % name]) + result = None + try: + p = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ) + out, _ = p.communicate() + res = re.findall(expr, os.fsdecode(out)) + for file in res: + # Check if the given file is an elf file: gcc can report + # some files that are linker scripts and not actual + # shared objects. See bpo-41976 for more details + if not _is_elf(file): + continue + return os.fsdecode(file) + except Exception: + pass # result will be None + return result + + def find_library(name: str) -> Optional[str]: + # See issue #9998 + return ( + _findSoname_ldconfig(name) + or _get_soname(_findLib_gcc(name)) + or _get_soname(_findLib_ld(name)) + ) + + +# Listing loaded libraries on other systems will try to use +# functions common to Linux and a few other Unix-like systems. 
+# See the following for several platforms' documentation of the same API: +# https://man7.org/linux/man-pages/man3/dl_iterate_phdr.3.html +# https://man.freebsd.org/cgi/man.cgi?query=dl_iterate_phdr +# https://man.openbsd.org/dl_iterate_phdr +# https://docs.oracle.com/cd/E88353_01/html/E37843/dl-iterate-phdr-3c.html +if os.name == "posix" and sys.platform not in {"darwin", "ios", "tvos", "watchos"}: + import ctypes + + if hasattr((_libc := ctypes.CDLL(None)), "dl_iterate_phdr"): + + class _dl_phdr_info(ctypes.Structure): + _fields_ = [ + ("dlpi_addr", ctypes.c_void_p), + ("dlpi_name", ctypes.c_char_p), + ("dlpi_phdr", ctypes.c_void_p), + ("dlpi_phnum", ctypes.c_ushort), + ] + + _dl_phdr_callback = ctypes.CFUNCTYPE( + ctypes.c_int, + ctypes.POINTER(_dl_phdr_info), + ctypes.c_size_t, + ctypes.POINTER(ctypes.py_object), + ) + + @_dl_phdr_callback + def _info_callback(info, _size, data) -> int: # noqa: ANN001 + libraries = data.contents.value + name = os.fsdecode(info.contents.dlpi_name) + libraries.append(name) + return 0 + + _dl_iterate_phdr = _libc["dl_iterate_phdr"] + _dl_iterate_phdr.argtypes = [ + _dl_phdr_callback, + ctypes.POINTER(ctypes.py_object), + ] + _dl_iterate_phdr.restype = ctypes.c_int + + def dllist() -> List[str]: + """Return a list of loaded shared libraries in the current process.""" + libraries = [] + _dl_iterate_phdr(_info_callback, ctypes.byref(ctypes.py_object(libraries))) + return libraries + +################################################################ From 0acd57490801ce6eb7d4da8203d6c9fc473175f3 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Tue, 15 Jul 2025 18:04:34 -0700 Subject: [PATCH 7/8] [CI] Orc version update for cmake v4 (#157) Update ORC dependency to 2.1.3, and update snappy and zstd. Fix build errors caused by cmake v4. Set minimum macOS version supported to 13.6. --- .github/workflows/build_wheels.yml | 48 ++++++++++++------------ scripts/azure/install_azure_ci_reqs.sh | 5 ++- scripts/macos/brew_dependencies.sh | 2 + tuplex/CMakeLists.txt | 3 +- tuplex/cmake/ExternalAntlr4Cpp.cmake | 2 +- tuplex/cmake/FindPythonInterpreter.cmake | 2 +- tuplex/cmake/ucm.cmake | 2 +- tuplex/codegen/src/Pipe.cc | 15 +++++++- tuplex/io/CMakeLists.txt | 41 +++++++++++++------- 9 files changed, 77 insertions(+), 43 deletions(-) diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 2e551f515..682c184dc 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -4,7 +4,7 @@ on: [push, pull_request, workflow_dispatch] # For macos, at least 10.13 is required # to avoid issues and since the runners are macos-13 and macos-14: -# -> use 13.0, which is Venture from 2022 and 14.0 on the arm runners. +# -> use 13.6, which is Ventura from 2022 and 14.0 on the arm runners. jobs: build_wheels: @@ -13,6 +13,7 @@ jobs: strategy: matrix: # macos-14 (which is macos-latest) is ARM only. macos-13 is the latest intel runner. + # libunwind from brew has 13.6 as a requirement right now; update the 13 runners accordingly.
os: [ ubuntu-latest, macos-13, macos-14 ] python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] include: @@ -34,27 +35,27 @@ jobs: - os: macos-13 python-version: "3.9" cibw-build: "cp39-macosx_x86_64" - macos-target: "13.0" + macos-target: "13.6" arch: "x86_64" - os: macos-13 python-version: "3.10" cibw-build: "cp310-macosx_x86_64" - macos-target: "13.0" + macos-target: "13.6" arch: "x86_64" - os: macos-13 python-version: "3.11" cibw-build: "cp311-macosx_x86_64" - macos-target: "13.0" + macos-target: "13.6" arch: "x86_64" - os: macos-13 python-version: "3.12" cibw-build: "cp312-macosx_x86_64" - macos-target: "13.0" + macos-target: "13.6" arch: "x86_64" - os: macos-13 python-version: "3.13" cibw-build: "cp313-macosx_x86_64" - macos-target: "13.0" + macos-target: "13.6" arch: "x86_64" - os: macos-14 python-version: "3.9" @@ -84,22 +85,23 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Cache brew dependencies - if: runner.os == 'macOS' - uses: actions/cache@v4.2.1 - with: - # Paths to cache: - # /usr/local/Homebrew - installation folder of Homebrew - # /usr/local/Cellar - installation folder of Homebrew formulae - # /usr/local/Frameworks, /usr/local/bin, /usr/local/opt - contain (links to) binaries installed by Homebrew formulae - path: | - /usr/local/Homebrew - /usr/local/Cellar - /usr/local/Frameworks - /usr/local/bin - /usr/local/opt - # macos13 runners are x86, macos14 are arm. --> use os therefore as cache key. - key: ${{ matrix.os }}-build-cache-${{ hashFiles('./scripts/macos/brew_dependencies.sh') }}-v2 + # disable cache for now. + #- name: Cache brew dependencies + # if: runner.os == 'macOS' + # uses: actions/cache@v4.2.1 + # with: + # # Paths to cache: + # # /usr/local/Homebrew - installation folder of Homebrew + # # /usr/local/Cellar - installation folder of Homebrew formulae + # # /usr/local/Frameworks, /usr/local/bin, /usr/local/opt - contain (links to) binaries installed by Homebrew formulae + # path: | + # /usr/local/Homebrew + # /usr/local/Cellar + # /usr/local/Frameworks + # /usr/local/bin + # /usr/local/opt + # # macos13 runners are x86, macos14 are arm. --> use os therefore as cache key. + # key: ${{ matrix.os }}-build-cache-${{ hashFiles('./scripts/macos/brew_dependencies.sh') }}-v2 - name: Setup python uses: actions/setup-python@v5 @@ -126,7 +128,7 @@ jobs: CIBW_ENVIRONMENT_LINUX: "CMAKE_ARGS='-DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON' LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib64:/opt/lib" # Requires macOS 10.13 at least to build because of C++17 features. - # To avoid issues, simply use 13.0 for now. + # To avoid issues, simply use 13.6 for now. # Fix for Java home from https://github.com/actions/runner-images/discussions/9266. 
# For github actions, $HOME is /Users/runner/ CIBW_ENVIRONMENT_MACOS: "ARCH=${{ matrix.arch }} PREFIX=${HOME}/.local MACOSX_DEPLOYMENT_TARGET=${{ matrix.macos-target }} CMAKE_ARGS='-DCMAKE_PREFIX_PATH=/Users/runner/.local -DCMAKE_MODULE_PATH=/Users/runner/.local/cmake/ -DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON' JAVA_HOME=${JAVA_HOME_11_X64:-$JAVA_HOME_11_arm64}" diff --git a/scripts/azure/install_azure_ci_reqs.sh b/scripts/azure/install_azure_ci_reqs.sh index a455f1a34..18c4dda4b 100644 --- a/scripts/azure/install_azure_ci_reqs.sh +++ b/scripts/azure/install_azure_ci_reqs.sh @@ -18,6 +18,9 @@ export DEBIAN_FRONTEND=noninteractive PREFIX=${PREFIX:-/opt} WORKDIR=${WORKDIR:-/tmp} +export CXXFLAGS="-fPIC" +export CFLAGS="-fPIC" + echo ">> Installing packages into ${PREFIX}" mkdir -p $PREFIX && chmod 0755 $PREFIX mkdir -p $PREFIX/sbin @@ -115,7 +118,7 @@ mkdir -p ${WORKDIR}/antlr && cd ${WORKDIR}/antlr \ mkdir -p ${WORKDIR}/aws && cd ${WORKDIR}/aws \ && git clone --recurse-submodules https://github.com/aws/aws-sdk-cpp.git \ && cd aws-sdk-cpp && git checkout tags/1.11.524 && mkdir build && cd build \ -&& cmake -DCMAKE_BUILD_TYPE=Release -DUSE_OPENSSL=ON -DENABLE_TESTING=OFF -DENABLE_UNITY_BUILD=ON -DCPP_STANDARD=17 -DBUILD_SHARED_LIBS=OFF -DBUILD_ONLY="s3;core;lambda;transfer" -DCMAKE_INSTALL_PREFIX=${PREFIX} .. \ +&& cmake -DCMAKE_CXX_FLAGS="-fPIC" -DCMAKE_BUILD_TYPE=Release -DUSE_OPENSSL=ON -DENABLE_TESTING=OFF -DENABLE_UNITY_BUILD=ON -DCPP_STANDARD=17 -DBUILD_SHARED_LIBS=OFF -DBUILD_ONLY="s3;core;lambda;transfer" -DCMAKE_INSTALL_PREFIX=${PREFIX} .. \ && make -j$(nproc) \ && make install diff --git a/scripts/macos/brew_dependencies.sh b/scripts/macos/brew_dependencies.sh index c2882af16..ac2a5512c 100755 --- a/scripts/macos/brew_dependencies.sh +++ b/scripts/macos/brew_dependencies.sh @@ -8,3 +8,5 @@ brew install openjdk@11 cmake coreutils protobuf zstd zlib libmagic llvm@15 pcre # link (when e.g. used from restoring cache) brew link --overwrite cmake coreutils protobuf zstd zlib libmagic llvm@15 pcre2 gflags yaml-cpp celero wget boost googletest libdwarf libelf abseil + +echo "Done!" \ No newline at end of file diff --git a/tuplex/CMakeLists.txt b/tuplex/CMakeLists.txt index dff4df040..8abb640d0 100755 --- a/tuplex/CMakeLists.txt +++ b/tuplex/CMakeLists.txt @@ -849,7 +849,8 @@ set(EXTERNAL_INSTALL_LOCATION ${CMAKE_BINARY_DIR}/third_party) # external libs to build / download set(ZLIB_VERSION "1.2.11") # which zlib version to use -set(ZSTD_VERSION "1.5.5") # which zstd version to use +set(ZSTD_VERSION "1.5.7") # which zstd version to use +set(SNAPPY_VERSION "1.2.2") # which snappy version to use set(BUILD_AND_DOWNLOAD_ZLIB True) set(BUILD_AND_DOWNLOAD_ZSTD True) diff --git a/tuplex/cmake/ExternalAntlr4Cpp.cmake b/tuplex/cmake/ExternalAntlr4Cpp.cmake index caaffc451..e4b8d1ae4 100755 --- a/tuplex/cmake/ExternalAntlr4Cpp.cmake +++ b/tuplex/cmake/ExternalAntlr4Cpp.cmake @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.12 FATAL_ERROR) +cmake_minimum_required(VERSION 3.20 FATAL_ERROR) if(${CMAKE_VERSION} VERSION_GREATER "3.19.0") cmake_policy(SET CMP0114 OLD) # policy update in 3.19, explicitly use old behavior diff --git a/tuplex/cmake/FindPythonInterpreter.cmake b/tuplex/cmake/FindPythonInterpreter.cmake index 370f92fc0..65fbcca2a 100644 --- a/tuplex/cmake/FindPythonInterpreter.cmake +++ b/tuplex/cmake/FindPythonInterpreter.cmake @@ -4,7 +4,7 @@ # source tree. 
 # from https://github.com/Krzmbrzl/FindPythonInterpreter/blob/main/FindPythonInterpreter.cmake
-cmake_minimum_required(VERSION 3.5)
+cmake_minimum_required(VERSION 3.20)
 
 function(find_python_interpreter)
     set(options REQUIRED EXACT)
diff --git a/tuplex/cmake/ucm.cmake b/tuplex/cmake/ucm.cmake
index 651464872..99eb054b7 100644
--- a/tuplex/cmake/ucm.cmake
+++ b/tuplex/cmake/ucm.cmake
@@ -10,7 +10,7 @@
 # The documentation can be found at the library's page:
 # https://github.com/onqtam/ucm
 
-cmake_minimum_required(VERSION 2.8.12)
+cmake_minimum_required(VERSION 3.20)
 
 include(CMakeParseArguments)
 
diff --git a/tuplex/codegen/src/Pipe.cc b/tuplex/codegen/src/Pipe.cc
index 4e978eda3..ce1805713 100644
--- a/tuplex/codegen/src/Pipe.cc
+++ b/tuplex/codegen/src/Pipe.cc
@@ -10,7 +10,20 @@
 
 #include
 #include
+
+#include <boost/version.hpp>
+#define BOOST_PROCESS_VERSION 1
+#if BOOST_VERSION < 108800
 #include <boost/process.hpp>
+namespace bp_process = boost::process;
+#else
+#include
+#include
+#include
+#include
+namespace bp_process = boost::process::v1;
+#endif
+
 #include
 #include
 #include
@@ -20,7 +33,7 @@
 
 int Pipe::pipe(const std::string& file_input, const std::string& tmpdir) {
     try {
-        using namespace boost::process;
+        using namespace bp_process;
 
         ipstream pipe_stdout;
         ipstream pipe_stderr;
diff --git a/tuplex/io/CMakeLists.txt b/tuplex/io/CMakeLists.txt
index 144dbdd82..d7ffd541a 100644
--- a/tuplex/io/CMakeLists.txt
+++ b/tuplex/io/CMakeLists.txt
@@ -16,6 +16,9 @@ message(STATUS "Found LibMagic ${LibMagic_INCLUDE_DIR}, ${LibMagic_LIBRARIES}")
 include_directories("include")
 include_directories(${Boost_INCLUDE_DIR})
 
+# Check what the lib suffix is.
+include(GNUInstallDirs)
+message(STATUS "GNUInstallDirs lib dir (should be lib or lib64): ${CMAKE_INSTALL_LIBDIR}")
 
 # Install and build ORC C++ APIs when BUILD_WITH_ORC is active
 if(BUILD_WITH_ORC)
@@ -32,22 +35,27 @@ if(BUILD_WITH_ORC)
     EXECUTE_PROCESS(COMMAND brew list snappy OUTPUT_VARIABLE BREW_SNAPPY_LIST ERROR_VARIABLE BREW_SNAPPY_NOTFOUND OUTPUT_STRIP_TRAILING_WHITESPACE)
     if(BREW_SNAPPY_NOTFOUND)
         message(STATUS "Could not find locally installed snappy, building third party")
-        set(SNAPPY_VERSION "1.1.10")
         set(SNAPPY_HOME "${EXTERNAL_INSTALL_LOCATION}")
         set(SNAPPY_INCLUDE_DIR "${SNAPPY_HOME}/include")
         set(SNAPPY_STATIC_LIB "${SNAPPY_HOME}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}snappy${CMAKE_STATIC_LIBRARY_SUFFIX}")
         set(SNAPPY_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${SNAPPY_HOME} -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_LIBDIR=lib -DSNAPPY_BUILD_BENCHMARKS=OFF -DSNAPPY_BUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON)
 
-        # cf. https://gitlab.kitware.com/cmake/cmake/-/issues/17287
-        set(PATCH_CMD_FOR_SNAPPY bash -c "patch -p1 < \"${CMAKE_CURRENT_LIST_DIR}/patches/snappy.diff\"")
-        # To show what the patch looks like, use:
-        # set(PATCH_CMD_FOR_SNAPPY cat "${CMAKE_CURRENT_LIST_DIR}/patches/snappy.diff")
-
+        # Old snappy 1.1.10 patch, kept for reference.
+        # # cf. https://gitlab.kitware.com/cmake/cmake/-/issues/17287
+        # set(PATCH_CMD_FOR_SNAPPY bash -c "patch -p1 < \"${CMAKE_CURRENT_LIST_DIR}/patches/snappy.diff\"")
+        # # To show what the patch looks like, use:
+        # # set(PATCH_CMD_FOR_SNAPPY cat "${CMAKE_CURRENT_LIST_DIR}/patches/snappy.diff")
+        #
+        # ExternalProject_Add (snappy_ep
+        #         URL "https://github.com/google/snappy/archive/${SNAPPY_VERSION}.tar.gz"
+        #         CMAKE_ARGS ${SNAPPY_CMAKE_ARGS}
+        #         PATCH_COMMAND ${PATCH_CMD_FOR_SNAPPY}
+        #         BUILD_BYPRODUCTS "${SNAPPY_STATIC_LIB}")
+
         ExternalProject_Add (snappy_ep
                 URL "https://github.com/google/snappy/archive/${SNAPPY_VERSION}.tar.gz"
                 CMAKE_ARGS ${SNAPPY_CMAKE_ARGS}
-                PATCH_COMMAND ${PATCH_CMD_FOR_SNAPPY}
                 BUILD_BYPRODUCTS "${SNAPPY_STATIC_LIB}")
 
         set(SNAPPY_LIBRARIES ${SNAPPY_STATIC_LIB})
@@ -101,8 +109,7 @@ if(BUILD_WITH_ORC)
                     "${CMAKE_CURRENT_BINARY_DIR}/lz4_ep-prefix/src/lz4_ep/contrib/cmake_unofficial")
         endif()
 
-        ExternalProject_Add (lz4_ep
-                URL "https://github.com/lz4/lz4/archive/v${LZ4_VERSION}.tar.gz"
+        ExternalProject_Add (lz4_ep URL "https://github.com/lz4/lz4/archive/v${LZ4_VERSION}.tar.gz"
                 ${LZ4_CONFIGURE}
                 BUILD_BYPRODUCTS "${LZ4_STATIC_LIB}")
 
@@ -146,7 +153,7 @@ if(BUILD_WITH_ORC)
     endif()
 
     if (NOT APPLE)
-        set(LZ4_LIBRARIES ${EXTERNAL_INSTALL_LOCATION}/lib/liblz4.a)
+        set(LZ4_LIBRARIES ${EXTERNAL_INSTALL_LOCATION}/${CMAKE_INSTALL_LIBDIR}/liblz4.a)
 
         set(ORC_THIRD_PARTY_LIBS
                 ${SNAPPY_LIBRARIES}
                 ${LZ4_LIBRARIES}
@@ -175,7 +182,6 @@ if(BUILD_WITH_ORC)
         set(SNAPPY_LIBRARIES ${Snappy_LIBRARIES})
     else()
         message(STATUS "Could not find locally installed snappy, building third party")
-        set(SNAPPY_VERSION "1.1.10")
         set(SNAPPY_HOME "${EXTERNAL_INSTALL_LOCATION}")
         set(SNAPPY_INCLUDE_DIR "${SNAPPY_HOME}/include")
         set(SNAPPY_STATIC_LIB "${SNAPPY_HOME}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}snappy${CMAKE_STATIC_LIBRARY_SUFFIX}")
@@ -199,9 +205,12 @@ if(BUILD_WITH_ORC)
         endif()
     endif()
 
+    # Search for the orc library; it can be in lib/ or lib64/ depending on platform (GNUInstallDirs).
+    set(orc_LIBRARY ${EXTERNAL_INSTALL_LOCATION}/${CMAKE_INSTALL_LIBDIR}/liborc.a)
+
     ExternalProject_Add(orc
             GIT_REPOSITORY https://github.com/apache/orc.git
-            GIT_TAG rel/release-1.9.2
+            GIT_TAG rel/release-2.1.3
             TIMEOUT 5
             CMAKE_ARGS -DBUILD_LIBHDFSPP=OFF -DSNAPPY_HOME=${SNAPPY_HOME} -DLZ4_HOME=${LZ4_HOME} -DZSTD_HOME=${ZSTD_HOME} -DZLIB_HOME=${ZLIB_HOME}
             -DBUILD_POSITION_INDEPENDENT_LIB=ON -DPROTOBUF_HOME=${Protobuf_HOME}
             PREFIX "${EXTERNAL_INSTALL_LOCATION}"
             UPDATE_COMMAND "" # Disable update step: clones the project only once
-            BUILD_BYPRODUCTS ${EXTERNAL_INSTALL_LOCATION}/lib/liborc.a ${ORC_THIRD_PARTY_LIBS}
+            BUILD_BYPRODUCTS ${orc_LIBRARY} ${ORC_THIRD_PARTY_LIBS}
     )
     ExternalProject_Add_StepDependencies(orc build ${SNAPPY_DEPENDS} ${LZ4_DEPENDS} ${ZSTD_DEPENDS} )
 
     set(orc_INCLUDE_DIR ${EXTERNAL_INSTALL_LOCATION}/include)
     ExternalProject_Get_Property(orc binary_dir)
-    set(orc_LIBRARY ${EXTERNAL_INSTALL_LOCATION}/lib/liborc.a)
 
     add_library(liborc STATIC IMPORTED)
     target_link_libraries(liborc INTERFACE ${SNAPPY_LIBRARIES} ${LZ4_LIBRARIES})
@@ -238,6 +246,11 @@ add_library(libio OBJECT ${CMAKE_CURRENT_BINARY_DIR} ${SOURCES} ${INCLUDES})
 
 set_target_properties(libio PROPERTIES PREFIX "")
 
+# Make sure orc is built if libio is requested.
+if(BUILD_WITH_ORC) + add_dependencies(libio liborc) +endif() + target_include_directories(libio PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_BINARY_DIR} ${Boost_INCLUDE_DIR} ${LibMagic_INCLUDE_DIR} From 5380aec8ab38017401610435511450cda9fd8b3b Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Sat, 9 Aug 2025 20:23:55 -0700 Subject: [PATCH 8/8] Add dockerfile with ubuntu 24.04 in scripts (#158) Adds new ubuntu 24.04 dockerfile and install requirements script. Other: - Silence std::move warning in GCC for IRBuilder<>. - Modify github actions to auto-cancel actions when new commit is pushed. --- .github/workflows/build_wheels.yml | 5 + scripts/ubuntu2404/Dockerfile | 82 +++++++ scripts/ubuntu2404/install_requirements.sh | 265 +++++++++++++++++++++ tuplex/CMakeLists.txt | 2 + tuplex/codegen/include/CodegenHelper.h | 6 + tuplex/codegen/include/Token.h | 1 + tuplex/core/include/RESTInterface.h | 1 + tuplex/io/include/VirtualFileSystemBase.h | 2 + tuplex/io/include/VirtualMappedFile.h | 3 + tuplex/utils/include/Base.h | 1 + tuplex/utils/include/Field.h | 1 + 11 files changed, 369 insertions(+) create mode 100644 scripts/ubuntu2404/Dockerfile create mode 100755 scripts/ubuntu2404/install_requirements.sh diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 682c184dc..f4dc95937 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -2,6 +2,11 @@ name: Build on: [push, pull_request, workflow_dispatch] +# Cancel previous runs, i.e. run only latest push. +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + # For macos, at least 10.13 is required # to avoid issues and since the runners are macos-13 and macos-14: # -> use 13.6, which is Venture from 2022 and 14.0 on the arm runners. diff --git a/scripts/ubuntu2404/Dockerfile b/scripts/ubuntu2404/Dockerfile new file mode 100644 index 000000000..716f4c9ae --- /dev/null +++ b/scripts/ubuntu2404/Dockerfile @@ -0,0 +1,82 @@ +# Tuplex ubuntu 24.04 image, holds code in /code directory and allows to build tuplex easily. +# create with docker build -t tuplex/ubuntu:24.04 . +# Run with docker run -it tuplex/ubuntu:24.04 bash +FROM ubuntu:24.04 +LABEL authors="leonhards" + +# Versions, environment variables. +# Needs to be a pyenv supported version: pyenv install --list. 
+ENV PYTHON_VERSION=3.13.5 +ENV GITHUB_BRANCH=master + +WORKDIR /tmp + +# Install python version +# set the variables as per $(pyenv init -) +ENV LANG="C.UTF-8" \ + LC_ALL="C.UTF-8" \ + PATH="/opt/pyenv/shims:/opt/pyenv/bin:$PATH" \ + PYENV_ROOT="/opt/pyenv" \ + PYENV_SHELL="bash" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + curl \ + git \ + libbz2-dev \ + libffi-dev \ + libncurses5-dev \ + libncursesw5-dev \ + libreadline-dev \ + libsqlite3-dev \ + liblzma-dev \ + libssl-dev \ + make \ + netbase \ + pkg-config \ + tk-dev \ + wget \ + xz-utils \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + + +RUN git clone --single-branch --depth 1 https://github.com/pyenv/pyenv.git $PYENV_ROOT \ + && pyenv install $PYTHON_VERSION \ + && pyenv global $PYTHON_VERSION \ + && find $PYENV_ROOT/versions -type d '(' -name '__pycache__' -o -name 'test' -o -name 'tests' ')' -exec rm -rf '{}' + \ + && find $PYENV_ROOT/versions -type f '(' -name '*.pyo' -o -name '*.exe' ')' -exec rm -f '{}' + \ + && rm -rf /tmp/* + + +# --- install requirements --- + +ADD install_requirements.sh /tmp/install_requirements.sh + +RUN /tmp/install_requirements.sh + +# installs tuplex-specific dependencies into /opt +RUN mkdir -p /opt + +# downloads/clones tuplex into /code directory +RUN mkdir -p /code + +RUN rm -rf /tmp/* + +WORKDIR /code + +# Clone tuplex, and build initial version as well as install in python. +RUN git clone -b ${GITHUB_BRANCH} --single-branch https://github.com/tuplex/tuplex.git + +# Install python dependencies +RUN python3 -m pip install cloudpickle numpy pandas + +# Add paths to compile, +RUN echo "export PATH=/opt/bin:${PATH}" >> /root/.bashrc +RUN echo "export LLVM_ROOT=/opt/llvm-16.0.6" >> /root/.bashrc + +# Within the docker container, compile tuplex with +# cd /code/tuplex/tuplex && mkdir -p build && cd build && cmake -DLLVM_ROOT=/opt/llvm-16.0.6/ -DBUILD_WITH_AWS=ON -DBUILD_WITH_ORC=ON .. && make +# You can test via ctest +# or follow instructions to test the python package. \ No newline at end of file diff --git a/scripts/ubuntu2404/install_requirements.sh b/scripts/ubuntu2404/install_requirements.sh new file mode 100755 index 000000000..3937a5056 --- /dev/null +++ b/scripts/ubuntu2404/install_requirements.sh @@ -0,0 +1,265 @@ +#!/usr/bin/env bash +# (c) Tuplex team 2017-2023 +# Installs all tuplex dependencies required to build tuplex. + +# Variables needed incl. defaults. +PREFIX=${PREFIX:-/opt} +WORKDIR=${WORKDIR:-/tmp} +PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE:-python3} +CMAKE_VERSION="3.27.5" +BOOST_VERSION="1.88.0" +LLVM_VERSION="16.0.6" +AWSSDK_CPP_VERSION=1.11.524 +ANTLR4_VERSION=4.13.1 +YAML_CPP_VERSION=0.8.0 +AWS_LAMBDA_CPP_VERSION=0.2.10 +PCRE2_VERSION=10.45 +PROTOBUF_VERSION=24.3 +CELERO_VERSION=2.8.3 +CC=gcc +CXX=g++ + +CPU_COUNT=$(( 1 * $( egrep '^processor[[:space:]]+:' /proc/cpuinfo | wc -l ) )) + +PYTHON_VERSION=$(echo $(python3 --version) | cut -d ' ' -f2) +PYTHON_MAJMIN_VERSION=${PYTHON_VERSION%.*} +echo ">> Installing dependencies for Python version ${PYTHON_VERSION}" + +function version { echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4); }'; } + +# Start script. +set -euxo pipefail + +# need to run this with root privileges +if [[ $(id -u) -ne 0 ]]; then + echo "Please run this script with root privileges" + exit 1 +fi + +# --- helper functions --- +# Parse the major, minor and patch versions. 
+# You use it like this:
+# semver="3.4.5+xyz"
+# a=($(parse_semver "$semver"))
+# major=${a[0]}
+# minor=${a[1]}
+# patch=${a[2]}
+# printf "%-32s %4d %4d %4d\n" "$semver" $major $minor $patch
+function parse_semver() {
+    local token="$1"
+    local major=0
+    local minor=0
+    local patch=0
+
+    if egrep '^[0-9]+\.[0-9]+\.[0-9]+' <<<"$token" >/dev/null 2>&1 ; then
+        # It has the correct syntax.
+        local n=${token//[!0-9.]/ }
+        local a=(${n//\./ })
+        major=${a[0]}
+        minor=${a[1]}
+        patch=${a[2]}
+    fi
+
+    echo "$major $minor $patch"
+}
+
+function install_llvm {
+  LLVM_VERSION=$1
+  LLVM_MAJOR_VERSION=`echo ${LLVM_VERSION} | cut -d. -f1`
+  LLVM_MINOR_VERSION=`echo ${LLVM_VERSION} | cut -d. -f2`
+  LLVM_MAJMIN_VERSION="${LLVM_MAJOR_VERSION}.${LLVM_MINOR_VERSION}"
+
+  # list of targets available to build: AArch64;AMDGPU;ARM;AVR;BPF;Hexagon;Lanai;LoongArch;Mips;MSP430;NVPTX;PowerPC;RISCV;Sparc;SystemZ;VE;WebAssembly;X86;XCore
+  # in order to cross-compile, additional targets would need to be enabled here.
+
+  echo ">> building LLVM ${LLVM_VERSION}"
+  LLVM_URL=https://github.com/llvm/llvm-project/releases/download/llvmorg-${LLVM_VERSION}/llvm-${LLVM_VERSION}.src.tar.xz
+  CLANG_URL=https://github.com/llvm/llvm-project/releases/download/llvmorg-${LLVM_VERSION}/clang-${LLVM_VERSION}.src.tar.xz
+  # required when LLVM version >= 15
+  LLVM_CMAKE_URL=https://github.com/llvm/llvm-project/releases/download/llvmorg-${LLVM_VERSION}/cmake-${LLVM_VERSION}.src.tar.xz
+
+  PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE:-python3}
+  PYTHON_BASENAME="$(basename -- $PYTHON_EXECUTABLE)"
+  PYTHON_VERSION=$(${PYTHON_EXECUTABLE} --version)
+  echo ">> Building dependencies for ${PYTHON_VERSION}"
+
+  echo ">> Downloading prerequisites for llvm ${LLVM_VERSION}"
+  LLVM_WORKDIR=${WORKDIR}/llvm${LLVM_VERSION}
+  mkdir -p ${LLVM_WORKDIR}
+  pushd "${LLVM_WORKDIR}" || exit 1
+
+  wget ${LLVM_URL} && tar xf llvm-${LLVM_VERSION}.src.tar.xz
+  wget ${CLANG_URL} && tar xf clang-${LLVM_VERSION}.src.tar.xz && mv clang-${LLVM_VERSION}.src llvm-${LLVM_VERSION}.src/../clang
+
+  if (( LLVM_MAJOR_VERSION >= 15 )); then
+    wget ${LLVM_CMAKE_URL} && tar xf cmake-${LLVM_VERSION}.src.tar.xz && mv cmake-${LLVM_VERSION}.src cmake
+  fi
+
+  mkdir -p llvm-${LLVM_VERSION}.src/build && cd llvm-${LLVM_VERSION}.src/build
+
+  cmake -GNinja -DLLVM_ENABLE_RTTI=ON -DLLVM_ENABLE_EH=ON -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="X86;AArch64" \
+        -DCMAKE_BUILD_TYPE=Release -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_BENCHMARKS=OFF \
+        -DCMAKE_INSTALL_PREFIX=/opt/llvm-${LLVM_VERSION} ..
+  ninja install
+  popd
+}
+
+export DEBIAN_FRONTEND=noninteractive
+
+echo ">> Installing packages into ${PREFIX}"
+mkdir -p $PREFIX && chmod 0755 $PREFIX
+mkdir -p $PREFIX/sbin
+mkdir -p $PREFIX/bin
+mkdir -p $PREFIX/share
+mkdir -p $PREFIX/include
+mkdir -p $PREFIX/lib
+
+echo ">> Files will be downloaded to ${WORKDIR}/tuplex-downloads"
+WORKDIR=$WORKDIR/tuplex-downloads
+mkdir -p $WORKDIR
+
+PYTHON_BASENAME="$(basename -- $PYTHON_EXECUTABLE)"
+PYTHON_VERSION=$(${PYTHON_EXECUTABLE} --version)
+echo ">> Building dependencies for ${PYTHON_VERSION}"
+echo ">> Installing all build dependencies for Tuplex under Ubuntu 24.04"
+
+echo ">> Installing apt dependencies"
+apt update -y
+
+apt-get install -y apt-utils dh-autoreconf libmagic-dev curl libxml2-dev vim build-essential libssl-dev zlib1g-dev libncurses5-dev \
+libncursesw5-dev libreadline-dev libsqlite3-dev libgdbm-dev libdb5.3-dev openssh-client unzip \
+libbz2-dev libexpat1-dev liblzma-dev tk-dev libffi-dev wget git libcurl4-openssl-dev python3-dev python3-pip openjdk-11-jdk ninja-build
+
+ldconfig
+export CC=${CC}
+export CXX=${CXX}
+
+echo ">> Installing recent cmake"
+# fetch recent cmake & install
+URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz
+mkdir -p ${WORKDIR}/cmake && cd ${WORKDIR}/cmake &&
+  curl -sSL $URL -o cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz &&
+  tar -v -zxf cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz &&
+  rm -f cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz &&
+  cd cmake-${CMAKE_VERSION}-linux-x86_64 &&
+  cp -rp bin/* ${PREFIX}/bin/ &&
+  cp -rp share/* ${PREFIX}/share/ &&
+  cd / && rm -rf ${WORKDIR}/cmake
+
+export PATH=$PREFIX/bin:$PATH
+cmake --version
+
+echo ">> Installing Boost"
+mkdir -p ${WORKDIR}/boost
+
+# -- Install Boost.
+# create underscored version
+# e.g. 1.79.0 -> 1_79_0
+BOOST_UNDERSCORED_VERSION=$(echo ${BOOST_VERSION} | tr . _)
+
+# build incl. boost python
+cd ${WORKDIR}/boost && curl -L -O https://github.com/boostorg/boost/releases/download/boost-${BOOST_VERSION}/boost-${BOOST_VERSION}-b2-nodocs.tar.gz && tar xf boost-${BOOST_VERSION}-b2-nodocs.tar.gz && cd ${WORKDIR}/boost/boost-${BOOST_VERSION} \
+  && ./bootstrap.sh --with-python=${PYTHON_EXECUTABLE} --prefix=${PREFIX} --with-libraries="thread,iostreams,regex,system,filesystem,python,stacktrace,atomic,chrono,date_time" \
+  && ./b2 cxxflags="-fPIC" link=static -j "$(nproc)" \
+  && ./b2 cxxflags="-fPIC" link=static install && sed -i 's/#if PTHREAD_STACK_MIN > 0/#ifdef PTHREAD_STACK_MIN/g' ${PREFIX}/include/boost/thread/pthread/thread_data.hpp
+
+cd $WORKDIR
+rm -rf ${WORKDIR}/boost
+
+# -- install llvm
+install_llvm $LLVM_VERSION
+
+echo ">> Installing tuplex dependencies."
+# Install a recent zlib, see
+# https://github.com/aws/aws-graviton-getting-started#zlib-on-linux
+LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}
+export LD_LIBRARY_PATH=$PREFIX/lib:$PREFIX/lib64:$LD_LIBRARY_PATH
+
+# The Cloudflare fork is too old, use zlib-ng in compat mode instead:
+#mkdir -p $WORKDIR/zlib && cd $WORKDIR && git clone https://github.com/cloudflare/zlib.git && cd zlib && ./configure --prefix=$PREFIX && make -j ${CPU_COUNT} && make install
+
+# note that zlib defines Z_NULL=0 whereas zlib-ng defines it as NULL, patch the aws sdk accordingly
+git clone https://github.com/zlib-ng/zlib-ng.git && cd zlib-ng && git checkout tags/2.1.3 && mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="-fPIC" -DZLIB_COMPAT=ON .. && make -j ${CPU_COUNT} && make install
+
+git clone https://github.com/google/googletest.git -b v1.14.0 && cd googletest && mkdir build && cd build && cmake -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_FLAGS="-fPIC" -DCMAKE_BUILD_TYPE=Release .. && make -j ${CPU_COUNT} && make install
+
+# build snappy as static lib
+git clone https://github.com/google/snappy.git -b 1.1.10 && cd snappy && git submodule update --init && mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_FLAGS="-fPIC" .. && make -j ${CPU_COUNT} && make install
+
+# add github to known hosts
+mkdir -p /root/.ssh/ &&
+  touch /root/.ssh/known_hosts &&
+  ssh-keyscan github.com >> /root/.ssh/known_hosts
+
+echo ">> Installing YAMLCPP"
+mkdir -p ${WORKDIR}/yamlcpp && cd ${WORKDIR}/yamlcpp \
+&& git clone https://github.com/jbeder/yaml-cpp.git yaml-cpp \
+&& cd yaml-cpp \
+&& git checkout tags/${YAML_CPP_VERSION} \
+&& mkdir build && cd build \
+&& cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${PREFIX} -DYAML_CPP_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_FLAGS="-fPIC" .. \
+&& make -j ${CPU_COUNT} && make install
+
+echo ">> Installing Celero"
+mkdir -p ${WORKDIR}/celero && cd ${WORKDIR}/celero \
+&& git clone https://github.com/DigitalInBlue/Celero.git celero && cd celero \
+&& git checkout tags/v${CELERO_VERSION} \
+&& mkdir build && cd build \
+&& cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${PREFIX} -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_FLAGS="-fPIC -std=c++11" .. \
+&& make -j ${CPU_COUNT} && make install
+
+echo ">> Installing ANTLR"
+mkdir -p ${WORKDIR}/antlr && cd ${WORKDIR}/antlr \
+&& curl -O https://www.antlr.org/download/antlr-${ANTLR4_VERSION}-complete.jar \
+&& cp antlr-${ANTLR4_VERSION}-complete.jar ${PREFIX}/lib/ \
+&& curl -O https://www.antlr.org/download/antlr4-cpp-runtime-${ANTLR4_VERSION}-source.zip \
+&& unzip antlr4-cpp-runtime-${ANTLR4_VERSION}-source.zip -d antlr4-cpp-runtime \
+&& rm antlr4-cpp-runtime-${ANTLR4_VERSION}-source.zip \
+&& cd antlr4-cpp-runtime \
+&& mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${PREFIX} .. \
+&& make -j ${CPU_COUNT} && make install
+
+echo ">> Installing AWS SDK"
+# Note the zlib (Z_NULL) patch here.
+mkdir -p ${WORKDIR}/aws && cd ${WORKDIR}/aws \
+&& git clone --recurse-submodules https://github.com/aws/aws-sdk-cpp.git \
+&& cd aws-sdk-cpp && git checkout tags/${AWSSDK_CPP_VERSION} && sed -i 's/int ret = Z_NULL;/int ret = static_cast<int>(Z_NULL);/g' src/aws-cpp-sdk-core/source/client/RequestCompression.cpp && mkdir build && cd build \
+&& cmake -DCMAKE_BUILD_TYPE=Release -DUSE_OPENSSL=ON -DENABLE_TESTING=OFF -DUSE_CRT_HTTP_CLIENT=ON -DENABLE_UNITY_BUILD=ON -DCPP_STANDARD=17 -DBUILD_SHARED_LIBS=OFF -DBUILD_ONLY="s3;s3-crt;core;lambda;transfer" -DCMAKE_INSTALL_PREFIX=${PREFIX} .. \
+&& make -j ${CPU_COUNT} \
+&& make install
+
+# Install the AWS Lambda C++ runtime.
+cd ${WORKDIR}/aws \
+&& git clone https://github.com/awslabs/aws-lambda-cpp.git \
+&& cd aws-lambda-cpp \
+&& git fetch && git fetch --tags \
+&& git checkout v${AWS_LAMBDA_CPP_VERSION} \
+&& mkdir build \
+&& cd build \
+&& cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${PREFIX} .. \
+&& make -j$(nproc) && make install
+
+echo ">> Installing PCRE2"
+mkdir -p ${WORKDIR}/pcre2 && cd ${WORKDIR}/pcre2 \
+&& curl -LO https://github.com/PhilipHazel/pcre2/releases/download/pcre2-${PCRE2_VERSION}/pcre2-${PCRE2_VERSION}.zip \
+&& unzip pcre2-${PCRE2_VERSION}.zip \
+&& rm pcre2-${PCRE2_VERSION}.zip \
+&& cd pcre2-${PCRE2_VERSION} \
+&& ./configure CFLAGS="-O2 -fPIC" --prefix=${PREFIX} --enable-jit=auto --disable-shared \
+&& make -j$(nproc) && make install
+
+echo ">> Installing protobuf"
+mkdir -p ${WORKDIR}/protobuf && cd ${WORKDIR}/protobuf \
+&& git clone -b v${PROTOBUF_VERSION} https://github.com/protocolbuffers/protobuf.git && cd protobuf && git submodule update --init --recursive && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC" -DCMAKE_CXX_STANDARD=17 -Dprotobuf_BUILD_TESTS=OFF .. && make -j ${CPU_COUNT} && make install
+
+# delete workdir (downloads dir) to clean up space
+rm -rf ${WORKDIR}
+
+echo "-- Done, all Tuplex requirements installed to /opt --"
diff --git a/tuplex/CMakeLists.txt b/tuplex/CMakeLists.txt
index 8abb640d0..7adf4b7ef 100755
--- a/tuplex/CMakeLists.txt
+++ b/tuplex/CMakeLists.txt
@@ -6,6 +6,8 @@ cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 message(STATUS "Using language version: C++${CMAKE_CXX_STANDARD}")
+set(CMAKE_C_STANDARD 17)
+set(CMAKE_C_STANDARD_REQUIRED ON)
 
 # add cmake modules from cmake folder
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/")
diff --git a/tuplex/codegen/include/CodegenHelper.h b/tuplex/codegen/include/CodegenHelper.h
index 63af95f38..9649dc9d7 100644
--- a/tuplex/codegen/include/CodegenHelper.h
+++ b/tuplex/codegen/include/CodegenHelper.h
@@ -763,7 +763,13 @@ namespace tuplex {
                 llvm::Instruction& inst = *firstBlock.getFirstInsertionPt();
                 ctorBuilder.SetInsertPoint(&inst);
             }
+
+            // The LLVM IRBuilder disallows copying; returning it here makes GCC issue -Wreturn-local-addr.
+            // Disable the warning for this return.
+#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wreturn-local-addr" return std::move(ctorBuilder); +#pragma GCC diagnostic pop } // in order to serialize/deserialize data properly and deal with diff --git a/tuplex/codegen/include/Token.h b/tuplex/codegen/include/Token.h index 7688a4dbe..b6ca1b713 100644 --- a/tuplex/codegen/include/Token.h +++ b/tuplex/codegen/include/Token.h @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/tuplex/core/include/RESTInterface.h b/tuplex/core/include/RESTInterface.h index ddf2d9bb8..ba1358250 100644 --- a/tuplex/core/include/RESTInterface.h +++ b/tuplex/core/include/RESTInterface.h @@ -13,6 +13,7 @@ #include #include +#include #include #include diff --git a/tuplex/io/include/VirtualFileSystemBase.h b/tuplex/io/include/VirtualFileSystemBase.h index 31b0087a6..d9c1c50f6 100644 --- a/tuplex/io/include/VirtualFileSystemBase.h +++ b/tuplex/io/include/VirtualFileSystemBase.h @@ -11,6 +11,8 @@ #ifndef TUPLEX_VIRTUALFILESYSTEMBASE_H #define TUPLEX_VIRTUALFILESYSTEMBASE_H +#include + namespace tuplex { enum class VirtualFileSystemStatus { VFS_OK = 0, diff --git a/tuplex/io/include/VirtualMappedFile.h b/tuplex/io/include/VirtualMappedFile.h index e7c65f374..0d21f30da 100644 --- a/tuplex/io/include/VirtualMappedFile.h +++ b/tuplex/io/include/VirtualMappedFile.h @@ -11,6 +11,9 @@ #ifndef TUPLEX_VIRTUALMAPPEDFILE_H #define TUPLEX_VIRTUALMAPPEDFILE_H +#include +#include + namespace tuplex { class VirtualMappedFile; class VirtualFileSystem; diff --git a/tuplex/utils/include/Base.h b/tuplex/utils/include/Base.h index 2475a9514..da3db47b7 100644 --- a/tuplex/utils/include/Base.h +++ b/tuplex/utils/include/Base.h @@ -13,6 +13,7 @@ #include #include +#include #include #include diff --git a/tuplex/utils/include/Field.h b/tuplex/utils/include/Field.h index fef968870..0ecb2d896 100644 --- a/tuplex/utils/include/Field.h +++ b/tuplex/utils/include/Field.h @@ -12,6 +12,7 @@ #define TUPLEX_FIELD_H #include +#include #include #include #include
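
Note on the dllist() helper at the top of this series: it binds the native
dl_iterate_phdr(3) entry point via ctypes. For reference, a minimal native
sketch of the same walk, assuming Linux/glibc (the file name dllist.cpp and
the collect helper are illustrative, not part of any patch):

    // dllist.cpp - native counterpart of the ctypes wrapper above.
    // Build sketch (assumption: Linux/glibc): g++ -std=c++17 dllist.cpp -o dllist
    #ifndef _GNU_SOURCE
    #define _GNU_SOURCE
    #endif
    #include <link.h>

    #include <cstdio>
    #include <string>
    #include <vector>

    // dl_iterate_phdr() invokes this callback once per loaded shared object;
    // returning 0 continues the walk, non-zero stops it early.
    static int collect(struct dl_phdr_info* info, size_t, void* data) {
        auto* libs = static_cast<std::vector<std::string>*>(data);
        libs->emplace_back(info->dlpi_name);
        return 0;
    }

    int main() {
        std::vector<std::string> libs;
        dl_iterate_phdr(collect, &libs);
        for (const auto& name : libs)
            std::printf("%s\n", name.empty() ? "<main program>" : name.c_str());
        return 0;
    }

The empty dlpi_name entry corresponds to the main executable itself, which
is why the Python wrapper simply appends every decoded name as-is.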
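
Note on the Pipe.cc hunk in PATCH 7/8: the bracketed header names behind the
bare #include lines did not survive in this copy of the patch. A minimal
sketch of the version shim, assuming Boost >= 1.88 ships the Process v1 API
under boost/process/v1/; every header name below is an assumption, not
recovered from the original patch:

    // boost_process_shim.cpp - sketch of the BOOST_VERSION dispatch in Pipe.cc
    #include <boost/version.hpp>

    #define BOOST_PROCESS_VERSION 1
    #if BOOST_VERSION < 108800
      // Boost < 1.88: the classic umbrella header still provides the v1 API.
      #include <boost/process.hpp>
      namespace bp_process = boost::process;
    #else
      // Boost >= 1.88: <boost/process.hpp> now means v2, so pull in the v1
      // feature headers explicitly (assumed names).
      #include <boost/process/v1/child.hpp>
      #include <boost/process/v1/io.hpp>
      #include <boost/process/v1/pipe.hpp>
      #include <boost/process/v1/search_path.hpp>
      namespace bp_process = boost::process::v1;
    #endif

    #include <string>

    int main() {
        using namespace bp_process;
        ipstream out;                                    // captures child stdout
        child c(search_path("echo"), "hi", std_out > out);
        std::string line;
        std::getline(out, line);
        c.wait();
        return c.exit_code();
    }

Either branch leaves the body of Pipe::pipe() untouched, since all v1 names
(ipstream, child, search_path, std_out, std_err) resolve through the
bp_process alias.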