From af04be76db2ae81fc8c50d9435ee63c5c4902db3 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Tue, 11 Mar 2025 23:33:26 -0700 Subject: [PATCH 01/14] pyarrow fix, need to add test to check this is not broken --- tuplex/python/tuplex/context.py | 5 + tuplex/python/tuplex/utils/common.py | 18 + tuplex/python/tuplex/utils/dllist.py | 472 +++++++++++++++++++++++++++ 3 files changed, 495 insertions(+) create mode 100644 tuplex/python/tuplex/utils/dllist.py diff --git a/tuplex/python/tuplex/context.py b/tuplex/python/tuplex/context.py index 04e8d2e0c..b088a4344 100644 --- a/tuplex/python/tuplex/context.py +++ b/tuplex/python/tuplex/context.py @@ -11,6 +11,8 @@ import logging +from .utils.common import pyarrow_aws_sdk_cpp_fix + try: from .libexec.tuplex import _Context, getDefaultOptionsAsJSON except ModuleNotFoundError as e: @@ -231,6 +233,9 @@ def __init__( options["tuplex.webui.enable"] = options["webui"] del options["webui"] + # Ensure no crash due to PyArrow potentially being present. + pyarrow_aws_sdk_cpp_fix() + # last arg are the options as json string serialized b.c. of boost python problems self._context = _Context(name, runtime_path, json.dumps(options)) python_metrics = self._context.getMetrics() diff --git a/tuplex/python/tuplex/utils/common.py b/tuplex/python/tuplex/utils/common.py index 32d6fb47f..888ca2709 100644 --- a/tuplex/python/tuplex/utils/common.py +++ b/tuplex/python/tuplex/utils/common.py @@ -31,6 +31,8 @@ import psutil import yaml +from .dllist import dllist + try: import pwd except ImportError: @@ -1077,3 +1079,19 @@ def ensure_webui(options: dict) -> None: # log gunicorn errors for local startup if os.path.isfile(gunicorn_logpath) and "localhost" == webui_url: log_gunicorn_errors(gunicorn_logpath) + +def pyarrow_aws_sdk_cpp_fix() -> None: + """Help fix issue of pyarrow (frequent because pyarrow seems to be shipped very often) + Call this function BEFORE initializing the _Context object from the tuplex C extension object.""" + # PyArrow always initializes AWS SDK. Because Tuplex may initialize it as well, + # skip in the presence of the arrow lib being loaded + # the AWS SDK initialization. + + loaded_shared_objects = dllist() + pyarrow_loaded = any("pyarrow/lib" in path for path in loaded_shared_objects) + + if pyarrow_loaded: + from tuplex.libexec.tuplex import setExternalAwssdk + + # Calling this function will prevent Tuplex from calling initAWSSDK and shutdownAWSSDK. + setExternalAwssdk(True) \ No newline at end of file diff --git a/tuplex/python/tuplex/utils/dllist.py b/tuplex/python/tuplex/utils/dllist.py new file mode 100644 index 000000000..b8452efa5 --- /dev/null +++ b/tuplex/python/tuplex/utils/dllist.py @@ -0,0 +1,472 @@ + +# This is code schedule to be part of Python 3.14+ from https://github.com/python/cpython/pull/122946/files, ported back here for usagge to avoid pyarrow bug +# where AWS SDK is always initialized and thus may lead to issues with tuplex. + +import os +import shutil +import subprocess +import sys + +# find_library(name) returns the pathname of a library, or None. +if os.name == "nt": + + def _get_build_version(): + """Return the version of MSVC that was used to build Python. + + For Python 2.3 and up, the version number is included in + sys.version. For earlier versions, assume the compiler is MSVC 6. + """ + # This function was copied from Lib/distutils/msvccompiler.py + prefix = "MSC v." + i = sys.version.find(prefix) + if i == -1: + return 6 + i = i + len(prefix) + s, rest = sys.version[i:].split(" ", 1) + majorVersion = int(s[:-2]) - 6 + if majorVersion >= 13: + majorVersion += 1 + minorVersion = int(s[2:3]) / 10.0 + # I don't think paths are affected by minor version in version 6 + if majorVersion == 6: + minorVersion = 0 + if majorVersion >= 6: + return majorVersion + minorVersion + # else we don't know what version of the compiler this is + return None + + def find_msvcrt(): + """Return the name of the VC runtime dll""" + version = _get_build_version() + if version is None: + # better be safe than sorry + return None + if version <= 6: + clibname = 'msvcrt' + elif version <= 13: + clibname = 'msvcr%d' % (version * 10) + else: + # CRT is no longer directly loadable. See issue23606 for the + # discussion about alternative approaches. + return None + + # If python was built with in debug mode + import importlib.machinery + if '_d.pyd' in importlib.machinery.EXTENSION_SUFFIXES: + clibname += 'd' + return clibname+'.dll' + + def find_library(name): + if name in ('c', 'm'): + return find_msvcrt() + # See MSDN for the REAL search order. + for directory in os.environ['PATH'].split(os.pathsep): + fname = os.path.join(directory, name) + if os.path.isfile(fname): + return fname + if fname.lower().endswith(".dll"): + continue + fname = fname + ".dll" + if os.path.isfile(fname): + return fname + return None + + # Listing loaded DLLs on Windows relies on the following APIs: + # https://learn.microsoft.com/windows/win32/api/psapi/nf-psapi-enumprocessmodules + # https://learn.microsoft.com/windows/win32/api/libloaderapi/nf-libloaderapi-getmodulefilenamew + import ctypes + from ctypes import wintypes + + _kernel32 = ctypes.WinDLL('kernel32', use_last_error=True) + _get_current_process = _kernel32["GetCurrentProcess"] + _get_current_process.restype = wintypes.HANDLE + + _k32_get_module_file_name = _kernel32["GetModuleFileNameW"] + _k32_get_module_file_name.restype = wintypes.DWORD + _k32_get_module_file_name.argtypes = ( + wintypes.HMODULE, + wintypes.LPWSTR, + wintypes.DWORD, + ) + + _psapi = ctypes.WinDLL('psapi', use_last_error=True) + _enum_process_modules = _psapi["EnumProcessModules"] + _enum_process_modules.restype = wintypes.BOOL + _enum_process_modules.argtypes = ( + wintypes.HANDLE, + ctypes.POINTER(wintypes.HMODULE), + wintypes.DWORD, + wintypes.LPDWORD, + ) + + def _get_module_filename(module: wintypes.HMODULE): + name = (wintypes.WCHAR * 32767)() # UNICODE_STRING_MAX_CHARS + if _k32_get_module_file_name(module, name, len(name)): + return name.value + return None + + + def _get_module_handles(): + process = _get_current_process() + space_needed = wintypes.DWORD() + n = 1024 + while True: + modules = (wintypes.HMODULE * n)() + if not _enum_process_modules(process, + modules, + ctypes.sizeof(modules), + ctypes.byref(space_needed)): + err = ctypes.get_last_error() + msg = ctypes.FormatError(err).strip() + raise ctypes.WinError(err, f"EnumProcessModules failed: {msg}") + n = space_needed.value // ctypes.sizeof(wintypes.HMODULE) + if n <= len(modules): + return modules[:n] + + def dllist(): + """Return a list of loaded shared libraries in the current process.""" + modules = _get_module_handles() + libraries = [name for h in modules + if (name := _get_module_filename(h)) is not None] + return libraries + +elif os.name == "posix" and sys.platform in {"darwin", "ios", "tvos", "watchos"}: + from ctypes.macholib.dyld import dyld_find as _dyld_find + def find_library(name): + possible = ['lib%s.dylib' % name, + '%s.dylib' % name, + '%s.framework/%s' % (name, name)] + for name in possible: + try: + return _dyld_find(name) + except ValueError: + continue + return None + + # Listing loaded libraries on Apple systems relies on the following API: + # https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/dyld.3.html + import ctypes + + _libc = ctypes.CDLL(find_library("c")) + _dyld_get_image_name = _libc["_dyld_get_image_name"] + _dyld_get_image_name.restype = ctypes.c_char_p + + def dllist(): + """Return a list of loaded shared libraries in the current process.""" + num_images = _libc._dyld_image_count() + libraries = [os.fsdecode(name) for i in range(num_images) + if (name := _dyld_get_image_name(i)) is not None] + + return libraries + +elif sys.platform.startswith("aix"): + # AIX has two styles of storing shared libraries + # GNU auto_tools refer to these as svr4 and aix + # svr4 (System V Release 4) is a regular file, often with .so as suffix + # AIX style uses an archive (suffix .a) with members (e.g., shr.o, libssl.so) + # see issue#26439 and _aix.py for more details + + from ctypes._aix import find_library + +elif sys.platform == "android": + def find_library(name): + directory = "/system/lib" + if "64" in os.uname().machine: + directory += "64" + + fname = f"{directory}/lib{name}.so" + return fname if os.path.isfile(fname) else None + +elif os.name == "posix": + # Andreas Degert's find functions, using gcc, /sbin/ldconfig, objdump + import re, tempfile + + def _is_elf(filename): + "Return True if the given file is an ELF file" + elf_header = b'\x7fELF' + try: + with open(filename, 'br') as thefile: + return thefile.read(4) == elf_header + except FileNotFoundError: + return False + + def _findLib_gcc(name): + # Run GCC's linker with the -t (aka --trace) option and examine the + # library name it prints out. The GCC command will fail because we + # haven't supplied a proper program with main(), but that does not + # matter. + expr = os.fsencode(r'[^\(\)\s]*lib%s\.[^\(\)\s]*' % re.escape(name)) + + c_compiler = shutil.which('gcc') + if not c_compiler: + c_compiler = shutil.which('cc') + if not c_compiler: + # No C compiler available, give up + return None + + temp = tempfile.NamedTemporaryFile() + try: + args = [c_compiler, '-Wl,-t', '-o', temp.name, '-l' + name] + + env = dict(os.environ) + env['LC_ALL'] = 'C' + env['LANG'] = 'C' + try: + proc = subprocess.Popen(args, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=env) + except OSError: # E.g. bad executable + return None + with proc: + trace = proc.stdout.read() + finally: + try: + temp.close() + except FileNotFoundError: + # Raised if the file was already removed, which is the normal + # behaviour of GCC if linking fails + pass + res = re.findall(expr, trace) + if not res: + return None + + for file in res: + # Check if the given file is an elf file: gcc can report + # some files that are linker scripts and not actual + # shared objects. See bpo-41976 for more details + if not _is_elf(file): + continue + return os.fsdecode(file) + + + if sys.platform == "sunos5": + # use /usr/ccs/bin/dump on solaris + def _get_soname(f): + if not f: + return None + + try: + proc = subprocess.Popen(("/usr/ccs/bin/dump", "-Lpv", f), + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL) + except OSError: # E.g. command not found + return None + with proc: + data = proc.stdout.read() + res = re.search(br'\[.*\]\sSONAME\s+([^\s]+)', data) + if not res: + return None + return os.fsdecode(res.group(1)) + else: + def _get_soname(f): + # assuming GNU binutils / ELF + if not f: + return None + objdump = shutil.which('objdump') + if not objdump: + # objdump is not available, give up + return None + + try: + proc = subprocess.Popen((objdump, '-p', '-j', '.dynamic', f), + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL) + except OSError: # E.g. bad executable + return None + with proc: + dump = proc.stdout.read() + res = re.search(br'\sSONAME\s+([^\s]+)', dump) + if not res: + return None + return os.fsdecode(res.group(1)) + + if sys.platform.startswith(("freebsd", "openbsd", "dragonfly")): + + def _num_version(libname): + # "libxyz.so.MAJOR.MINOR" => [ MAJOR, MINOR ] + parts = libname.split(b".") + nums = [] + try: + while parts: + nums.insert(0, int(parts.pop())) + except ValueError: + pass + return nums or [sys.maxsize] + + def find_library(name): + ename = re.escape(name) + expr = r':-l%s\.\S+ => \S*/(lib%s\.\S+)' % (ename, ename) + expr = os.fsencode(expr) + + try: + proc = subprocess.Popen(('/sbin/ldconfig', '-r'), + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL) + except OSError: # E.g. command not found + data = b'' + else: + with proc: + data = proc.stdout.read() + + res = re.findall(expr, data) + if not res: + return _get_soname(_findLib_gcc(name)) + res.sort(key=_num_version) + return os.fsdecode(res[-1]) + + elif sys.platform == "sunos5": + + def _findLib_crle(name, is64): + if not os.path.exists('/usr/bin/crle'): + return None + + env = dict(os.environ) + env['LC_ALL'] = 'C' + + if is64: + args = ('/usr/bin/crle', '-64') + else: + args = ('/usr/bin/crle',) + + paths = None + try: + proc = subprocess.Popen(args, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + env=env) + except OSError: # E.g. bad executable + return None + with proc: + for line in proc.stdout: + line = line.strip() + if line.startswith(b'Default Library Path (ELF):'): + paths = os.fsdecode(line).split()[4] + + if not paths: + return None + + for dir in paths.split(":"): + libfile = os.path.join(dir, "lib%s.so" % name) + if os.path.exists(libfile): + return libfile + + return None + + def find_library(name, is64 = False): + return _get_soname(_findLib_crle(name, is64) or _findLib_gcc(name)) + + else: + + def _findSoname_ldconfig(name): + import struct + if struct.calcsize('l') == 4: + machine = os.uname().machine + '-32' + else: + machine = os.uname().machine + '-64' + mach_map = { + 'x86_64-64': 'libc6,x86-64', + 'ppc64-64': 'libc6,64bit', + 'sparc64-64': 'libc6,64bit', + 's390x-64': 'libc6,64bit', + 'ia64-64': 'libc6,IA-64', + } + abi_type = mach_map.get(machine, 'libc6') + + # XXX assuming GLIBC's ldconfig (with option -p) + regex = r'\s+(lib%s\.[^\s]+)\s+\(%s' + regex = os.fsencode(regex % (re.escape(name), abi_type)) + try: + with subprocess.Popen(['/sbin/ldconfig', '-p'], + stdin=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + stdout=subprocess.PIPE, + env={'LC_ALL': 'C', 'LANG': 'C'}) as p: + res = re.search(regex, p.stdout.read()) + if res: + return os.fsdecode(res.group(1)) + except OSError: + pass + + def _findLib_ld(name): + # See issue #9998 for why this is needed + expr = r'[^\(\)\s]*lib%s\.[^\(\)\s]*' % re.escape(name) + cmd = ['ld', '-t'] + libpath = os.environ.get('LD_LIBRARY_PATH') + if libpath: + for d in libpath.split(':'): + cmd.extend(['-L', d]) + cmd.extend(['-o', os.devnull, '-l%s' % name]) + result = None + try: + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True) + out, _ = p.communicate() + res = re.findall(expr, os.fsdecode(out)) + for file in res: + # Check if the given file is an elf file: gcc can report + # some files that are linker scripts and not actual + # shared objects. See bpo-41976 for more details + if not _is_elf(file): + continue + return os.fsdecode(file) + except Exception: + pass # result will be None + return result + + def find_library(name): + # See issue #9998 + return _findSoname_ldconfig(name) or \ + _get_soname(_findLib_gcc(name)) or _get_soname(_findLib_ld(name)) + + +# Listing loaded libraries on other systems will try to use +# functions common to Linux and a few other Unix-like systems. +# See the following for several platforms' documentation of the same API: +# https://man7.org/linux/man-pages/man3/dl_iterate_phdr.3.html +# https://man.freebsd.org/cgi/man.cgi?query=dl_iterate_phdr +# https://man.openbsd.org/dl_iterate_phdr +# https://docs.oracle.com/cd/E88353_01/html/E37843/dl-iterate-phdr-3c.html +if (os.name == "posix" and + sys.platform not in {"darwin", "ios", "tvos", "watchos"}): + import ctypes + if hasattr((_libc := ctypes.CDLL(None)), "dl_iterate_phdr"): + + class _dl_phdr_info(ctypes.Structure): + _fields_ = [ + ("dlpi_addr", ctypes.c_void_p), + ("dlpi_name", ctypes.c_char_p), + ("dlpi_phdr", ctypes.c_void_p), + ("dlpi_phnum", ctypes.c_ushort), + ] + + _dl_phdr_callback = ctypes.CFUNCTYPE( + ctypes.c_int, + ctypes.POINTER(_dl_phdr_info), + ctypes.c_size_t, + ctypes.POINTER(ctypes.py_object), + ) + + @_dl_phdr_callback + def _info_callback(info, _size, data): + libraries = data.contents.value + name = os.fsdecode(info.contents.dlpi_name) + libraries.append(name) + return 0 + + _dl_iterate_phdr = _libc["dl_iterate_phdr"] + _dl_iterate_phdr.argtypes = [ + _dl_phdr_callback, + ctypes.POINTER(ctypes.py_object), + ] + _dl_iterate_phdr.restype = ctypes.c_int + + def dllist(): + """Return a list of loaded shared libraries in the current process.""" + libraries = [] + _dl_iterate_phdr(_info_callback, + ctypes.byref(ctypes.py_object(libraries))) + return libraries + +################################################################ \ No newline at end of file From 8879054b8a2757f7cad3f8e9ad0469c69a9ef4a1 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Wed, 12 Mar 2025 19:53:26 -0700 Subject: [PATCH 02/14] pyarrow aws sdk cpp workaround --- tuplex/io/include/AWSCommon.h | 7 +++++++ tuplex/io/src/AWSCommon.cc | 6 ++++++ tuplex/python/include/PythonCommon.h | 13 +++++++++++++ tuplex/python/src/PythonBindings.cc | 22 ++++++++++++++++++++++ tuplex/python/src/PythonCommon.cc | 12 ++++++++++++ tuplex/python/tuplex/utils/common.py | 15 ++++++++------- 6 files changed, 68 insertions(+), 7 deletions(-) diff --git a/tuplex/io/include/AWSCommon.h b/tuplex/io/include/AWSCommon.h index 6d01f5b4f..a1c98ac68 100644 --- a/tuplex/io/include/AWSCommon.h +++ b/tuplex/io/include/AWSCommon.h @@ -60,6 +60,13 @@ namespace tuplex { * @return true/false. */ extern bool isValidAWSZone(const std::string& zone); + + /*! + * Use this function to suggest to Tuplex the state of the AWS SDK, e.g. if in the process the + * Aws sdk is already initialized in some form. + * @param overrideAwssdkInitializedValue + */ + extern void setExternalAwssdk(bool overrideAwssdkInitializedValue); } // Amazon frequently changes the parameters of lambda functions, diff --git a/tuplex/io/src/AWSCommon.cc b/tuplex/io/src/AWSCommon.cc index 801b59d12..f8147c9ba 100644 --- a/tuplex/io/src/AWSCommon.cc +++ b/tuplex/io/src/AWSCommon.cc @@ -33,6 +33,12 @@ static std::string throw_if_missing_envvar(const std::string &name) { static bool isAWSInitialized = false; static Aws::SDKOptions aws_options; +namespace tuplex { + void setExternalAwssdk(bool overrideAwssdkInitializedValue) { + isAWSInitialized = overrideAwssdkInitializedValue; + } +} + // for Lambda, check: https://docs.aws.amazon.com/code-samples/latest/catalog/cpp-lambda-lambda_example.cpp.html // https://sdk.amazonaws.com/cpp/api/LATEST/class_aws_1_1_utils_1_1_logging_1_1_formatted_log_system.html diff --git a/tuplex/python/include/PythonCommon.h b/tuplex/python/include/PythonCommon.h index f34a4b832..205f2b63d 100644 --- a/tuplex/python/include/PythonCommon.h +++ b/tuplex/python/include/PythonCommon.h @@ -154,6 +154,19 @@ namespace tuplex { return py::cast(listObj); } + + extern py::object getPythonVersion(); + +#ifndef BUILD_WITH_AWS + // if not building with aws, define dummy function, else this function lives in AWSCommon.h/cc + /*! + * Use this function to suggest to Tuplex the state of the AWS SDK, e.g. if in the process the + * Aws sdk is already initialized in some form. + * @param overrideAwssdkInitializedValue + */ + [[nodiscard]] inline void setExternalAwssdk(bool overrideAwssdkInitializedValue) { + } +#endif } #endif //TUPLEX_PYTHONCOMMON_H diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index 7909e5e8f..8100e6f6d 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -39,6 +39,24 @@ PYMODULE { m.attr("__version__") = "dev"; #endif + // Perform cleanup (e.g., AWS SDK shutdown if necessary to await endless loop) + // Register a callback function that is invoked when the BaseClass object is collected + // cf. https://pybind11.readthedocs.io/en/stable/advanced/misc.html + auto cleanup_callback = []() { + // perform cleanup here -- this function is called with the GIL held + // std::cout<<"Pybind11 clean up call here."<(m, "_DataSet") @@ -94,4 +112,8 @@ PYMODULE { m.def("registerLoggingCallback", &tuplex::registerPythonLoggingCallback); m.def("registerWithInterpreter", &python::registerWithInterpreter); + + m.def("getPythonVersion", &tuplex::getPythonVersion); + + m.def("setExternalAwssdk", &tuplex::setExternalAwssdk); } \ No newline at end of file diff --git a/tuplex/python/src/PythonCommon.cc b/tuplex/python/src/PythonCommon.cc index 6e064ff9f..c57fef42f 100644 --- a/tuplex/python/src/PythonCommon.cc +++ b/tuplex/python/src/PythonCommon.cc @@ -20,6 +20,14 @@ backward::SignalHandling sh; #endif namespace tuplex { + + py::object getPythonVersion() { + std::stringstream ss; + ss< None: Call this function BEFORE initializing the _Context object from the tuplex C extension object.""" # PyArrow always initializes AWS SDK. Because Tuplex may initialize it as well, # skip in the presence of the arrow lib being loaded - # the AWS SDK initialization. + # the AWS SDK initialization on macos. It doesn't seem to be a problem on linux. - loaded_shared_objects = dllist() - pyarrow_loaded = any("pyarrow/lib" in path for path in loaded_shared_objects) + if os.name == "posix" and sys.platform == "darwin": + loaded_shared_objects = dllist() + pyarrow_loaded = any("pyarrow/lib" in path for path in loaded_shared_objects) - if pyarrow_loaded: - from tuplex.libexec.tuplex import setExternalAwssdk + if pyarrow_loaded: + from tuplex.libexec.tuplex import setExternalAwssdk - # Calling this function will prevent Tuplex from calling initAWSSDK and shutdownAWSSDK. - setExternalAwssdk(True) \ No newline at end of file + # Calling this function will prevent Tuplex from calling initAWSSDK and shutdownAWSSDK. + setExternalAwssdk(True) \ No newline at end of file From 89066ffa86af07dcc33bb22bd51d4c9f35f19d3b Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Wed, 12 Mar 2025 20:25:04 -0700 Subject: [PATCH 03/14] script fix --- scripts/macos/install_antlr4_cpp_runtime.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/macos/install_antlr4_cpp_runtime.sh b/scripts/macos/install_antlr4_cpp_runtime.sh index 194ae5397..777a83495 100644 --- a/scripts/macos/install_antlr4_cpp_runtime.sh +++ b/scripts/macos/install_antlr4_cpp_runtime.sh @@ -58,6 +58,6 @@ git clone https://github.com/antlr/antlr4.git \ ls -l $PREFIX/include ls -l $PREFIX/lib -cp lib/libantlr4-runtime.dylib /Users/runner/work/tuplex/tuplex/libantlr4-runtime.dylib +cp $PREFIX/lib/libantlr4-runtime.dylib /Users/runner/work/tuplex/tuplex/libantlr4-runtime.dylib || echo "cp failed." exit 0 From d419bfe863c4393e0d23a5acde8d617c6ab301be Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Wed, 12 Mar 2025 20:45:50 -0700 Subject: [PATCH 04/14] compile fix --- tuplex/python/src/PythonCommon.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tuplex/python/src/PythonCommon.cc b/tuplex/python/src/PythonCommon.cc index c57fef42f..a510ecf7c 100644 --- a/tuplex/python/src/PythonCommon.cc +++ b/tuplex/python/src/PythonCommon.cc @@ -59,8 +59,4 @@ namespace tuplex { // return None return py::none(); } - - void setExternalAwssdk(bool overrideAwssdkInitializedValue) { - - } } \ No newline at end of file From 3678c882bf9d3c0c7c25ad4867eed2d495c413c8 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Wed, 12 Mar 2025 20:54:39 -0700 Subject: [PATCH 05/14] ruff format --- tuplex/python/tuplex/utils/common.py | 3 +- tuplex/python/tuplex/utils/dllist.py | 246 +++++++++++++++------------ 2 files changed, 135 insertions(+), 114 deletions(-) diff --git a/tuplex/python/tuplex/utils/common.py b/tuplex/python/tuplex/utils/common.py index 069d74714..886354e2c 100644 --- a/tuplex/python/tuplex/utils/common.py +++ b/tuplex/python/tuplex/utils/common.py @@ -1080,6 +1080,7 @@ def ensure_webui(options: dict) -> None: if os.path.isfile(gunicorn_logpath) and "localhost" == webui_url: log_gunicorn_errors(gunicorn_logpath) + def pyarrow_aws_sdk_cpp_fix() -> None: """Help fix issue of pyarrow (frequent because pyarrow seems to be shipped very often) Call this function BEFORE initializing the _Context object from the tuplex C extension object.""" @@ -1095,4 +1096,4 @@ def pyarrow_aws_sdk_cpp_fix() -> None: from tuplex.libexec.tuplex import setExternalAwssdk # Calling this function will prevent Tuplex from calling initAWSSDK and shutdownAWSSDK. - setExternalAwssdk(True) \ No newline at end of file + setExternalAwssdk(True) diff --git a/tuplex/python/tuplex/utils/dllist.py b/tuplex/python/tuplex/utils/dllist.py index b8452efa5..0978bdc79 100644 --- a/tuplex/python/tuplex/utils/dllist.py +++ b/tuplex/python/tuplex/utils/dllist.py @@ -1,4 +1,3 @@ - # This is code schedule to be part of Python 3.14+ from https://github.com/python/cpython/pull/122946/files, ported back here for usagge to avoid pyarrow bug # where AWS SDK is always initialized and thus may lead to issues with tuplex. @@ -6,11 +5,12 @@ import shutil import subprocess import sys +from typing import Any, List, Optional, Union # find_library(name) returns the pathname of a library, or None. if os.name == "nt": - def _get_build_version(): + def _get_build_version() -> Union[None, int, float]: """Return the version of MSVC that was used to build Python. For Python 2.3 and up, the version number is included in @@ -35,16 +35,16 @@ def _get_build_version(): # else we don't know what version of the compiler this is return None - def find_msvcrt(): + def find_msvcrt() -> Optional[str]: """Return the name of the VC runtime dll""" version = _get_build_version() if version is None: # better be safe than sorry return None if version <= 6: - clibname = 'msvcrt' + clibname = "msvcrt" elif version <= 13: - clibname = 'msvcr%d' % (version * 10) + clibname = "msvcr%d" % (version * 10) else: # CRT is no longer directly loadable. See issue23606 for the # discussion about alternative approaches. @@ -52,15 +52,16 @@ def find_msvcrt(): # If python was built with in debug mode import importlib.machinery - if '_d.pyd' in importlib.machinery.EXTENSION_SUFFIXES: - clibname += 'd' - return clibname+'.dll' - def find_library(name): - if name in ('c', 'm'): + if "_d.pyd" in importlib.machinery.EXTENSION_SUFFIXES: + clibname += "d" + return clibname + ".dll" + + def find_library(name: str) -> Optional[str]: + if name in ("c", "m"): return find_msvcrt() # See MSDN for the REAL search order. - for directory in os.environ['PATH'].split(os.pathsep): + for directory in os.environ["PATH"].split(os.pathsep): fname = os.path.join(directory, name) if os.path.isfile(fname): return fname @@ -77,7 +78,7 @@ def find_library(name): import ctypes from ctypes import wintypes - _kernel32 = ctypes.WinDLL('kernel32', use_last_error=True) + _kernel32 = ctypes.WinDLL("kernel32", use_last_error=True) _get_current_process = _kernel32["GetCurrentProcess"] _get_current_process.restype = wintypes.HANDLE @@ -89,7 +90,7 @@ def find_library(name): wintypes.DWORD, ) - _psapi = ctypes.WinDLL('psapi', use_last_error=True) + _psapi = ctypes.WinDLL("psapi", use_last_error=True) _enum_process_modules = _psapi["EnumProcessModules"] _enum_process_modules.restype = wintypes.BOOL _enum_process_modules.argtypes = ( @@ -99,23 +100,21 @@ def find_library(name): wintypes.LPDWORD, ) - def _get_module_filename(module: wintypes.HMODULE): - name = (wintypes.WCHAR * 32767)() # UNICODE_STRING_MAX_CHARS + def _get_module_filename(module: wintypes.HMODULE) -> Optional[str]: + name = (wintypes.WCHAR * 32767)() # UNICODE_STRING_MAX_CHARS if _k32_get_module_file_name(module, name, len(name)): return name.value return None - - def _get_module_handles(): + def _get_module_handles() -> List[Any]: process = _get_current_process() space_needed = wintypes.DWORD() n = 1024 while True: modules = (wintypes.HMODULE * n)() - if not _enum_process_modules(process, - modules, - ctypes.sizeof(modules), - ctypes.byref(space_needed)): + if not _enum_process_modules( + process, modules, ctypes.sizeof(modules), ctypes.byref(space_needed) + ): err = ctypes.get_last_error() msg = ctypes.FormatError(err).strip() raise ctypes.WinError(err, f"EnumProcessModules failed: {msg}") @@ -123,19 +122,23 @@ def _get_module_handles(): if n <= len(modules): return modules[:n] - def dllist(): + def dllist() -> List[str]: """Return a list of loaded shared libraries in the current process.""" modules = _get_module_handles() - libraries = [name for h in modules - if (name := _get_module_filename(h)) is not None] + libraries = [ + name for h in modules if (name := _get_module_filename(h)) is not None + ] return libraries elif os.name == "posix" and sys.platform in {"darwin", "ios", "tvos", "watchos"}: from ctypes.macholib.dyld import dyld_find as _dyld_find - def find_library(name): - possible = ['lib%s.dylib' % name, - '%s.dylib' % name, - '%s.framework/%s' % (name, name)] + + def find_library(name: str) -> Optional[str]: + possible = [ + "lib%s.dylib" % name, + "%s.dylib" % name, + "%s.framework/%s" % (name, name), + ] for name in possible: try: return _dyld_find(name) @@ -151,11 +154,14 @@ def find_library(name): _dyld_get_image_name = _libc["_dyld_get_image_name"] _dyld_get_image_name.restype = ctypes.c_char_p - def dllist(): + def dllist() -> List[str]: """Return a list of loaded shared libraries in the current process.""" num_images = _libc._dyld_image_count() - libraries = [os.fsdecode(name) for i in range(num_images) - if (name := _dyld_get_image_name(i)) is not None] + libraries = [ + os.fsdecode(name) + for i in range(num_images) + if (name := _dyld_get_image_name(i)) is not None + ] return libraries @@ -169,7 +175,8 @@ def dllist(): from ctypes._aix import find_library elif sys.platform == "android": - def find_library(name): + + def find_library(name: str) -> Optional[str]: directory = "/system/lib" if "64" in os.uname().machine: directory += "64" @@ -179,43 +186,43 @@ def find_library(name): elif os.name == "posix": # Andreas Degert's find functions, using gcc, /sbin/ldconfig, objdump - import re, tempfile + import re + import tempfile - def _is_elf(filename): + def _is_elf(filename: str) -> bool: "Return True if the given file is an ELF file" - elf_header = b'\x7fELF' + elf_header = b"\x7fELF" try: - with open(filename, 'br') as thefile: + with open(filename, "br") as thefile: return thefile.read(4) == elf_header except FileNotFoundError: return False - def _findLib_gcc(name): + def _findLib_gcc(name: str) -> Optional[str]: # Run GCC's linker with the -t (aka --trace) option and examine the # library name it prints out. The GCC command will fail because we # haven't supplied a proper program with main(), but that does not # matter. - expr = os.fsencode(r'[^\(\)\s]*lib%s\.[^\(\)\s]*' % re.escape(name)) + expr = os.fsencode(r"[^\(\)\s]*lib%s\.[^\(\)\s]*" % re.escape(name)) - c_compiler = shutil.which('gcc') + c_compiler = shutil.which("gcc") if not c_compiler: - c_compiler = shutil.which('cc') + c_compiler = shutil.which("cc") if not c_compiler: # No C compiler available, give up return None temp = tempfile.NamedTemporaryFile() try: - args = [c_compiler, '-Wl,-t', '-o', temp.name, '-l' + name] + args = [c_compiler, "-Wl,-t", "-o", temp.name, "-l" + name] env = dict(os.environ) - env['LC_ALL'] = 'C' - env['LANG'] = 'C' + env["LC_ALL"] = "C" + env["LANG"] = "C" try: - proc = subprocess.Popen(args, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - env=env) + proc = subprocess.Popen( + args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env + ) except OSError: # E.g. bad executable return None with proc: @@ -239,51 +246,55 @@ def _findLib_gcc(name): continue return os.fsdecode(file) - if sys.platform == "sunos5": # use /usr/ccs/bin/dump on solaris - def _get_soname(f): + def _get_soname(f: Any) -> Optional[str]: if not f: return None try: - proc = subprocess.Popen(("/usr/ccs/bin/dump", "-Lpv", f), - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL) + proc = subprocess.Popen( + ("/usr/ccs/bin/dump", "-Lpv", f), + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ) except OSError: # E.g. command not found return None with proc: data = proc.stdout.read() - res = re.search(br'\[.*\]\sSONAME\s+([^\s]+)', data) + res = re.search(rb"\[.*\]\sSONAME\s+([^\s]+)", data) if not res: return None return os.fsdecode(res.group(1)) else: - def _get_soname(f): + + def _get_soname(f: Any) -> Optional[str]: # assuming GNU binutils / ELF if not f: return None - objdump = shutil.which('objdump') + objdump = shutil.which("objdump") if not objdump: # objdump is not available, give up return None try: - proc = subprocess.Popen((objdump, '-p', '-j', '.dynamic', f), - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL) + proc = subprocess.Popen( + (objdump, "-p", "-j", ".dynamic", f), + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ) except OSError: # E.g. bad executable return None with proc: dump = proc.stdout.read() - res = re.search(br'\sSONAME\s+([^\s]+)', dump) + res = re.search(rb"\sSONAME\s+([^\s]+)", dump) if not res: return None return os.fsdecode(res.group(1)) if sys.platform.startswith(("freebsd", "openbsd", "dragonfly")): - def _num_version(libname): + def _num_version(libname: str) -> List[int]: # "libxyz.so.MAJOR.MINOR" => [ MAJOR, MINOR ] parts = libname.split(b".") nums = [] @@ -294,17 +305,19 @@ def _num_version(libname): pass return nums or [sys.maxsize] - def find_library(name): + def find_library(name: str) -> Optional[str]: ename = re.escape(name) - expr = r':-l%s\.\S+ => \S*/(lib%s\.\S+)' % (ename, ename) + expr = r":-l%s\.\S+ => \S*/(lib%s\.\S+)" % (ename, ename) expr = os.fsencode(expr) try: - proc = subprocess.Popen(('/sbin/ldconfig', '-r'), - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL) + proc = subprocess.Popen( + ("/sbin/ldconfig", "-r"), + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ) except OSError: # E.g. command not found - data = b'' + data = b"" else: with proc: data = proc.stdout.read() @@ -317,30 +330,29 @@ def find_library(name): elif sys.platform == "sunos5": - def _findLib_crle(name, is64): - if not os.path.exists('/usr/bin/crle'): + def _findLib_crle(name: str, is64: bool) -> Optional[str]: + if not os.path.exists("/usr/bin/crle"): return None env = dict(os.environ) - env['LC_ALL'] = 'C' + env["LC_ALL"] = "C" if is64: - args = ('/usr/bin/crle', '-64') + args = ("/usr/bin/crle", "-64") else: - args = ('/usr/bin/crle',) + args = ("/usr/bin/crle",) paths = None try: - proc = subprocess.Popen(args, - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL, - env=env) + proc = subprocess.Popen( + args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, env=env + ) except OSError: # E.g. bad executable return None with proc: for line in proc.stdout: line = line.strip() - if line.startswith(b'Default Library Path (ELF):'): + if line.startswith(b"Default Library Path (ELF):"): paths = os.fsdecode(line).split()[4] if not paths: @@ -353,55 +365,61 @@ def _findLib_crle(name, is64): return None - def find_library(name, is64 = False): + def find_library(name: str, is64: bool = False) -> Optional[str]: return _get_soname(_findLib_crle(name, is64) or _findLib_gcc(name)) else: - def _findSoname_ldconfig(name): + def _findSoname_ldconfig(name: str) -> Optional[str]: import struct - if struct.calcsize('l') == 4: - machine = os.uname().machine + '-32' + + if struct.calcsize("l") == 4: + machine = os.uname().machine + "-32" else: - machine = os.uname().machine + '-64' + machine = os.uname().machine + "-64" mach_map = { - 'x86_64-64': 'libc6,x86-64', - 'ppc64-64': 'libc6,64bit', - 'sparc64-64': 'libc6,64bit', - 's390x-64': 'libc6,64bit', - 'ia64-64': 'libc6,IA-64', + "x86_64-64": "libc6,x86-64", + "ppc64-64": "libc6,64bit", + "sparc64-64": "libc6,64bit", + "s390x-64": "libc6,64bit", + "ia64-64": "libc6,IA-64", } - abi_type = mach_map.get(machine, 'libc6') + abi_type = mach_map.get(machine, "libc6") # XXX assuming GLIBC's ldconfig (with option -p) - regex = r'\s+(lib%s\.[^\s]+)\s+\(%s' + regex = r"\s+(lib%s\.[^\s]+)\s+\(%s" regex = os.fsencode(regex % (re.escape(name), abi_type)) try: - with subprocess.Popen(['/sbin/ldconfig', '-p'], - stdin=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - stdout=subprocess.PIPE, - env={'LC_ALL': 'C', 'LANG': 'C'}) as p: + with subprocess.Popen( + ["/sbin/ldconfig", "-p"], + stdin=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + stdout=subprocess.PIPE, + env={"LC_ALL": "C", "LANG": "C"}, + ) as p: res = re.search(regex, p.stdout.read()) if res: return os.fsdecode(res.group(1)) except OSError: pass - def _findLib_ld(name): + def _findLib_ld(name: str) -> Optional[str]: # See issue #9998 for why this is needed - expr = r'[^\(\)\s]*lib%s\.[^\(\)\s]*' % re.escape(name) - cmd = ['ld', '-t'] - libpath = os.environ.get('LD_LIBRARY_PATH') + expr = r"[^\(\)\s]*lib%s\.[^\(\)\s]*" % re.escape(name) + cmd = ["ld", "-t"] + libpath = os.environ.get("LD_LIBRARY_PATH") if libpath: - for d in libpath.split(':'): - cmd.extend(['-L', d]) - cmd.extend(['-o', os.devnull, '-l%s' % name]) + for d in libpath.split(":"): + cmd.extend(["-L", d]) + cmd.extend(["-o", os.devnull, "-l%s" % name]) result = None try: - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True) + p = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ) out, _ = p.communicate() res = re.findall(expr, os.fsdecode(out)) for file in res: @@ -415,10 +433,13 @@ def _findLib_ld(name): pass # result will be None return result - def find_library(name): + def find_library(name: str) -> Optional[str]: # See issue #9998 - return _findSoname_ldconfig(name) or \ - _get_soname(_findLib_gcc(name)) or _get_soname(_findLib_ld(name)) + return ( + _findSoname_ldconfig(name) + or _get_soname(_findLib_gcc(name)) + or _get_soname(_findLib_ld(name)) + ) # Listing loaded libraries on other systems will try to use @@ -428,9 +449,9 @@ def find_library(name): # https://man.freebsd.org/cgi/man.cgi?query=dl_iterate_phdr # https://man.openbsd.org/dl_iterate_phdr # https://docs.oracle.com/cd/E88353_01/html/E37843/dl-iterate-phdr-3c.html -if (os.name == "posix" and - sys.platform not in {"darwin", "ios", "tvos", "watchos"}): +if os.name == "posix" and sys.platform not in {"darwin", "ios", "tvos", "watchos"}: import ctypes + if hasattr((_libc := ctypes.CDLL(None)), "dl_iterate_phdr"): class _dl_phdr_info(ctypes.Structure): @@ -449,7 +470,7 @@ class _dl_phdr_info(ctypes.Structure): ) @_dl_phdr_callback - def _info_callback(info, _size, data): + def _info_callback(info, _size, data) -> int: # noqa: ANN001 libraries = data.contents.value name = os.fsdecode(info.contents.dlpi_name) libraries.append(name) @@ -462,11 +483,10 @@ def _info_callback(info, _size, data): ] _dl_iterate_phdr.restype = ctypes.c_int - def dllist(): + def dllist() -> List[str]: """Return a list of loaded shared libraries in the current process.""" libraries = [] - _dl_iterate_phdr(_info_callback, - ctypes.byref(ctypes.py_object(libraries))) + _dl_iterate_phdr(_info_callback, ctypes.byref(ctypes.py_object(libraries))) return libraries -################################################################ \ No newline at end of file +################################################################ From df82443b47c8ba70d883a0325e9f1565fb2dfa61 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Wed, 12 Mar 2025 21:54:51 -0700 Subject: [PATCH 06/14] new try --- scripts/azure/install_azure_ci_reqs.sh | 2 +- scripts/docker/ci/install_tuplex_reqs.sh | 2 +- scripts/generate_scripts.py | 2 +- scripts/macos/install_aws-sdk-cpp.sh | 2 +- scripts/ubuntu2004/install_requirements.sh | 2 +- scripts/ubuntu2204/install_requirements.sh | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/azure/install_azure_ci_reqs.sh b/scripts/azure/install_azure_ci_reqs.sh index ad9b9ba01..a455f1a34 100644 --- a/scripts/azure/install_azure_ci_reqs.sh +++ b/scripts/azure/install_azure_ci_reqs.sh @@ -114,7 +114,7 @@ mkdir -p ${WORKDIR}/antlr && cd ${WORKDIR}/antlr \ && make -j$(nproc) && make install mkdir -p ${WORKDIR}/aws && cd ${WORKDIR}/aws \ && git clone --recurse-submodules https://github.com/aws/aws-sdk-cpp.git \ -&& cd aws-sdk-cpp && git checkout tags/1.11.164 && mkdir build && cd build \ +&& cd aws-sdk-cpp && git checkout tags/1.11.524 && mkdir build && cd build \ && cmake -DCMAKE_BUILD_TYPE=Release -DUSE_OPENSSL=ON -DENABLE_TESTING=OFF -DENABLE_UNITY_BUILD=ON -DCPP_STANDARD=17 -DBUILD_SHARED_LIBS=OFF -DBUILD_ONLY="s3;core;lambda;transfer" -DCMAKE_INSTALL_PREFIX=${PREFIX} .. \ && make -j$(nproc) \ && make install diff --git a/scripts/docker/ci/install_tuplex_reqs.sh b/scripts/docker/ci/install_tuplex_reqs.sh index 37dc0d2a9..c695b3ead 100644 --- a/scripts/docker/ci/install_tuplex_reqs.sh +++ b/scripts/docker/ci/install_tuplex_reqs.sh @@ -4,7 +4,7 @@ set -euxo pipefail # dependency versions -AWSSDK_CPP_VERSION=1.11.164 +AWSSDK_CPP_VERSION=1.11.524 ANTLR4_VERSION=4.13.1 YAML_CPP_VERSION=0.8.0 AWS_LAMBDA_CPP_VERSION=0.2.8 diff --git a/scripts/generate_scripts.py b/scripts/generate_scripts.py index 3fd45b684..42ca54653 100755 --- a/scripts/generate_scripts.py +++ b/scripts/generate_scripts.py @@ -23,7 +23,7 @@ def configure_versions(osname): VERSIONS['YAMLCPP_VERSION'] = '0.8.0' VERSIONS['CELERO_VERSION'] = '2.8.3' VERSIONS['ANTLR_VERSION'] = '4.13.1' - VERSIONS['AWSSDK_VERSION'] = '1.11.164' + VERSIONS['AWSSDK_VERSION'] = '1.11.' VERSIONS['AWSLAMBDACPP_VERSION'] = '0.2.8' VERSIONS['PCRE2_VERSION'] = '10.42' VERSIONS['PROTOBUF_VERSION'] = '24.3' diff --git a/scripts/macos/install_aws-sdk-cpp.sh b/scripts/macos/install_aws-sdk-cpp.sh index 2e6aa7a29..bdf7fadd9 100755 --- a/scripts/macos/install_aws-sdk-cpp.sh +++ b/scripts/macos/install_aws-sdk-cpp.sh @@ -3,7 +3,7 @@ set -euxo pipefail PREFIX=${PREFIX:-/usr/local} -AWSSDK_CPP_VERSION=1.11.164 +AWSSDK_CPP_VERSION=1.11.524 # need at least 1.11.267 because of pyarrow bugs... # check if dir exists (i.e. restored from cache, then skip) if [ -d "${PREFIX}/include/aws" ]; then diff --git a/scripts/ubuntu2004/install_requirements.sh b/scripts/ubuntu2004/install_requirements.sh index 2f36e1b3a..3d3aab12a 100755 --- a/scripts/ubuntu2004/install_requirements.sh +++ b/scripts/ubuntu2004/install_requirements.sh @@ -137,7 +137,7 @@ mkdir -p ${WORKDIR}/protobuf && cd ${WORKDIR}/protobuf && git clone -b v24.3 htt echo ">> Installing AWS SDK" mkdir -p ${WORKDIR}/aws && cd ${WORKDIR}/aws \ && git clone --recurse-submodules https://github.com/aws/aws-sdk-cpp.git \ -&& cd aws-sdk-cpp && git checkout tags/1.11.164 && mkdir build && cd build \ +&& cd aws-sdk-cpp && git checkout tags/1.11.524 && mkdir build && cd build \ && cmake -DCMAKE_BUILD_TYPE=Release -DUSE_OPENSSL=ON -DENABLE_TESTING=OFF -DENABLE_UNITY_BUILD=ON -DCPP_STANDARD=17 -DBUILD_SHARED_LIBS=OFF -DBUILD_ONLY="s3;core;lambda;transfer" -DCMAKE_INSTALL_PREFIX=${PREFIX} .. \ && make -j$(nproc) \ && make install diff --git a/scripts/ubuntu2204/install_requirements.sh b/scripts/ubuntu2204/install_requirements.sh index 154c83a71..ab57ab87a 100755 --- a/scripts/ubuntu2204/install_requirements.sh +++ b/scripts/ubuntu2204/install_requirements.sh @@ -133,7 +133,7 @@ mkdir -p ${WORKDIR}/protobuf && cd ${WORKDIR}/protobuf && git clone -b v24.3 htt echo ">> Installing AWS SDK" mkdir -p ${WORKDIR}/aws && cd ${WORKDIR}/aws \ && git clone --recurse-submodules https://github.com/aws/aws-sdk-cpp.git \ -&& cd aws-sdk-cpp && git checkout tags/1.11.164 && mkdir build && cd build \ +&& cd aws-sdk-cpp && git checkout tags/1.11.524 && mkdir build && cd build \ && cmake -DCMAKE_BUILD_TYPE=Release -DUSE_OPENSSL=ON -DENABLE_TESTING=OFF -DENABLE_UNITY_BUILD=ON -DCPP_STANDARD=17 -DBUILD_SHARED_LIBS=OFF -DBUILD_ONLY="s3;core;lambda;transfer" -DCMAKE_INSTALL_PREFIX=${PREFIX} .. \ && make -j$(nproc) \ && make install From 11dc83fbc9a679a9af9f5a53e9a19000438d9421 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Thu, 13 Mar 2025 18:54:09 -0700 Subject: [PATCH 07/14] brewed aws sdk to check whether compatible with pyarrow --- .github/workflows/build_wheels.yml | 2 +- scripts/macos/brew_dependencies.sh | 5 ++--- tuplex/CMakeLists.txt | 1 + 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 2e551f515..41981b6ff 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -119,7 +119,7 @@ jobs: CIBW_BUILD: ${{ matrix.cibw-build }} # macOS dependencies separate, for Linux use docker tuplex/ci:3.x images. - CIBW_BEFORE_ALL_MACOS: bash ./scripts/macos/install_antlr4_cpp_runtime.sh && bash ./scripts/macos/brew_dependencies.sh && bash ./scripts/macos/install_aws-sdk-cpp.sh && echo 'export PATH="/usr/local/opt/openjdk@11/bin:$PATH"' >> /Users/runner/.bash_profile + CIBW_BEFORE_ALL_MACOS: bash ./scripts/macos/install_antlr4_cpp_runtime.sh && bash ./scripts/macos/brew_dependencies.sh && echo 'export PATH="/usr/local/opt/openjdk@11/bin:$PATH"' >> /Users/runner/.bash_profile # If CI complains about missing /usr/local/libexec/git-core/git-remote-https: error while loading shared libraries: libssl.so.3: cannot open shared object file: No such file or directory # the OpenSSL3 lib is stored under /usr/local/lib64. diff --git a/scripts/macos/brew_dependencies.sh b/scripts/macos/brew_dependencies.sh index c2882af16..efc15c75b 100755 --- a/scripts/macos/brew_dependencies.sh +++ b/scripts/macos/brew_dependencies.sh @@ -2,9 +2,8 @@ # This script installs all required dependencies via brew # for instructions on how to install brew, visit https://brew.sh/ - # brew doesn't provide llvm@16 bottle anymore for big sur, but python3.8 only works with big sur tags. use llvm@15 instead -brew install openjdk@11 cmake coreutils protobuf zstd zlib libmagic llvm@15 pcre2 gflags yaml-cpp celero wget boost googletest libdwarf libelf +brew install openjdk@11 cmake coreutils protobuf zstd zlib libmagic llvm@15 pcre2 gflags yaml-cpp celero wget boost googletest libdwarf libelf aws-sdk-cpp # link (when e.g. used from restoring cache) -brew link --overwrite cmake coreutils protobuf zstd zlib libmagic llvm@15 pcre2 gflags yaml-cpp celero wget boost googletest libdwarf libelf abseil +brew link --overwrite cmake coreutils protobuf zstd zlib libmagic llvm@15 pcre2 gflags yaml-cpp celero wget boost googletest libdwarf libelf abseil aws-sdk-cpp diff --git a/tuplex/CMakeLists.txt b/tuplex/CMakeLists.txt index 1da90748b..dff4df040 100755 --- a/tuplex/CMakeLists.txt +++ b/tuplex/CMakeLists.txt @@ -325,6 +325,7 @@ if(BUILD_WITH_AWS) endif() endif() find_package(AWSSDK REQUIRED COMPONENTS s3 core lambda transfer) + message(STATUS "AWS SDK version: ${AWSSDK_VERSION}") message(STATUS "AWS libs: ${AWSSDK_LINK_LIBRARIES}") message(STATUS "AWS include dirs: ${AWSSDK_INCLUDE_DIR}") if(AWSSDK_FOUND) From 89196748e9fb6486058db48fd5ca222bbad0ef69 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Thu, 13 Mar 2025 20:57:17 -0700 Subject: [PATCH 08/14] warn message --- tuplex/python/tuplex/utils/common.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tuplex/python/tuplex/utils/common.py b/tuplex/python/tuplex/utils/common.py index 886354e2c..67a0fbfb7 100644 --- a/tuplex/python/tuplex/utils/common.py +++ b/tuplex/python/tuplex/utils/common.py @@ -1093,7 +1093,18 @@ def pyarrow_aws_sdk_cpp_fix() -> None: pyarrow_loaded = any("pyarrow/lib" in path for path in loaded_shared_objects) if pyarrow_loaded: - from tuplex.libexec.tuplex import setExternalAwssdk - - # Calling this function will prevent Tuplex from calling initAWSSDK and shutdownAWSSDK. - setExternalAwssdk(True) + import pyarrow as pa + + pyarrow_version = [int(v) for v in pa.__version__.split(".")] + + # PyArrow has since v13+ a bug with crashes other libraries due to bad use of AWS SDK. + # cf. https://github.com/aws/aws-sdk-cpp/issues/2699 which has been merged, + # but whose solution has not been reflected in pyarrow yet. + # Display to user actionable usage on what to do. + if pyarrow_version[0] >= 13: + raise RuntimeError( + "PyArrow {pa.__version__} present in process and loaded or imported before tuplex." + " If you need to import/load pyarrow first, only compatible with pyarrow versions < 13.0.0." + " If you must use pyarrow >= 13.0.0, import tuplex first and then load pyarrow. " + "Note that pyarrow < 13.0.0 is not compatible with numpy >= 2.0." + ) From 82f158a1403258f5e7be6829d1f6cb473d76468a Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Thu, 13 Mar 2025 20:58:52 -0700 Subject: [PATCH 09/14] refactor, update comment --- tuplex/python/tuplex/context.py | 4 ++-- tuplex/python/tuplex/utils/common.py | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tuplex/python/tuplex/context.py b/tuplex/python/tuplex/context.py index b088a4344..db0e42a64 100644 --- a/tuplex/python/tuplex/context.py +++ b/tuplex/python/tuplex/context.py @@ -11,7 +11,7 @@ import logging -from .utils.common import pyarrow_aws_sdk_cpp_fix +from .utils.common import pyarrow_aws_sdk_cpp_check try: from .libexec.tuplex import _Context, getDefaultOptionsAsJSON @@ -234,7 +234,7 @@ def __init__( del options["webui"] # Ensure no crash due to PyArrow potentially being present. - pyarrow_aws_sdk_cpp_fix() + pyarrow_aws_sdk_cpp_check() # last arg are the options as json string serialized b.c. of boost python problems self._context = _Context(name, runtime_path, json.dumps(options)) diff --git a/tuplex/python/tuplex/utils/common.py b/tuplex/python/tuplex/utils/common.py index 67a0fbfb7..1ff0a6583 100644 --- a/tuplex/python/tuplex/utils/common.py +++ b/tuplex/python/tuplex/utils/common.py @@ -1081,12 +1081,11 @@ def ensure_webui(options: dict) -> None: log_gunicorn_errors(gunicorn_logpath) -def pyarrow_aws_sdk_cpp_fix() -> None: +def pyarrow_aws_sdk_cpp_check() -> None: """Help fix issue of pyarrow (frequent because pyarrow seems to be shipped very often) Call this function BEFORE initializing the _Context object from the tuplex C extension object.""" - # PyArrow always initializes AWS SDK. Because Tuplex may initialize it as well, - # skip in the presence of the arrow lib being loaded - # the AWS SDK initialization on macos. It doesn't seem to be a problem on linux. + # Newer PyArrow versions use a more recent version of the AWS SDK, which leads to pyarrow crashing + # other libraries under macOS. Warn here explicitly about this to avoid a segfault, and provide error. if os.name == "posix" and sys.platform == "darwin": loaded_shared_objects = dllist() From 9beac14f83b0f00a7665060947921f0eff8aa7a9 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Thu, 13 Mar 2025 21:03:15 -0700 Subject: [PATCH 10/14] script fix --- scripts/generate_scripts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generate_scripts.py b/scripts/generate_scripts.py index 42ca54653..46b0e3c32 100755 --- a/scripts/generate_scripts.py +++ b/scripts/generate_scripts.py @@ -23,7 +23,7 @@ def configure_versions(osname): VERSIONS['YAMLCPP_VERSION'] = '0.8.0' VERSIONS['CELERO_VERSION'] = '2.8.3' VERSIONS['ANTLR_VERSION'] = '4.13.1' - VERSIONS['AWSSDK_VERSION'] = '1.11.' + VERSIONS['AWSSDK_VERSION'] = '1.11.524' VERSIONS['AWSLAMBDACPP_VERSION'] = '0.2.8' VERSIONS['PCRE2_VERSION'] = '10.42' VERSIONS['PROTOBUF_VERSION'] = '24.3' From 84e5a6de159625440bee3c7ae680ac26c11b14b8 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Fri, 14 Mar 2025 00:02:58 -0700 Subject: [PATCH 11/14] fix --- tuplex/python/tuplex/utils/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuplex/python/tuplex/utils/common.py b/tuplex/python/tuplex/utils/common.py index 1ff0a6583..b2c3addce 100644 --- a/tuplex/python/tuplex/utils/common.py +++ b/tuplex/python/tuplex/utils/common.py @@ -31,7 +31,7 @@ import psutil import yaml -from .dllist import dllist +from tuplex.utils.dllist import dllist try: import pwd From 35fdd328a1f170725284faffabab50c89fef68b3 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Fri, 14 Mar 2025 00:13:57 -0700 Subject: [PATCH 12/14] reinstall --- scripts/macos/brew_dependencies.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/macos/brew_dependencies.sh b/scripts/macos/brew_dependencies.sh index efc15c75b..fd70790b2 100755 --- a/scripts/macos/brew_dependencies.sh +++ b/scripts/macos/brew_dependencies.sh @@ -3,7 +3,7 @@ # for instructions on how to install brew, visit https://brew.sh/ # brew doesn't provide llvm@16 bottle anymore for big sur, but python3.8 only works with big sur tags. use llvm@15 instead -brew install openjdk@11 cmake coreutils protobuf zstd zlib libmagic llvm@15 pcre2 gflags yaml-cpp celero wget boost googletest libdwarf libelf aws-sdk-cpp +brew reinstall openjdk@11 cmake coreutils protobuf zstd zlib libmagic llvm@15 pcre2 gflags yaml-cpp celero wget boost googletest libdwarf libelf aws-sdk-cpp # link (when e.g. used from restoring cache) brew link --overwrite cmake coreutils protobuf zstd zlib libmagic llvm@15 pcre2 gflags yaml-cpp celero wget boost googletest libdwarf libelf abseil aws-sdk-cpp From f8e0ba60f720c1e8a4faacbc2ee6cc28c07f42f0 Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Sat, 15 Mar 2025 15:06:17 -0700 Subject: [PATCH 13/14] fix --- .github/workflows/build_wheels.yml | 2 +- scripts/macos/brew_dependencies.sh | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 41981b6ff..2e551f515 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -119,7 +119,7 @@ jobs: CIBW_BUILD: ${{ matrix.cibw-build }} # macOS dependencies separate, for Linux use docker tuplex/ci:3.x images. - CIBW_BEFORE_ALL_MACOS: bash ./scripts/macos/install_antlr4_cpp_runtime.sh && bash ./scripts/macos/brew_dependencies.sh && echo 'export PATH="/usr/local/opt/openjdk@11/bin:$PATH"' >> /Users/runner/.bash_profile + CIBW_BEFORE_ALL_MACOS: bash ./scripts/macos/install_antlr4_cpp_runtime.sh && bash ./scripts/macos/brew_dependencies.sh && bash ./scripts/macos/install_aws-sdk-cpp.sh && echo 'export PATH="/usr/local/opt/openjdk@11/bin:$PATH"' >> /Users/runner/.bash_profile # If CI complains about missing /usr/local/libexec/git-core/git-remote-https: error while loading shared libraries: libssl.so.3: cannot open shared object file: No such file or directory # the OpenSSL3 lib is stored under /usr/local/lib64. diff --git a/scripts/macos/brew_dependencies.sh b/scripts/macos/brew_dependencies.sh index fd70790b2..c2882af16 100755 --- a/scripts/macos/brew_dependencies.sh +++ b/scripts/macos/brew_dependencies.sh @@ -2,8 +2,9 @@ # This script installs all required dependencies via brew # for instructions on how to install brew, visit https://brew.sh/ + # brew doesn't provide llvm@16 bottle anymore for big sur, but python3.8 only works with big sur tags. use llvm@15 instead -brew reinstall openjdk@11 cmake coreutils protobuf zstd zlib libmagic llvm@15 pcre2 gflags yaml-cpp celero wget boost googletest libdwarf libelf aws-sdk-cpp +brew install openjdk@11 cmake coreutils protobuf zstd zlib libmagic llvm@15 pcre2 gflags yaml-cpp celero wget boost googletest libdwarf libelf # link (when e.g. used from restoring cache) -brew link --overwrite cmake coreutils protobuf zstd zlib libmagic llvm@15 pcre2 gflags yaml-cpp celero wget boost googletest libdwarf libelf abseil aws-sdk-cpp +brew link --overwrite cmake coreutils protobuf zstd zlib libmagic llvm@15 pcre2 gflags yaml-cpp celero wget boost googletest libdwarf libelf abseil From 5d996358dec72a11adb91c9ba76f2d9869efe9ff Mon Sep 17 00:00:00 2001 From: Leonhard Spiegelberg Date: Sat, 15 Mar 2025 17:09:10 -0700 Subject: [PATCH 14/14] copy add --- tuplex/python/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tuplex/python/CMakeLists.txt b/tuplex/python/CMakeLists.txt index da912e8e9..abb2bcaf3 100644 --- a/tuplex/python/CMakeLists.txt +++ b/tuplex/python/CMakeLists.txt @@ -144,6 +144,7 @@ FILE(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/__init__.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/tracebacks.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/version.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/globs.py + ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/dllist.py DESTINATION ${PYTHON_DIST_DIR}/tuplex/utils) FILE(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tests/test_tuples.py