tuplex · LeonhardFS · May 20, 2022 · Apr 20, 2022 · Apr 20, 2022 · Apr 20, 2022
diff --git a/tuplex/adapters/cpython/include/PythonHelpers.h b/tuplex/adapters/cpython/include/PythonHelpers.h
@@ -257,17 +257,19 @@ namespace python {
     /*!
      * converts a python object to tuplex object, obj must be not null.
      * @param obj if it can not be mapped to a tuplex type, stored as PYOBJECT (cloudpickled).
+     * @param autoUpcast whether to upcast numeric types to a unified type when types conflict (default: false)
      * @return C++ Tuplex Field object
      */
-    extern tuplex::Field pythonToField(PyObject* obj);
+    extern tuplex::Field pythonToField(PyObject *obj, bool autoUpcast=false);
 
     /*!
      * converts python object to Row using row type supplied in type.
      * @param obj
      * @param type specify what type of objects should be serialized, may contain options.
+     * @param autoUpcast whether to upcast numeric types to a unified type when types conflict (default: false)
      * @return Tuplex C++ row object.
      */
-    extern tuplex::Row pythonToRow(PyObject *obj, const python::Type &type);
+    extern tuplex::Row pythonToRow(PyObject *obj, const python::Type &type, bool autoUpcast=false);
 
     /*!
      * converts a Tuplex C++ object to a python object
@@ -318,10 +320,11 @@ namespace python {
 
     /*!
      * get corresponding tuplex type for python object
-     * @param o
-     * @return
+     * @param o python object to map to Tuplex type
+     * @param autoUpcast whether to upcast numeric types to a unified type when types conflict (default: false)
+     * @return internal Tuplex type corresponding to given python object.
      */
-    extern python::Type mapPythonClassToTuplexType(PyObject *o);
+    extern python::Type mapPythonClassToTuplexType(PyObject *o, bool autoUpcast=false);
 
     /*!
      * Tuplex's python API provides a paramter to (optionally) specify a schema, this functions decodes that PyObject
@@ -391,4 +394,4 @@ namespace python {
 }
 
 
-#endif //TUPLEX_PYTHONHELPERS_H
+#endif //TUPLEX_PYTHONHELPERS_H
diff --git a/tuplex/adapters/cpython/include/PythonSerializer.h b/tuplex/adapters/cpython/include/PythonSerializer.h
@@ -26,10 +26,13 @@ namespace tuplex {
          * @param nextptr ptr position after deserialization
          * @return bool for if deserialization was successful or not
          */
-        extern bool fromSerializedMemory(const uint8_t *ptr, int64_t capacity, const Schema &schema, PyObject **obj,
+        extern bool fromSerializedMemory(const uint8_t *ptr,
+                                         size_t capacity,
+                                         const Schema &schema,
+                                         PyObject **obj,
                                          const uint8_t **nextptr = nullptr);
     }
 
 }
 
-#endif //TUPLEX_PYTHONSERIALIZER_H
+#endif //TUPLEX_PYTHONSERIALIZER_H
diff --git a/tuplex/adapters/cpython/include/PythonSerializer_private.h b/tuplex/adapters/cpython/include/PythonSerializer_private.h
@@ -21,28 +21,38 @@ namespace tuplex {
          * Creates Python object from raw memory (deserialize)
          * @param ptr memory location to where serialized data is
          * @param row_type holding information on types of input memory
+         * @param capacity size of buffer
          * @param bitmap pointer to bitmap (i.e. multiple 64bit blocks)
          * @param index of the element within the bitmap
          * @return Python object holding deserialized elements
          */
-        PyObject *createPyObjectFromMemory(const uint8_t *ptr, const python::Type &row_type, const uint8_t *bitmap = nullptr, int index = 0);
+        PyObject* createPyObjectFromMemory(const uint8_t *ptr, const python::Type &row_type, size_t capacity,
+                                 const uint8_t *bitmap = nullptr, unsigned index = 0);
 
         /*!
          * Creates Python tuple object from raw memory (deserialize)
          * @param ptr memory location to where serialized data is
          * @param row_type holding information on types of input memory
+         * @param capacity size of buffer
          * @return Python object holding deserialized elements
          */
-        PyObject *createPyTupleFromMemory(const uint8_t *ptr, const python::Type &row_type);
+        PyObject *createPyTupleFromMemory(const uint8_t *ptr, const python::Type &row_type, size_t capacity);
 
         PyObject *createPyDictFromMemory(const uint8_t *ptr);
 
-        PyObject *createPyListFromMemory(const uint8_t *ptr, const python::Type &row_type);
+        /*!
+         * Creates Python list object from raw memory (deserialize)
+         * @param ptr memory location to where serialized data is
+         * @param row_type holding information on types of input memory
+         * @param capacity size of buffer
+         * @return Python object holding deserialized elements
+         */
+        PyObject *createPyListFromMemory(const uint8_t *ptr, const python::Type &row_type, size_t capacity);
 
         /*!
          * Checks if capacity for buffer with schema is valid (if it is possible for buffer to hold such data given schema)
          * @param ptr memory location to where serialized data is
-         * @param capacity size of buffer
+         * @param capacity size of buffer, negative values are invalid.
          * @param row_type holding information on types of input memory
          * @return bool for if capacity is valid or not
          */
@@ -55,10 +65,19 @@ namespace tuplex {
          * @param row_type holding information on types of input memory
          * @return -1 if invalid, size of serialized data if valid
          */
-        int64_t checkTupleCapacity(const uint8_t *ptr, int64_t capacity, const python::Type &row_type);
+        int64_t checkTupleCapacity(const uint8_t *ptr, size_t capacity, const python::Type &row_type);
+
+        /*!
+         * map bitmap of the object at ptr to a vector with numElements true/false values
+         * @param objectType current object type that contains optional value
+         * @param ptr memory location of the start of the bitmap (pointer is passed by reference)
+         * @param numElements number of elements in objectType for which bitmap is needed
+         * @return vector of booleans representing a bitmap indicating whether element is null (true) or not (false).
+         */
+        std::vector<bool> getBitmapFromType(const python::Type &objectType, const uint8_t *&ptr, size_t numElements);
     }
 }
 
 
 
-#endif //TUPLEX_PYTHONSERIALIZER_PRIVATE_H
+#endif //TUPLEX_PYTHONSERIALIZER_PRIVATE_H
diff --git a/tuplex/adapters/cpython/src/PythonHelpers.cc b/tuplex/adapters/cpython/src/PythonHelpers.cc
@@ -473,7 +473,7 @@ namespace python {
         return f;
     }
 
-    tuplex::Field pythonToField(PyObject* obj) {
+    tuplex::Field pythonToField(PyObject *obj, bool autoUpcast) {
         using namespace tuplex;
         using namespace std;
 
@@ -498,7 +498,7 @@ namespace python {
             vector<Field> v;
             v.reserve(numElements);
             for(unsigned i = 0; i < numElements; ++i) {
-                v.push_back(pythonToField(PyTuple_GetItem(obj, i)));
+                v.push_back(pythonToField(PyTuple_GetItem(obj, i), autoUpcast));
             }
             return Field(Tuple::from_vector(v));
         } else if(PyBool_Check(obj)) { // important to call this before isinstance long since isinstance also return long for bool
@@ -529,15 +529,15 @@ namespace python {
             if(PyDict_Size(obj) == 0)
                 return Field::empty_dict();
 
-            auto dictType = mapPythonClassToTuplexType(obj);
+            auto dictType = mapPythonClassToTuplexType(obj, autoUpcast);
             std::string dictStr;
             PyObject *key = nullptr, *val = nullptr;
             Py_ssize_t pos = 0;  // must be initialized to 0 to start iteration, however internal iterator variable. Don't use semantically.
             dictStr += "{";
             while(PyDict_Next(obj, &pos, &key, &val)) {
                 // create key
                 auto keyStr = PyString_AsString(PyObject_Str(key));
-                auto keyType = mapPythonClassToTuplexType(key);
+                auto keyType = mapPythonClassToTuplexType(key, autoUpcast);
                 python::Type valType;
 
                 // create value, mimicking cJSON printing standards
@@ -625,7 +625,7 @@ namespace python {
             vector<Field> v;
             v.reserve(numElements);
             for(unsigned i = 0; i < numElements; ++i) {
-                v.push_back(pythonToField(PyList_GET_ITEM(obj, i)));
+                v.push_back(pythonToField(PyList_GET_ITEM(obj, i), autoUpcast));
             }
             return Field(List::from_vector(v));
         } else if(obj == Py_None) {
@@ -660,9 +660,10 @@ namespace python {
      * converts object to field of specified type.
      * @param obj
      * @param type
+     * @param autoUpcast whether to upcast numeric types to a unified type when types conflict (default: false)
      * @return Field object
      */
-    tuplex::Field pythonToField(PyObject *obj, const python::Type &type) {
+    tuplex::Field pythonToField(PyObject *obj, const python::Type &type, bool autoUpcast=false) {
         assert(obj);
 
         // TODO: check assumptions about whether nonempty tuple can be an option
@@ -671,8 +672,15 @@ namespace python {
                 return tuplex::Field::null(type);
             } else {
                 tuplex::Field f;
-                f = pythonToField(obj);
-                f = fieldCastTo(f, type.getReturnType());
+                auto rtType = type.getReturnType();
+                if(rtType.isListType() || rtType.isTupleType()) {
+                    // compound types: recurse with the unwrapped type, it is still needed to construct the field
+                    f = pythonToField(obj, rtType, autoUpcast);
+                } else {
+                    // simple types: convert by the object's own type, cast to the declared type only when upcasting
+                    f = pythonToField(obj, autoUpcast);
+                    if(autoUpcast) f = fieldCastTo(f, rtType);
+                }
                 f.makeOptional();
                 return f;
             }
@@ -682,19 +690,30 @@ namespace python {
 
             std::vector<tuplex::Field> v;
             for(unsigned i = 0; i < numElements; ++i) {
-                v.push_back(pythonToField(PyTuple_GetItem(obj, i), type.parameters()[i]));
+                v.push_back(pythonToField(PyTuple_GetItem(obj, i), type.parameters()[i], autoUpcast));
             }
             return tuplex::Field(tuplex::Tuple::from_vector(v));
+        } else if(type.isListType() && type != python::Type::EMPTYLIST && PyList_Check(obj)) {
+            // typed list: convert each element with the declared element type (non-list objects use the generic branch)
+            const Py_ssize_t numElements = PyList_Size(obj);
+            auto elementType = type.elementType();
+            std::vector<tuplex::Field> v;
+            v.reserve(static_cast<size_t>(numElements));
+            for(Py_ssize_t i = 0; i < numElements; ++i) {
+                // PyList_GetItem returns a borrowed reference: read it, do not change its refcount
+                v.push_back(pythonToField(PyList_GetItem(obj, i), elementType, autoUpcast));
+            }
+            return tuplex::Field(tuplex::List::from_vector(v));
         } else {
-            auto f = pythonToField(obj);
-            return fieldCastTo(f, type);
+            auto f = pythonToField(obj, autoUpcast);
+            return autoUpcast? fieldCastTo(f, type) : f;
         }
     }
 
-    tuplex::Row pythonToRow(PyObject *obj, const python::Type &type) {
+    tuplex::Row pythonToRow(PyObject *obj, const python::Type &type, bool autoUpcast) {
         assert(obj);
 
-        tuplex::Field f = pythonToField(obj, type);
+        tuplex::Field f = pythonToField(obj, type, autoUpcast);
 
         // unpack the tuples one level
         if(f.getType().isTupleType() && f.getType() != python::Type::EMPTYTUPLE) {
@@ -1365,7 +1384,7 @@ namespace python {
     }
 
     // mapping type to internal types, unknown as default
-    python::Type mapPythonClassToTuplexType(PyObject *o) {
+    python::Type mapPythonClassToTuplexType(PyObject *o, bool autoUpcast) {
         if(Py_None == o)
             return python::Type::NULLVALUE;
 
@@ -1393,7 +1412,7 @@ namespace python {
             for(int j = 0; j < numElements; j++) {
                 auto item = PyTuple_GET_ITEM(o, j); // borrowed reference
                 assert(item->ob_refcnt > 0); // important!!!
-                elementTypes.push_back(mapPythonClassToTuplexType(item));
+                elementTypes.push_back(mapPythonClassToTuplexType(item, autoUpcast));
             }
             return python::TypeFactory::instance().createOrGetTupleType(elementTypes);
         }
@@ -1410,8 +1429,8 @@ namespace python {
             Py_ssize_t pos = 0; // must be initialized to 0 to start iteration, however internal iterator variable. Don't use semantically.
             bool types_set = false; // need extra var here b/c vals could be unknown.
             while(PyDict_Next(o, &pos, &key, &val)) {
-                auto curKeyType = mapPythonClassToTuplexType(key);
-                auto curValType = mapPythonClassToTuplexType(val);
+                auto curKeyType = mapPythonClassToTuplexType(key, autoUpcast);
+                auto curValType = mapPythonClassToTuplexType(val, autoUpcast);
                 if(!types_set) {
                     types_set = true;
                     keyType = curKeyType;
@@ -1431,13 +1450,18 @@ namespace python {
             if(numElements == 0)
                 return python::Type::EMPTYLIST;
 
-            python::Type elementType = mapPythonClassToTuplexType(PyList_GetItem(o, 0));
+            python::Type elementType = mapPythonClassToTuplexType(PyList_GetItem(o, 0), autoUpcast);
             // verify that all elements have the same type
             for(int j = 0; j < numElements; j++) {
-                if(elementType != mapPythonClassToTuplexType(PyList_GetItem(o, j))) {
-                    Logger::instance().defaultLogger().error("lists with variable type elements are not supported.");
-                    return python::Type::UNKNOWN;
-                    // TODO: the general case should return python::Type::PyObject in the future
+                python::Type currElementType = mapPythonClassToTuplexType(PyList_GetItem(o, j), autoUpcast);
+                if(elementType != currElementType) {
+                    // possible to use nullable type as element type?
+                    auto newElementType = unifyTypes(elementType, currElementType, autoUpcast);
+                    if (newElementType == python::Type::UNKNOWN) {
+                        Logger::instance().defaultLogger().error("list with variable element type " + elementType.desc() + " and " + currElementType.desc() + " not supported.");
+                        return python::Type::PYOBJECT;
+                    }
+                    elementType = newElementType;
                 }
             }
             return python::Type::makeListType(elementType);
@@ -1500,6 +1524,23 @@ namespace python {
             // typing.Optional[str] which is equal to typing.Union[str, NoneType]
             auto typing_optional = PyDict_GetItemString(typing_dict, "Optional");
             assert(typing_optional);
+            if (t.getReturnType().isTupleType()) {
+#if PY_VERSION_HEX >= 0x03090000
+                // Python 3.9+: subscript builtins.tuple[...], see https://docs.python.org/3/library/typing.html#typing.Tuple
+                auto builtin_mod = PyImport_AddModule("builtins");
+                assert(builtin_mod);
+                auto builtin_dict = PyModule_GetDict(builtin_mod);
+                assert(builtin_dict);
+                auto tuple = PyDict_GetItemString(builtin_dict, "tuple");
+                assert(tuple);
+                tobj = PyObject_GetItem(tuple, tobj);
+#else
+                // older interpreters: use typing.Tuple[...]
+                auto typing_tuple = PyDict_GetItemString(typing_dict, "Tuple");
+                assert(typing_tuple);
+                tobj = PyObject_GetItem(typing_tuple, tobj);
+#endif
+            }
             auto opt_type = PyObject_GetItem(typing_optional, tobj);
             return opt_type;
         }