From 1f1d7bb01a08813e93b22ded1059ae9af1449285 Mon Sep 17 00:00:00 2001
From: KorlaMarch
Date: Sat, 22 Jan 2022 17:37:43 -0500
Subject: [PATCH 01/56] Modify dataset

---
 tuplex/python/tuplex/dataset.py | 82 +++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py
index a2b8c0b33..aa5b1ca12 100644
--- a/tuplex/python/tuplex/dataset.py
+++ b/tuplex/python/tuplex/dataset.py
@@ -28,6 +28,88 @@ class DataSet:
     def __init__(self):
         self._dataSet = None
 
+    def getDataLen(self):
+        data = self.collect()
+        if len(data) == 0:
+            return 0, 0
+        else:
+            return len(data), len(data[0])
+
+    def revTake(self, nRows = 5):
+        return self.collect()[-nRows:]
+
+    def _repr_html_(self):
+        rows_list = self.take()
+        total_row_cnt, total_col_cnt = self.getDataLen()
+        print('rowlist')
+        print(rows_list)
+        if len(rows_list) == 0:
+            header = '<tr></tr>\n'
+            rows = '<tr></tr>\n'
+        else:
+            header = '<th></th>\n'
+
+            if self.columns != None:
+                for x in self.columns:
+                    header += f'  <th>{x}</th>\n'
+            else:
+                for i in range(len(rows_list[0])):
+                    header += f'  <th>column {i + 1}</th>\n'
+
+            rows = ''
+            for i, r in enumerate(rows_list):
+                rows += '  <tr>\n'
+                rows += f'    <th>{i}</th>\n'
+                for data in r:
+                    rows += f'    <td>{data}</td>\n'
+                rows += '  </tr>\n'
+
+            # add the ...
+            rows += '  <tr>\n'
+            rows += '    <th>...</th>\n'
+            for i in range(total_col_cnt):
+                rows += '    <td>...</td>\n'
+            rows += '  </tr>\n'
+
+            lastData = self.revTake()
+            for i, r in enumerate(lastData):
+                rows += '  <tr>\n'
+                rows += f'    <th>{total_row_cnt - len(lastData) + i}</th>\n'
+                for data in r:
+                    rows += f'    <td>{data}</td>\n'
+                rows += '  </tr>\n'
+
+        html_template = (
+            '<div>\n'
+            '<table border="1" class="dataframe">\n'
+            '  <thead>\n'
+            '    <tr style="text-align: right;">\n'
+            f'{header}'
+            '    </tr>\n'
+            '  </thead>\n'
+            '  <tbody>\n'
+            f'{rows}'
+            '  </tbody>\n'
+            '</table>\n'
+            f'<p><i>{total_row_cnt} rows × {total_col_cnt} columns</i></p>\n'
+            '</div>'
+        )
+
+        return html_template
+
     def unique(self):
         """ removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement.
         Returns:

From 0b1e7677a6e6a93f4de812da3eb37c091689f490 Mon Sep 17 00:00:00 2001
From: KorlaMarch
Date: Thu, 27 Jan 2022 22:12:09 -0500
Subject: [PATCH 02/56] Add in takeLast operator

---
 tuplex/core/include/DataSet.h                 |  2 +
 .../include/logical/LogicalOperatorType.h     |  1 +
 .../core/include/logical/TakeLastOperator.h   | 51 ++++++++++++++
 tuplex/core/src/DataSet.cc                    | 24 +++++++
 tuplex/core/src/logical/TaskLastOperator.cc   | 43 ++++++++++++
 tuplex/core/src/physical/PhysicalPlan.cc      |  7 +-
 tuplex/python/include/PythonDataSet.h         |  1 +
 tuplex/python/src/PythonBindings.cc           |  1 +
 tuplex/python/src/PythonDataSet.cc            | 69 +++++++++++++++++++
 tuplex/python/tuplex/dataset.py               | 17 +++++
 10 files changed, 215 insertions(+), 1 deletion(-)
 create mode 100644 tuplex/core/include/logical/TakeLastOperator.h
 create mode 100644 tuplex/core/src/logical/TaskLastOperator.cc

diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h
index 899032723..429d8c6a7 100644
--- a/tuplex/core/include/DataSet.h
+++ b/tuplex/core/include/DataSet.h
@@ -269,6 +269,8 @@ namespace tuplex {
 
         virtual std::vector<Row> takeAsVector(int64_t numElements, std::ostream &os = std::cout);
 
+        virtual std::shared_ptr<ResultSet> takeLast(int64_t numElements, std::ostream &os = std::cout);
+
         /*!
          * saves dataset to file. There are multiple options to control the behavior
          * ==> 1.) files can be split across multiple ones. Specify number of files to split rows to
diff --git a/tuplex/core/include/logical/LogicalOperatorType.h b/tuplex/core/include/logical/LogicalOperatorType.h
index 594252820..b6a1c788b 100644
--- a/tuplex/core/include/logical/LogicalOperatorType.h
+++ b/tuplex/core/include/logical/LogicalOperatorType.h
@@ -17,6 +17,7 @@ namespace tuplex {
         MAP,
         FILTER,
         TAKE, // i.e. output to python / in memory
+        TAKELAST,
         PARALLELIZE, // i.e. input from python
         FILEINPUT,
         RESOLVE,
diff --git a/tuplex/core/include/logical/TakeLastOperator.h b/tuplex/core/include/logical/TakeLastOperator.h
new file mode 100644
index 000000000..28896e513
--- /dev/null
+++ b/tuplex/core/include/logical/TakeLastOperator.h
@@ -0,0 +1,51 @@
+//--------------------------------------------------------------------------------------------------------------------//
+//                                                                                                                      //
+//                                    Tuplex: Blazing Fast Python Data Science                                          //
+//                                                                                                                      //
+//                                                                                                                      //
+//  (c) 2017 - 2021, Tuplex team                                                                                        //
+//  Created by Leonhard Spiegelberg first on 1/1/2021                                                                   //
+//  License: Apache 2.0                                                                                                 //
+//--------------------------------------------------------------------------------------------------------------------//
+
+#ifndef TUPLEX_TAKELASTOPERATOR_H
+#define TUPLEX_TAKELASTOPERATOR_H
+
+
+#include "LogicalOperator.h"
+
+namespace tuplex {
+    class TakeLastOperator : public LogicalOperator {
+    private:
+        int64_t _limit;
+    public:
+        LogicalOperator *clone() override;
+
+    public:
+        TakeLastOperator(LogicalOperator *parent, const int64_t numElements);
+
+        std::string name() override {
+            if(_limit < 0 || std::numeric_limits<int64_t>::max() == _limit)
+                return "collect";
+            return "take";
+        }
+        LogicalOperatorType type() const override { return LogicalOperatorType::TAKELAST; }
+
+        bool isActionable() override { return true; }
+
+        bool isDataSource() override { return false; }
+
+        bool good() const override;
+
+        int64_t limit() { return _limit; }
+
+
+        std::vector<Row> getSample(const size_t num) const override;
+
+        Schema getInputSchema() const override { return getOutputSchema(); }
+
+        std::vector<std::string> columns() const override;
+    };
+}
+
+#endif //TUPLEX_TAKELASTOPERATOR_H
\ No newline at end of file
diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc
index a53a14094..66a6a548c 100644
--- a/tuplex/core/src/DataSet.cc
+++ b/tuplex/core/src/DataSet.cc
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <logical/TakeLastOperator.h>
 #include
 #include
 #include
@@ -102,6 +103,29 @@ namespace tuplex {
         return v;
     }
 
+    std::shared_ptr<ResultSet> DataSet::takeLast(int64_t numElements, std::ostream &os) {
+        // error dataset?
+        if (isError())
+            throw std::runtime_error("is error dataset!");
+
+        // negative numbers mean get all elements!
+        if (numElements < 0)
+            numElements = std::numeric_limits<int64_t>::max();
+
+        // create a take node
+        assert(_context);
+        LogicalOperator *op = _context->addOperator(new TakeLastOperator(this->_operator, numElements));
+        DataSet *dsptr = _context->createDataSet(op->getOutputSchema());
+        dsptr->_operator = op;
+        op->setDataSet(dsptr);
+
+        // perform action.
+        assert(this->_context);
+        auto rs = op->compute(*this->_context);
+
+        return rs;
+    }
+
     void DataSet::tofile(tuplex::FileFormat fmt, const tuplex::URI &uri,
                          const tuplex::UDF &udf, size_t fileCount, size_t shardSize,
                          const std::unordered_map<std::string, std::string> &outputOptions, size_t limit,
diff --git a/tuplex/core/src/logical/TaskLastOperator.cc b/tuplex/core/src/logical/TaskLastOperator.cc
new file mode 100644
index 000000000..92295efb3
--- /dev/null
+++ b/tuplex/core/src/logical/TaskLastOperator.cc
@@ -0,0 +1,43 @@
+//--------------------------------------------------------------------------------------------------------------------//
+//                                                                                                                      //
+//                                    Tuplex: Blazing Fast Python Data Science                                          //
+//                                                                                                                      //
+//                                                                                                                      //
+//  (c) 2017 - 2021, Tuplex team                                                                                        //
+//  Created by Leonhard Spiegelberg first on 1/1/2021                                                                   //
+//  License: Apache 2.0                                                                                                 //
+//--------------------------------------------------------------------------------------------------------------------//
+
+#include <logical/TakeLastOperator.h>
+#include
+
+namespace tuplex {
+    TakeLastOperator::TakeLastOperator(LogicalOperator *parent, const int64_t numElements) : LogicalOperator::LogicalOperator(parent), _limit(numElements) {
+        // take schema from parent node
+        setSchema(this->parent()->getOutputSchema());
+    }
+
+    bool TakeLastOperator::good() const {
+        return _limit >= -1;
+    }
+
+    std::vector<Row> TakeLastOperator::getSample(const size_t num) const {
+        // take sample from parent
+        return parent()->getSample(num);
+    }
+
+    std::vector<std::string> TakeLastOperator::columns() const {
+        assert(parent());
+        return parent()->columns();
+    }
+
+    LogicalOperator *TakeLastOperator::clone() {
+        // create clone of this operator
+        auto copy = new TakeLastOperator(parent()->clone(), _limit);
+
+        copy->setDataSet(getDataSet()); // weak ptr to old dataset...
+        copy->copyMembers(this);
+        assert(getID() == copy->getID());
+        return copy;
+    }
+}
\ No newline at end of file
diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc
index 2399edf6f..87a73a712 100644
--- a/tuplex/core/src/physical/PhysicalPlan.cc
+++ b/tuplex/core/src/physical/PhysicalPlan.cc
@@ -208,7 +208,9 @@ namespace tuplex {
         if(ops.back()->isActionable()) {
             if(ops.back()->type() == LogicalOperatorType::FILEOUTPUT)
                 outputMode = EndPointMode::FILE;
-            else if(ops.back()->type() == LogicalOperatorType::TAKE || ops.back()->type() == LogicalOperatorType::CACHE) {
+            else if(ops.back()->type() == LogicalOperatorType::TAKE ||
+                    ops.back()->type() == LogicalOperatorType::TAKELAST ||
+                    ops.back()->type() == LogicalOperatorType::CACHE) {
                 // memory?
                 outputMode = EndPointMode::MEMORY;
             } else
@@ -382,6 +384,9 @@ namespace tuplex {
         if(outputNode->type() == LogicalOperatorType::TAKE) {
             auto top = static_cast<TakeOperator*>(outputNode);
             builder.setOutputLimit(top->limit());
+        } else if (outputNode->type() == LogicalOperatorType::TAKELAST) {
+            auto top = static_cast<TakeLastOperator*>(outputNode);
+            builder.setOutputLimit(top->limit());
         }
 
         // @TODO: add slowPip builder to this process...
diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h
index 665d68856..58827ea33 100644
--- a/tuplex/python/include/PythonDataSet.h
+++ b/tuplex/python/include/PythonDataSet.h
@@ -78,6 +78,7 @@ namespace tuplex {
 
         py::object collect();
         py::object take(const int64_t numRows);
+        boost::python::object takeLast(const int64_t numRows);
         void show(const int64_t numRows=-1);
 
         // DataFrame like operations
diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc
index 6b3683853..4d0b1f4e9 100644
--- a/tuplex/python/src/PythonBindings.cc
+++ b/tuplex/python/src/PythonBindings.cc
@@ -43,6 +43,7 @@ PYMODULE {
             .def("show", &tuplex::PythonDataSet::show)
             .def("collect", &tuplex::PythonDataSet::collect)
             .def("take", &tuplex::PythonDataSet::take)
+            .def("takeLast", &tuplex::PythonDataSet::takeLast)
             .def("map", &tuplex::PythonDataSet::map)
             .def("resolve", &tuplex::PythonDataSet::resolve)
             .def("ignore", &tuplex::PythonDataSet::ignore)
diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc
index 36f9a392b..2e54deec5 100644
--- a/tuplex/python/src/PythonDataSet.cc
+++ b/tuplex/python/src/PythonDataSet.cc
@@ -176,6 +176,75 @@ namespace tuplex {
         }
     }
 
+    boost::python::object PythonDataSet::takeLast(const int64_t numRows) {
+        // make sure a dataset is wrapped
+        assert(this->_dataset);
+
+        // is callee error dataset? if so return list with error string
+        if (this->_dataset->isError()) {
+            ErrorDataSet *eds = static_cast<ErrorDataSet*>(this->_dataset);
+            boost::python::list L;
+            L.append(eds->getError());
+            // Logger::instance().flushAll();
+            Logger::instance().flushToPython();
+            return L;
+        } else {
+            std::stringstream ss;
+
+            // release GIL & hand over everything to Tuplex
+            assert(PyGILState_Check()); // make sure this thread holds the GIL!
+            python::unlockGIL();
+
+            std::shared_ptr<ResultSet> rs;
+            std::string err_message = "";
+            try {
+                rs = _dataset->takeLast(numRows, ss);
+                if(!rs)
+                    throw std::runtime_error("invalid result set");
+                // if there are more than 1 million (100k in debug mode) elements print message...
+                if (rs->rowCount() > LARGE_RESULT_SIZE)
+                    Logger::instance().logger("python").info("transferring "
+                                                             + std::to_string(rs->rowCount()) +
+                                                             " elements back to Python. This might take a while...");
+            } catch(const std::exception& e) {
+                err_message = e.what();
+                Logger::instance().defaultLogger().error(err_message);
+            } catch(...) {
+                err_message = "unknown C++ exception occurred, please change type.";
+                Logger::instance().defaultLogger().error(err_message);
+            }
+
+            // reacquire GIL
+            python::lockGIL();
+
+            // error? then return list of error string
+            if(!rs || !err_message.empty()) {
+                // Logger::instance().flushAll();
+                Logger::instance().flushToPython();
+                auto listObj = PyList_New(1);
+                PyList_SetItem(listObj, 0, python::PyString_FromString(err_message.c_str()));
+                auto list = boost::python::object(boost::python::borrowed<>(listObj));
+                return list;
+            }
+
+            // collect results & transfer them back to python
+            // new version, directly interact with the interpreter
+            Timer timer;
+            // build python list object from resultset
+            auto listObj = resultSetToCPython(rs.get(), numRows);
+            Logger::instance().logger("python").info("Data transfer back to python took "
+                                                     + std::to_string(timer.time()) + " seconds");
+            // Logger::instance().flushAll();
+            Logger::instance().flushToPython();
+
+            // print errors
+            if (ss.str().length() > 0)
+                PySys_FormatStdout("%s", ss.str().c_str());
+
+            return boost::python::object(boost::python::handle<>(listObj));
+        }
+    }
+
     PythonDataSet PythonDataSet::map(const std::string &lambda_code, const std::string &pickled_code,
                                      const py::object& closure) {
         auto& logger = Logger::instance().logger("python");
diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py
index aa5b1ca12..a1d838526 100644
--- a/tuplex/python/tuplex/dataset.py
+++ b/tuplex/python/tuplex/dataset.py
@@ -208,6 +208,23 @@ def take(self, nrows=5):
 
         return self._dataSet.take(nrows)
 
+    def takeLast(self, nrows=5):
+        """ action that generates a physical plan, processes data and collects the last ``nrows`` rows as a list of tuples.
+
+        Args:
+            nrows (int): number of rows to collect. Per default ``5``.
+        Returns:
+            (list): A list of tuples
+
+        """
+
+        assert isinstance(nrows, int), 'num rows must be an integer'
+        assert nrows > 0, 'please specify a number greater than zero'
+
+        assert self._dataSet is not None, 'internal API error, datasets must be created via context objects'
+
+        return self._dataSet.takeLast(nrows)
+
     def show(self, nrows=None):
         """ action that generates a physical plan, processes data and prints results as nicely formatted
         ASCII table to stdout.
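At this point the new action is wired through every layer: a TAKELAST logical operator, a DataSet::takeLast() action in the C++ core, a boost::python binding, and a takeLast() method on the Python DataSet. The sketch below illustrates the intended Python-facing semantics only; it is a hypothetical session (names and values invented, assuming a working build), not code from the series, and takeLast is expected to agree with collect()[-n:]:

    from tuplex import Context

    ctx = Context()
    ds = ctx.parallelize([1, 2, 3, 4, 5, 6])

    ds.take(2)        # first rows: [1, 2]
    ds.takeLast(2)    # intended: same as ds.collect()[-2:], i.e. [5, 6]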
From eafb76d2a4e6461a3c2e035c8b0e5abed76d9a7e Mon Sep 17 00:00:00 2001
From: KorlaMarch
Date: Thu, 27 Jan 2022 22:25:39 -0500
Subject: [PATCH 03/56] (wip) add reverse limit in partition

---
 tuplex/core/include/Partition.h            | 15 +++++++
 tuplex/core/src/physical/TransformStage.cc | 51 ++++++++++++++------
 2 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/tuplex/core/include/Partition.h b/tuplex/core/include/Partition.h
index 9bc7fc54c..5a66023fd 100644
--- a/tuplex/core/include/Partition.h
+++ b/tuplex/core/include/Partition.h
@@ -248,6 +248,21 @@ namespace tuplex {
             _mutex.unlock();
         }
 
+        void setNumLastRows(const size_t numRows) {
+            // TODO: set another value instead
+            _mutex.lock();
+
+            _numRows = numRows;
+
+            // save to memptr
+            if(_arena) {
+                *((int64_t*)_arena) = numRows;
+            }
+
+            _mutex.unlock();
+        }
+
+
         int64_t getDataSetID() const { return _dataSetID; }
 
diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc
index b61f9cbe2..9cd15694a 100644
--- a/tuplex/core/src/physical/TransformStage.cc
+++ b/tuplex/core/src/physical/TransformStage.cc
@@ -139,21 +139,46 @@ namespace tuplex {
         }
 
         // check output limit, adjust partitions if necessary
-        size_t numOutputRows = 0;
-        for (auto partition : partitions) {
-            numOutputRows += partition->getNumRows();
-            if (numOutputRows >= outputLimit()) {
-                // clip last partition & leave loop
-                auto clipped = outputLimit() - (numOutputRows - partition->getNumRows());
-                assert(clipped <= partition->getNumRows());
-                partition->setNumRows(clipped);
-                if (clipped > 0)
-                    limitedPartitions.push_back(partition);
-                break;
-            } else {
-                // put full partition to output set
-                limitedPartitions.push_back(partition);
-            }
-        }
+        // TODO: add reverse outputLimit condition here
+        if (true) {
+            size_t numOutputRows = 0;
+            for (auto partition : partitions) {
+                numOutputRows += partition->getNumRows();
+                if (numOutputRows >= outputLimit()) {
+                    // clip last partition & leave loop
+                    auto clipped = outputLimit() - (numOutputRows - partition->getNumRows());
+                    assert(clipped <= partition->getNumRows());
+                    partition->setNumRows(clipped);
+                    if (clipped > 0)
+                        limitedPartitions.push_back(partition);
+                    break;
+                } else {
+                    // put full partition to output set
+                    limitedPartitions.push_back(partition);
+                }
+            }
+        } else {
+            size_t numOutputRows = 0;
+            for (auto partitionIt = partitions.rbegin();
+                 partitionIt != partitions.rend(); partitionIt++) {
+                numOutputRows += (*partitionIt)->getNumRows();
+                if (numOutputRows >= outputLimit()) {
+                    // clip last partition & leave loop
+                    auto clipped = outputLimit() - (numOutputRows - (*partitionIt)->getNumRows());
+                    assert(clipped <= (*partitionIt)->getNumRows());
+
+                    // TODO: do backward clip here instead
+                    (*partitionIt)->setNumRows(clipped);
+                    if (clipped > 0)
+                        limitedPartitions.push_back(*partitionIt);
+                    break;
+                } else {
+                    // put full partition to output set
+                    limitedPartitions.push_back(*partitionIt);
+                }
+            }
+
+            std::reverse(limitedPartitions.begin(), limitedPartitions.end());
+        }
     }

From cb47a4da16279cbee77a76664d4702f5e38f7b5d Mon Sep 17 00:00:00 2001
From: korlamarch
Date: Fri, 11 Feb 2022 09:20:06 -0500
Subject: [PATCH 04/56] Remove row count

---
 tuplex/python/tuplex/dataset.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py
index a1d838526..976a751f4 100644
--- a/tuplex/python/tuplex/dataset.py
+++ b/tuplex/python/tuplex/dataset.py
@@ -28,19 +28,19 @@ class DataSet:
     def __init__(self):
         self._dataSet = None
 
-    def getDataLen(self):
+    def getColumnSize(self):
         data = self.collect()
         if len(data) == 0:
             return 0, 0
         else:
-            return len(data), len(data[0])
+            return len(data[0])
 
     def revTake(self, nRows = 5):
         return self.collect()[-nRows:]
 
     def _repr_html_(self):
         rows_list = self.take()
-        total_row_cnt, total_col_cnt = self.getDataLen()
+        total_col_cnt = self.getColumnSize()
         print('rowlist')
         print(rows_list)
         if len(rows_list) == 0:
@@ -74,7 +74,7 @@ def _repr_html_(self):
         lastData = self.revTake()
         for i, r in enumerate(lastData):
             rows += '  <tr>\n'
-            rows += f'    <th>{total_row_cnt - len(lastData) + i}</th>\n'
+            rows += f'    <th>{0 - len(lastData) + i}</th>\n'
             for data in r:
                 rows += f'    <td>{data}</td>\n'
             rows += '  </tr>\n'
@@ -104,7 +104,7 @@ def _repr_html_(self):
             f'{rows}'
             '  </tbody>\n'
             '</table>\n'
-            f'<p><i>{total_row_cnt} rows × {total_col_cnt} columns</i></p>\n'
+            f'<p><i>{total_col_cnt} columns</i></p>\n'
             '</div>'
         )
 

From d879bcd0a9d95e4f3e9812635c44769dfcc8e74e Mon Sep 17 00:00:00 2001
From: korlamarch
Date: Tue, 15 Feb 2022 23:30:47 -0500
Subject: [PATCH 05/56] refactor TakeOperator

---
 tuplex/core/include/DataSet.h                     |  3 +-
 tuplex/core/include/EmptyDataset.h                |  2 +-
 tuplex/core/include/ErrorDataSet.h                |  2 +-
 .../include/logical/LogicalOperatorType.h         |  1 -
 .../core/include/logical/TakeLastOperator.h       | 51 ------------
 tuplex/core/include/logical/TakeOperator.h        | 10 ++-
 tuplex/core/src/DataSet.cc                        | 36 ++-------
 tuplex/core/src/EmptyDataset.cc                   |  4 +-
 tuplex/core/src/ErrorDataSet.cc                   |  4 +-
 tuplex/core/src/logical/TakeOperator.cc           |  6 +-
 tuplex/core/src/logical/TaskLastOperator.cc       | 43 -----------
 tuplex/core/src/physical/PhysicalPlan.cc          |  4 -
 tuplex/core/src/physical/StageBuilder.cc          |  3 +-
 tuplex/core/src/physical/TransformStage.cc        | 51 ++++---------
 tuplex/python/include/PythonDataSet.h             |  3 +-
 tuplex/python/src/PythonBindings.cc               |  1 -
 tuplex/python/src/PythonDataSet.cc                | 73 +-----------------
 tuplex/python/tuplex/dataset.py                   | 23 +-----
 18 files changed, 44 insertions(+), 276 deletions(-)
 delete mode 100644 tuplex/core/include/logical/TakeLastOperator.h
 delete mode 100644 tuplex/core/src/logical/TaskLastOperator.cc

diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h
index 429d8c6a7..65a766a87 100644
--- a/tuplex/core/include/DataSet.h
+++ b/tuplex/core/include/DataSet.h
@@ -263,13 +263,12 @@ namespace tuplex {
         // these are actions that cause execution
         virtual std::shared_ptr<ResultSet> collect(std::ostream &os = std::cout);
 
-        virtual std::shared_ptr<ResultSet> take(int64_t numElements, std::ostream &os = std::cout);
+        virtual std::shared_ptr<ResultSet> take(int64_t numTop, int64_t numBottom, std::ostream &os = std::cout);
 
         virtual std::vector<Row> collectAsVector(std::ostream &os = std::cout);
 
         virtual std::vector<Row> takeAsVector(int64_t numElements, std::ostream &os = std::cout);
 
-        virtual std::shared_ptr<ResultSet> takeLast(int64_t numElements, std::ostream &os = std::cout);
 
         /*!
         * saves dataset to file. There are multiple options to control the behavior
diff --git a/tuplex/core/include/EmptyDataset.h b/tuplex/core/include/EmptyDataset.h
index b3c1ed7af..0f8a1f52c 100644
--- a/tuplex/core/include/EmptyDataset.h
+++ b/tuplex/core/include/EmptyDataset.h
@@ -70,7 +70,7 @@ namespace tuplex {
         virtual std::shared_ptr<ResultSet> collect(std::ostream& os) override;
 
         // take / collect will print out the error only
-        virtual std::shared_ptr<ResultSet> take(int64_t numElements, std::ostream& os) override;
+        virtual std::shared_ptr<ResultSet> take(int64_t numTop, int64_t numBottom, std::ostream& os) override;
 
         //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override;
         virtual std::vector<Row> collectAsVector(std::ostream& os) override;
diff --git a/tuplex/core/include/ErrorDataSet.h b/tuplex/core/include/ErrorDataSet.h
index 2f46d8638..34fc60685 100644
--- a/tuplex/core/include/ErrorDataSet.h
+++ b/tuplex/core/include/ErrorDataSet.h
@@ -90,7 +90,7 @@ namespace tuplex {
         std::shared_ptr<ResultSet> collect(std::ostream& os) override;
 
         // take / collect will print out the error only
-        std::shared_ptr<ResultSet> take(int64_t numElements, std::ostream& os) override;
+        std::shared_ptr<ResultSet> take(int64_t numTop, int64_t numBottom, std::ostream& os) override;
 
         //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override;
         std::vector<Row> collectAsVector(std::ostream& os) override;
diff --git a/tuplex/core/include/logical/LogicalOperatorType.h b/tuplex/core/include/logical/LogicalOperatorType.h
index b6a1c788b..594252820 100644
--- a/tuplex/core/include/logical/LogicalOperatorType.h
+++ b/tuplex/core/include/logical/LogicalOperatorType.h
@@ -17,7 +17,6 @@ namespace tuplex {
         MAP,
         FILTER,
         TAKE, // i.e. output to python / in memory
-        TAKELAST,
         PARALLELIZE, // i.e. input from python
         FILEINPUT,
         RESOLVE,
diff --git a/tuplex/core/include/logical/TakeLastOperator.h b/tuplex/core/include/logical/TakeLastOperator.h
deleted file mode 100644
index 28896e513..000000000
--- a/tuplex/core/include/logical/TakeLastOperator.h
+++ /dev/null
@@ -1,51 +0,0 @@
-//--------------------------------------------------------------------------------------------------------------------//
-//                                                                                                                      //
-//                                    Tuplex: Blazing Fast Python Data Science                                          //
-//                                                                                                                      //
-//                                                                                                                      //
-//  (c) 2017 - 2021, Tuplex team                                                                                        //
-//  Created by Leonhard Spiegelberg first on 1/1/2021                                                                   //
-//  License: Apache 2.0                                                                                                 //
-//--------------------------------------------------------------------------------------------------------------------//
-
-#ifndef TUPLEX_TAKELASTOPERATOR_H
-#define TUPLEX_TAKELASTOPERATOR_H
-
-
-#include "LogicalOperator.h"
-
-namespace tuplex {
-    class TakeLastOperator : public LogicalOperator {
-    private:
-        int64_t _limit;
-    public:
-        LogicalOperator *clone() override;
-
-    public:
-        TakeLastOperator(LogicalOperator *parent, const int64_t numElements);
-
-        std::string name() override {
-            if(_limit < 0 || std::numeric_limits<int64_t>::max() == _limit)
-                return "collect";
-            return "take";
-        }
-        LogicalOperatorType type() const override { return LogicalOperatorType::TAKELAST; }
-
-        bool isActionable() override { return true; }
-
-        bool isDataSource() override { return false; }
-
-        bool good() const override;
-
-        int64_t limit() { return _limit; }
-
-
-        std::vector<Row> getSample(const size_t num) const override;
-
-        Schema getInputSchema() const override { return getOutputSchema(); }
-
-        std::vector<std::string> columns() const override;
-    };
-}
-
-#endif //TUPLEX_TAKELASTOPERATOR_H
\ No newline at end of file
diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h
index 8d0d6dcab..20c035a74 100644
--- a/tuplex/core/include/logical/TakeOperator.h
+++ b/tuplex/core/include/logical/TakeOperator.h
@@ -17,15 +17,16 @@ namespace tuplex {
 
     class TakeOperator : public LogicalOperator {
     private:
-        int64_t _limit;
+        int64_t _limitTop;
+        int64_t _limitBottom;
    public:
         LogicalOperator *clone() override;
 
     public:
-        TakeOperator(LogicalOperator *parent, const int64_t numElements);
+        TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom);
 
         std::string name() override {
-            if(_limit < 0 || std::numeric_limits<int64_t>::max() == _limit)
+            if(_limitTop < 0 || std::numeric_limits<int64_t>::max() == _limitTop)
                 return "collect";
             return "take";
         }
@@ -37,8 +38,9 @@ namespace tuplex {
 
         bool good() const override;
 
-        int64_t limit() { return _limit; }
+        int64_t limit() { return _limitTop; }
 
+        bool limitBottom() { return _limitBottom; }
 
         std::vector<Row> getSample(const size_t num) const override;
 
diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc
index 66a6a548c..3de903d1c 100644
--- a/tuplex/core/src/DataSet.cc
+++ b/tuplex/core/src/DataSet.cc
@@ -12,7 +12,6 @@
 #include
 #include
 #include
-#include <logical/TakeLastOperator.h>
 #include
 #include
 #include
@@ -39,21 +38,21 @@ namespace tuplex {
     }
 
     std::shared_ptr<ResultSet> DataSet::collect(std::ostream &os) {
-        return take(-1, os);
+        return take(-1, false, os);
     }
 
-    std::shared_ptr<ResultSet> DataSet::take(int64_t numElements, std::ostream &os) {
+    std::shared_ptr<ResultSet> DataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) {
         // error dataset?
         if (isError())
             throw std::runtime_error("is error dataset!");
 
         // negative numbers mean get all elements!
-        if (numElements < 0)
-            numElements = std::numeric_limits<int64_t>::max();
+        if (numTop < 0)
+            numTop = std::numeric_limits<int64_t>::max();
 
         // create a take node
         assert(_context);
-        LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, numElements));
+        LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, numTop, numBottom));
         DataSet *dsptr = _context->createDataSet(op->getOutputSchema());
         dsptr->_operator = op;
         op->setDataSet(dsptr);
@@ -72,7 +71,7 @@ namespace tuplex {
 
     // -1 means to retrieve all elements
     std::vector<Row> DataSet::takeAsVector(int64_t numElements, std::ostream &os) {
-        auto rs = take(numElements, os);
+        auto rs = take(numElements, false, os);
 
         Timer timer;
 #warning "limiting should make this hack irrelevant..."
@@ -102,29 +102,6 @@ namespace tuplex {
         return v;
     }
 
-    std::shared_ptr<ResultSet> DataSet::takeLast(int64_t numElements, std::ostream &os) {
-        // error dataset?
-        if (isError())
-            throw std::runtime_error("is error dataset!");
-
-        // negative numbers mean get all elements!
-        if (numElements < 0)
-            numElements = std::numeric_limits<int64_t>::max();
-
-        // create a take node
-        assert(_context);
-        LogicalOperator *op = _context->addOperator(new TakeLastOperator(this->_operator, numElements));
-        DataSet *dsptr = _context->createDataSet(op->getOutputSchema());
-        dsptr->_operator = op;
-        op->setDataSet(dsptr);
-
-        // perform action.
-        assert(this->_context);
-        auto rs = op->compute(*this->_context);
-
-        return rs;
-    }
-
     void DataSet::tofile(tuplex::FileFormat fmt, const tuplex::URI &uri,
                          const tuplex::UDF &udf, size_t fileCount, size_t shardSize,
                          const std::unordered_map<std::string, std::string> &outputOptions, size_t limit,
diff --git a/tuplex/core/src/EmptyDataset.cc b/tuplex/core/src/EmptyDataset.cc
index 984fa904f..7504e8499 100644
--- a/tuplex/core/src/EmptyDataset.cc
+++ b/tuplex/core/src/EmptyDataset.cc
@@ -11,7 +11,7 @@
 #include
 
 namespace tuplex {
-    std::shared_ptr<ResultSet> EmptyDataset::take(int64_t numElements, std::ostream &os) {
+    std::shared_ptr<ResultSet> EmptyDataset::take(int64_t numTop, int64_t numBottom, std::ostream &os) {
         return std::make_shared<ResultSet>();
     }
 
@@ -20,7 +20,7 @@ namespace tuplex {
     }
 
     std::shared_ptr<ResultSet> EmptyDataset::collect(std::ostream &os) {
-        return take(0, os);
+        return take(0, false, os);
     }
 
     std::vector<Row> EmptyDataset::collectAsVector(std::ostream &os) {
diff --git a/tuplex/core/src/ErrorDataSet.cc b/tuplex/core/src/ErrorDataSet.cc
index 57c03ffba..9d19594f2 100644
--- a/tuplex/core/src/ErrorDataSet.cc
+++ b/tuplex/core/src/ErrorDataSet.cc
@@ -23,7 +23,7 @@ namespace tuplex {
         return takeAsVector(0, os);
     }
 
-    std::shared_ptr<ResultSet> ErrorDataSet::take(int64_t numElements, std::ostream &os) {
+    std::shared_ptr<ResultSet> ErrorDataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) {
 
         // return empty vector and print err message
         Logger::instance().logger("core").error(this->_error);
@@ -31,7 +31,7 @@ namespace tuplex {
     }
 
     std::shared_ptr<ResultSet> ErrorDataSet::collect(std::ostream &os) {
-        return take(0, os);
+        return take(0, false, os);
     }
 
     void
diff --git a/tuplex/core/src/logical/TakeOperator.cc b/tuplex/core/src/logical/TakeOperator.cc
index aa7c49668..e588b5e97 100644
--- a/tuplex/core/src/logical/TakeOperator.cc
+++ b/tuplex/core/src/logical/TakeOperator.cc
@@ -12,13 +12,13 @@
 #include
 
 namespace tuplex {
-    TakeOperator::TakeOperator(LogicalOperator *parent, const int64_t numElements) : LogicalOperator::LogicalOperator(parent), _limit(numElements) {
+    TakeOperator::TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom) : LogicalOperator::LogicalOperator(parent), _limitTop(numTop), _limitBottom(numBottom) {
         // take schema from parent node
         setSchema(this->parent()->getOutputSchema());
     }
 
     bool TakeOperator::good() const {
-        return _limit >= -1;
+        return _limitTop >= -1 && _limitBottom >= -1;
     }
 
     std::vector<Row> TakeOperator::getSample(const size_t num) const {
@@ -33,7 +33,7 @@ namespace tuplex {
 
     LogicalOperator *TakeOperator::clone() {
         // create clone of this operator
-        auto copy = new TakeOperator(parent()->clone(), _limit);
+        auto copy = new TakeOperator(parent()->clone(), _limitTop, _limitBottom);
 
         copy->setDataSet(getDataSet()); // weak ptr to old dataset...
        copy->copyMembers(this);
         assert(getID() == copy->getID());
         return copy;
     }
}
diff --git a/tuplex/core/src/logical/TaskLastOperator.cc b/tuplex/core/src/logical/TaskLastOperator.cc
deleted file mode 100644
index 92295efb3..000000000
--- a/tuplex/core/src/logical/TaskLastOperator.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-//--------------------------------------------------------------------------------------------------------------------//
-//                                                                                                                      //
-//                                    Tuplex: Blazing Fast Python Data Science                                          //
-//                                                                                                                      //
-//                                                                                                                      //
-//  (c) 2017 - 2021, Tuplex team                                                                                        //
-//  Created by Leonhard Spiegelberg first on 1/1/2021                                                                   //
-//  License: Apache 2.0                                                                                                 //
-//--------------------------------------------------------------------------------------------------------------------//
-
-#include <logical/TakeLastOperator.h>
-#include
-
-namespace tuplex {
-    TakeLastOperator::TakeLastOperator(LogicalOperator *parent, const int64_t numElements) : LogicalOperator::LogicalOperator(parent), _limit(numElements) {
-        // take schema from parent node
-        setSchema(this->parent()->getOutputSchema());
-    }
-
-    bool TakeLastOperator::good() const {
-        return _limit >= -1;
-    }
-
-    std::vector<Row> TakeLastOperator::getSample(const size_t num) const {
-        // take sample from parent
-        return parent()->getSample(num);
-    }
-
-    std::vector<std::string> TakeLastOperator::columns() const {
-        assert(parent());
-        return parent()->columns();
-    }
-
-    LogicalOperator *TakeLastOperator::clone() {
-        // create clone of this operator
-        auto copy = new TakeLastOperator(parent()->clone(), _limit);
-
-        copy->setDataSet(getDataSet()); // weak ptr to old dataset...
-        copy->copyMembers(this);
-        assert(getID() == copy->getID());
-        return copy;
-    }
-}
\ No newline at end of file
diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc
index 87a73a712..17a4c7c0e 100644
--- a/tuplex/core/src/physical/PhysicalPlan.cc
+++ b/tuplex/core/src/physical/PhysicalPlan.cc
@@ -209,7 +209,6 @@ namespace tuplex {
             if(ops.back()->type() == LogicalOperatorType::FILEOUTPUT)
                 outputMode = EndPointMode::FILE;
             else if(ops.back()->type() == LogicalOperatorType::TAKE ||
-                    ops.back()->type() == LogicalOperatorType::TAKELAST ||
                     ops.back()->type() == LogicalOperatorType::CACHE) {
                 // memory?
                 outputMode = EndPointMode::MEMORY;
             } else
@@ -384,9 +383,6 @@ namespace tuplex {
         if(outputNode->type() == LogicalOperatorType::TAKE) {
             auto top = static_cast<TakeOperator*>(outputNode);
             builder.setOutputLimit(top->limit());
-        } else if (outputNode->type() == LogicalOperatorType::TAKELAST) {
-            auto top = static_cast<TakeLastOperator*>(outputNode);
-            builder.setOutputLimit(top->limit());
         }
 
         // @TODO: add slowPip builder to this process...
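After this refactor a single TAKE operator carries both a head and a tail limit, and the PhysicalPlan change above hands the pair to the stage builder. The intended end-to-end behavior, mirroring the unit tests added in PATCH 06 below, can be sketched in Python as follows (illustrative values only, assuming a Context named ctx; not code from the patch):

    from tuplex import Context

    ctx = Context()
    ds = ctx.parallelize([1, 2, 3, 4, 5, 6])

    ds.take(3)       # head only:     [1, 2, 3]
    ds.take(1, 1)    # head and tail: [1, 6]
    ds.take(2, 3)    # head and tail: [1, 2, 4, 5, 6]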
diff --git a/tuplex/core/src/physical/StageBuilder.cc b/tuplex/core/src/physical/StageBuilder.cc
index 72f01e2b8..0bf509ed1 100644
--- a/tuplex/core/src/physical/StageBuilder.cc
+++ b/tuplex/core/src/physical/StageBuilder.cc
@@ -457,7 +457,8 @@ namespace tuplex {
                     break;
                 }
                 case LogicalOperatorType::TAKE: {
-                    opt_ops.push_back(new TakeOperator(lastParent, dynamic_cast<TakeOperator*>(node)->limit()));
+                    auto takeOp = dynamic_cast<TakeOperator*>(node);
+                    opt_ops.push_back(new TakeOperator(lastParent, takeOp->limit(), takeOp->limitBottom()));
                     opt_ops.back()->setID(node->getID());
                     break;
                 }
diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc
index 9cd15694a..b61f9cbe2 100644
--- a/tuplex/core/src/physical/TransformStage.cc
+++ b/tuplex/core/src/physical/TransformStage.cc
@@ -139,46 +139,21 @@ namespace tuplex {
         }
 
         // check output limit, adjust partitions if necessary
-        // TODO: add reverse outputLimit condition here
-        if (true) {
-            size_t numOutputRows = 0;
-            for (auto partition : partitions) {
-                numOutputRows += partition->getNumRows();
-                if (numOutputRows >= outputLimit()) {
-                    // clip last partition & leave loop
-                    auto clipped = outputLimit() - (numOutputRows - partition->getNumRows());
-                    assert(clipped <= partition->getNumRows());
-                    partition->setNumRows(clipped);
-                    if (clipped > 0)
-                        limitedPartitions.push_back(partition);
-                    break;
-                } else {
-                    // put full partition to output set
-                    limitedPartitions.push_back(partition);
-                }
-            }
-        } else {
-            size_t numOutputRows = 0;
-            for (auto partitionIt = partitions.rbegin();
-                 partitionIt != partitions.rend(); partitionIt++) {
-                numOutputRows += (*partitionIt)->getNumRows();
-                if (numOutputRows >= outputLimit()) {
-                    // clip last partition & leave loop
-                    auto clipped = outputLimit() - (numOutputRows - (*partitionIt)->getNumRows());
-                    assert(clipped <= (*partitionIt)->getNumRows());
-
-                    // TODO: do backward clip here instead
-                    (*partitionIt)->setNumRows(clipped);
-                    if (clipped > 0)
-                        limitedPartitions.push_back(*partitionIt);
-                    break;
-                } else {
-                    // put full partition to output set
-                    limitedPartitions.push_back(*partitionIt);
-                }
-            }
-
-            std::reverse(limitedPartitions.begin(), limitedPartitions.end());
-        }
+        size_t numOutputRows = 0;
+        for (auto partition : partitions) {
+            numOutputRows += partition->getNumRows();
+            if (numOutputRows >= outputLimit()) {
+                // clip last partition & leave loop
+                auto clipped = outputLimit() - (numOutputRows - partition->getNumRows());
+                assert(clipped <= partition->getNumRows());
+                partition->setNumRows(clipped);
+                if (clipped > 0)
+                    limitedPartitions.push_back(partition);
+                break;
+            } else {
+                // put full partition to output set
+                limitedPartitions.push_back(partition);
+            }
+        }
     }
 
diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h
index 58827ea33..23b09314d 100644
--- a/tuplex/python/include/PythonDataSet.h
+++ b/tuplex/python/include/PythonDataSet.h
@@ -77,8 +77,7 @@ namespace tuplex {
         PythonDataSet resolve(const int64_t exceptionCode, const std::string& lambda_code,
                               const std::string& pickled_code, const py::object& closure=py::object());
         py::object collect();
-        py::object take(const int64_t numRows);
-        boost::python::object takeLast(const int64_t numRows);
+        py::object take(const int64_t numTop, const int64_t numBottom);
         void show(const int64_t numRows=-1);
 
         // DataFrame like operations
diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc
index 4d0b1f4e9..6b3683853 100644
--- a/tuplex/python/src/PythonBindings.cc
+++ b/tuplex/python/src/PythonBindings.cc
@@ -43,7 +43,6 @@ PYMODULE {
             .def("show", &tuplex::PythonDataSet::show)
             .def("collect", &tuplex::PythonDataSet::collect)
             .def("take", &tuplex::PythonDataSet::take)
-            .def("takeLast", &tuplex::PythonDataSet::takeLast)
             .def("map", &tuplex::PythonDataSet::map)
             .def("resolve", &tuplex::PythonDataSet::resolve)
             .def("ignore", &tuplex::PythonDataSet::ignore)
diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc
index 2e54deec5..853b910db 100644
--- a/tuplex/python/src/PythonDataSet.cc
+++ b/tuplex/python/src/PythonDataSet.cc
@@ -107,7 +107,7 @@ namespace tuplex {
         }
     }
 
-    py::object PythonDataSet::take(const int64_t numRows) {
+    py::object PythonDataSet::take(const int64_t numTop, const int64_t numBottom) {
         // make sure a dataset is wrapped
         assert(this->_dataset);
 
@@ -162,7 +162,7 @@ namespace tuplex {
             // new version, directly interact with the interpreter
             Timer timer;
             // build python list object from resultset
-            auto listObj = resultSetToCPython(rs.get(), numRows);
+            auto listObj = resultSetToCPython(rs.get(), numTop);
             Logger::instance().logger("python").info("Data transfer back to python took "
                                                      + std::to_string(timer.time()) + " seconds");
             // Logger::instance().flushAll();
@@ -176,75 +176,6 @@ namespace tuplex {
         }
     }
 
-    boost::python::object PythonDataSet::takeLast(const int64_t numRows) {
-        // make sure a dataset is wrapped
-        assert(this->_dataset);
-
-        // is callee error dataset? if so return list with error string
-        if (this->_dataset->isError()) {
-            ErrorDataSet *eds = static_cast<ErrorDataSet*>(this->_dataset);
-            boost::python::list L;
-            L.append(eds->getError());
-            // Logger::instance().flushAll();
-            Logger::instance().flushToPython();
-            return L;
-        } else {
-            std::stringstream ss;
-
-            // release GIL & hand over everything to Tuplex
-            assert(PyGILState_Check()); // make sure this thread holds the GIL!
-            python::unlockGIL();
-
-            std::shared_ptr<ResultSet> rs;
-            std::string err_message = "";
-            try {
-                rs = _dataset->takeLast(numRows, ss);
-                if(!rs)
-                    throw std::runtime_error("invalid result set");
-                // if there are more than 1 million (100k in debug mode) elements print message...
-                if (rs->rowCount() > LARGE_RESULT_SIZE)
-                    Logger::instance().logger("python").info("transferring "
-                                                             + std::to_string(rs->rowCount()) +
-                                                             " elements back to Python. This might take a while...");
-            } catch(const std::exception& e) {
-                err_message = e.what();
-                Logger::instance().defaultLogger().error(err_message);
-            } catch(...) {
-                err_message = "unknown C++ exception occurred, please change type.";
-                Logger::instance().defaultLogger().error(err_message);
-            }
-
-            // reacquire GIL
-            python::lockGIL();
-
-            // error? then return list of error string
-            if(!rs || !err_message.empty()) {
-                // Logger::instance().flushAll();
-                Logger::instance().flushToPython();
-                auto listObj = PyList_New(1);
-                PyList_SetItem(listObj, 0, python::PyString_FromString(err_message.c_str()));
-                auto list = boost::python::object(boost::python::borrowed<>(listObj));
-                return list;
-            }
-
-            // collect results & transfer them back to python
-            // new version, directly interact with the interpreter
-            Timer timer;
-            // build python list object from resultset
-            auto listObj = resultSetToCPython(rs.get(), numRows);
-            Logger::instance().logger("python").info("Data transfer back to python took "
-                                                     + std::to_string(timer.time()) + " seconds");
-            // Logger::instance().flushAll();
-            Logger::instance().flushToPython();
-
-            // print errors
-            if (ss.str().length() > 0)
-                PySys_FormatStdout("%s", ss.str().c_str());
-
-            return boost::python::object(boost::python::handle<>(listObj));
-        }
-    }
-
     PythonDataSet PythonDataSet::map(const std::string &lambda_code, const std::string &pickled_code,
                                      const py::object& closure) {
         auto& logger = Logger::instance().logger("python");
diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py
index a1d838526..1046505f2 100644
--- a/tuplex/python/tuplex/dataset.py
+++ b/tuplex/python/tuplex/dataset.py
@@ -191,7 +191,7 @@ def collect(self):
         assert self._dataSet is not None, 'internal API error, datasets must be created via context objects'
         return self._dataSet.collect()
 
-    def take(self, nrows=5):
+    def take(self, nrows=5, nbottom=0):
         """ action that generates a physical plan, processes data and collects the top results then as list of tuples.
 
         Args:
@@ -203,27 +203,12 @@ def take(self, nrows=5):
 
         assert isinstance(nrows, int), 'num rows must be an integer'
         assert nrows > 0, 'please specify a number greater than zero'
+        assert isinstance(nbottom, int), 'num bottom last must be an integer'
+        assert nbottom >= 0, 'please specify a number greater or equal to zero'
 
         assert self._dataSet is not None, 'internal API error, datasets must be created via context objects'
 
-        return self._dataSet.take(nrows)
-
-    def takeLast(self, nrows=5):
-        """ action that generates a physical plan, processes data and collects the last ``nrows`` rows as a list of tuples.
-
-        Args:
-            nrows (int): number of rows to collect. Per default ``5``.
-        Returns:
-            (list): A list of tuples
-
-        """
-
-        assert isinstance(nrows, int), 'num rows must be an integer'
-        assert nrows > 0, 'please specify a number greater than zero'
-
-        assert self._dataSet is not None, 'internal API error, datasets must be created via context objects'
-
-        return self._dataSet.takeLast(nrows)
+        return self._dataSet.take(nrows, nbottom)
 
     def show(self, nrows=None):
         """ action that generates a physical plan, processes data and prints results as nicely formatted
         ASCII table to stdout.

From a6f31ddb42e635df259de7c8f2057215118b6335 Mon Sep 17 00:00:00 2001
From: korlamarch
Date: Wed, 16 Feb 2022 12:17:36 -0500
Subject: [PATCH 06/56] Add unit tests

---
 tuplex/test/core/TakeTest.cc | 125 +++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 tuplex/test/core/TakeTest.cc

diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc
new file mode 100644
index 000000000..08b648f34
--- /dev/null
+++ b/tuplex/test/core/TakeTest.cc
@@ -0,0 +1,125 @@
+//--------------------------------------------------------------------------------------------------------------------//
+//                                                                                                                      //
+//                                    Tuplex: Blazing Fast Python Data Science                                          //
+//                                                                                                                      //
+//                                                                                                                      //
+//  (c) 2017 - 2021, Tuplex team                                                                                        //
+//  Created by Leonhard Spiegelberg first on 1/1/2021                                                                   //
+//  License: Apache 2.0                                                                                                 //
+//--------------------------------------------------------------------------------------------------------------------//
+
+#include
+#include "TestUtils.h"
+
+class TakeTest : public PyTest {};
+
+TEST_F(TakeTest, takeTopTest) {
+    using namespace tuplex;
+    auto opt = testOptions();
+    Context context(opt);
+
+    auto rs = context.parallelize(
+            {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 0);
+
+    ASSERT_EQ(rs->rowCount(), 1);
+    auto v = rs->getRows(1);
+
+    EXPECT_EQ(v[0].getInt(0), 1);
+
+    auto rs2 = context.parallelize(
+            {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(3, 0);
+
+    ASSERT_EQ(rs2->rowCount(), 3);
+    auto v2 = rs2->getRows(3);
+
+    EXPECT_EQ(v2[0].getInt(0), 1);
+    EXPECT_EQ(v2[1].getInt(0), 2);
+    EXPECT_EQ(v2[2].getInt(0), 3);
+
+    auto rs3 = context.parallelize(
+            {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(5, 0);
+
+    ASSERT_EQ(rs3->rowCount(), 5);
+    auto v3 = rs3->getRows(5);
+
+    EXPECT_EQ(v3[0].getString(0), "hello");
+    EXPECT_EQ(v3[1].getString(0), "world");
+    EXPECT_EQ(v3[2].getString(0), "! :)");
+    EXPECT_EQ(v3[3].getString(0), "world");
+    EXPECT_EQ(v3[4].getString(0), "hello");
+
+}
+
+TEST_F(TakeTest, takeBottomTest) {
+    using namespace tuplex;
+    auto opt = testOptions();
+    Context context(opt);
+
+    auto rs = context.parallelize(
+            {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 1);
+
+    ASSERT_EQ(rs->rowCount(), 1);
+    auto v = rs->getRows(1);
+
+    EXPECT_EQ(v[0].getInt(0), 6);
+
+    auto rs2 = context.parallelize(
+            {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 3);
+
+    ASSERT_EQ(rs2->rowCount(), 3);
+    auto v2 = rs2->getRows(3);
+
+    EXPECT_EQ(v2[0].getInt(0), 4);
+    EXPECT_EQ(v2[1].getInt(0), 5);
+    EXPECT_EQ(v2[2].getInt(0), 6);
+
+    auto rs3 = context.parallelize(
+            {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(0, 5);
+
+    ASSERT_EQ(rs3->rowCount(), 5);
+    auto v3 = rs3->getRows(5);
+
+    EXPECT_EQ(v3[0].getString(0), "world");
+    EXPECT_EQ(v3[1].getString(0), "hello");
+    EXPECT_EQ(v3[2].getString(0), "!");
+    EXPECT_EQ(v3[3].getString(0), "! :)");
+    EXPECT_EQ(v3[4].getString(0), "!");
+
+}
+
+TEST_F(TakeTest, takeBothTest) {
+    using namespace tuplex;
+    auto opt = testOptions();
+    Context context(opt);
+
+    auto rs = context.parallelize(
+            {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 1);
+
+    ASSERT_EQ(rs->rowCount(), 2);
+    auto v = rs->getRows(2);
+
+    EXPECT_EQ(v[0].getInt(0), 1);
+    EXPECT_EQ(v[1].getInt(0), 6);
+
+    auto rs2 = context.parallelize(
+            {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(2, 1);
+
+    ASSERT_EQ(rs2->rowCount(), 3);
+    auto v2 = rs2->getRows(3);
+
+    EXPECT_EQ(v2[0].getInt(0), 1);
+    EXPECT_EQ(v2[1].getInt(0), 2);
+    EXPECT_EQ(v2[2].getInt(0), 6);
+
+    auto rs3 = context.parallelize(
+            {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(2, 3);
+
+    ASSERT_EQ(rs3->rowCount(), 5);
+    auto v3 = rs3->getRows(5);
+
+    EXPECT_EQ(v3[0].getString(0), "hello");
+    EXPECT_EQ(v3[1].getString(0), "world");
+    EXPECT_EQ(v3[2].getString(0), "!");
+    EXPECT_EQ(v3[3].getString(0), "! :)");
+    EXPECT_EQ(v3[4].getString(0), "!");
+}
\ No newline at end of file

From 07b87fdc4eaf35f320933d1b7cc1b43c536fd946 Mon Sep 17 00:00:00 2001
From: korlamarch
Date: Thu, 24 Feb 2022 23:29:11 -0500
Subject: [PATCH 07/56] add bottom limit to transform stage (wip)

---
 tuplex/core/include/logical/TakeOperator.h | 2 +-
 tuplex/core/src/physical/PhysicalPlan.cc   | 2 ++
 tuplex/core/src/physical/TransformStage.cc | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h
index 20c035a74..b5dd5db6e 100644
--- a/tuplex/core/include/logical/TakeOperator.h
+++ b/tuplex/core/include/logical/TakeOperator.h
@@ -40,7 +40,7 @@ namespace tuplex {
 
         int64_t limit() { return _limitTop; }
 
-        bool limitBottom() { return _limitBottom; }
+        int64_t limitBottom() { return _limitBottom; }
 
         std::vector<Row> getSample(const size_t num) const override;
 
diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc
index 17a4c7c0e..3985fe1ab 100644
--- a/tuplex/core/src/physical/PhysicalPlan.cc
+++ b/tuplex/core/src/physical/PhysicalPlan.cc
@@ -383,6 +383,8 @@ namespace tuplex {
         if(outputNode->type() == LogicalOperatorType::TAKE) {
             auto top = static_cast<TakeOperator*>(outputNode);
             builder.setOutputLimit(top->limit());
+            // TODO: work here
+            ...
         }
 
         // @TODO: add slowPip builder to this process...
diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc
index b61f9cbe2..6eb3f2e1f 100644
--- a/tuplex/core/src/physical/TransformStage.cc
+++ b/tuplex/core/src/physical/TransformStage.cc
@@ -142,6 +142,8 @@ namespace tuplex {
         size_t numOutputRows = 0;
         for (auto partition : partitions) {
             numOutputRows += partition->getNumRows();
+            // TODO(march): work here
+            ...
             if (numOutputRows >= outputLimit()) {
                 // clip last partition & leave loop
                 auto clipped = outputLimit() - (numOutputRows - partition->getNumRows());

From b2beb88f0ef2414a00e81ef22783791849ef27bf Mon Sep 17 00:00:00 2001
From: korlamarch
Date: Fri, 4 Mar 2022 11:02:22 -0500
Subject: [PATCH 08/56] more physical stage update (wip)

---
 tuplex/core/include/Partition.h               | 41 ++++++---
 tuplex/core/include/physical/StageBuilder.h   | 10 ++-
 tuplex/core/include/physical/TransformStage.h | 10 ++-
 tuplex/core/include/physical/TransformTask.h  |  4 +
 tuplex/core/src/ee/local/LocalBackend.cc      | 17 ++--
 tuplex/core/src/physical/PhysicalPlan.cc      |  4 +-
 tuplex/core/src/physical/ResultSet.cc         |  3 +-
 tuplex/core/src/physical/StageBuilder.cc      |  5 +-
 tuplex/core/src/physical/TransformStage.cc    | 85 ++++++++++++++++---
 tuplex/core/src/physical/TransformTask.cc     |  2 +
 10 files changed, 138 insertions(+), 43 deletions(-)

diff --git a/tuplex/core/include/Partition.h b/tuplex/core/include/Partition.h
index 5a66023fd..2eba22764 100644
--- a/tuplex/core/include/Partition.h
+++ b/tuplex/core/include/Partition.h
@@ -69,6 +69,7 @@ namespace tuplex {
         void loadFromFile(const URI& uri);
 
         int64_t _numRows;
+        int64_t _numSkip; // number of rows to skip, currently only used at the output (Result set)
         uint64_t _bytesWritten;
 
         Schema _schema; //! Schema of the partition. May be optimized away later.
@@ -110,6 +111,24 @@ namespace tuplex {
             setNumRows(0);
         }
 
+        explicit Partition(Partition* part) :
+                _owner(part->_owner),
+                _arena(part->_arena),
+                _size(part->_size),
+                _uuid(part->_uuid),
+                _active(false),
+                _immortal(false),
+                _locked(false),
+                _numRows(part->_numRows),
+                _bytesWritten(part->_bytesWritten),
+                _schema(part->_schema),
+                _dataSetID(part->_dataSetID),
+                _contextID(part->_contextID),
+                _swappedToFile(part->_swappedToFile) {
+
+            // TODO(march): to actually allocate memory here?
+        }
+
         ~Partition() {
             assert(!_locked);
         }
@@ -157,7 +176,7 @@ namespace tuplex {
          * return how much capacity is left, i.e. how many bytes can be actually written
         * @return
         */
-        size_t capacity() { return _size - sizeof(int64_t); }
+        size_t capacity() const { return _size - sizeof(int64_t); }
 
        uniqueid_t uuid() const { return _uuid; }
@@ -248,6 +267,19 @@ namespace tuplex {
             _mutex.unlock();
         }
 
+        size_t getNumSkip() {
+            size_t res = 0;
+            _mutex.lock();
+            res = _numSkip;
+            _mutex.unlock();
+            return res;
+        }
+
+        void setNumSkip(const size_t numSkip) {
+            _mutex.lock();
+            _numSkip = numSkip;
+            _mutex.unlock();
+        }
 
        int64_t getDataSetID() const { return _dataSetID; }
 
diff --git a/tuplex/core/include/physical/StageBuilder.h b/tuplex/core/include/physical/StageBuilder.h
index 63b94bd57..e678ead3d 100644
--- a/tuplex/core/include/physical/StageBuilder.h
+++ b/tuplex/core/include/physical/StageBuilder.h
@@ -76,8 +76,9 @@ namespace tuplex {
             void addFileInput(FileInputOperator* csvop);
             void addFileOutput(FileOutputOperator* fop);
 
-            inline void setOutputLimit(size_t limit) {
-                _outputLimit = limit;
+            inline void setOutputLimit(size_t topLimit, size_t bottomLimit) {
+                _outputTopLimit = topLimit;
+                _outputBottomLimit = bottomLimit;
             }
 
             TransformStage* build(PhysicalPlan* plan, IBackend* backend);
@@ -134,7 +135,8 @@ namespace tuplex {
             FileFormat _outputFileFormat;
             int64_t _outputNodeID;
             int64_t _inputNodeID;
-            size_t _outputLimit;
+            size_t _outputTopLimit;
+            size_t _outputBottomLimit;
             LogicalOperator* _inputNode;
             std::vector _columnsToRead;
@@ -157,7 +159,7 @@ namespace tuplex {
             int64_t outputDataSetID() const;
 
             inline bool hasOutputLimit() const {
-                return _outputLimit < std::numeric_limits<size_t>::max();
+                return _outputTopLimit < std::numeric_limits<size_t>::max() || _outputBottomLimit > 0;
             }
 
             inline char csvOutputDelimiter() const {
diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h
index 22d7f5fb4..e63eaec31 100644
--- a/tuplex/core/include/physical/TransformStage.h
+++ b/tuplex/core/include/physical/TransformStage.h
@@ -111,14 +111,15 @@ namespace tuplex {
          * @param outputLimit
         */
         void setOutputLimit(size_t outputLimit) {
-            _outputLimit = outputLimit;
+            _outputTopLimit = outputLimit;
 
             // @TODO: move this logic to physical plan!
             // pushdown limit
             //pushDownOutputLimit();
         }
 
-        size_t outputLimit() const { return _outputLimit; }
+        size_t outputTopLimit() const { return _outputTopLimit; }
+        size_t outputBottomLimit() const { return _outputBottomLimit; }
         size_t inputLimit() const { return _inputLimit; }
 
         /*!
@@ -442,7 +443,8 @@ namespace tuplex {
         std::vector<Partition*> _inputPartitions; //! memory input partitions for this task.
         size_t _inputLimit; //! limit number of input rows (inf per default)
-        size_t _outputLimit; //! output limit, set e.g. by take, to_csv etc. (inf per default)
+        size_t _outputTopLimit; //! output limit, set e.g. by take, to_csv etc. (inf per default)
+        size_t _outputBottomLimit; //! output limit, set e.g. by take, to_csv etc. (0 per default)
 
         std::shared_ptr<ResultSet> _rs; //! result set
 
@@ -479,7 +481,7 @@ namespace tuplex {
         python::Type _hashOutputBucketType;
 
         bool hasOutputLimit() const {
-            return _outputLimit < std::numeric_limits<size_t>::max();
+            return _outputTopLimit < std::numeric_limits<size_t>::max();
         }
     };
 }
diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h
index 2868ba668..c3b9dbeb4 100644
--- a/tuplex/core/include/physical/TransformTask.h
+++ b/tuplex/core/include/physical/TransformTask.h
@@ -183,6 +183,7 @@ namespace tuplex {
         HashTableSink hashTableSink() const { return _htable; } // needs to be freed manually!
 
         void setOutputLimit(size_t limit) { _outLimit = limit; resetOutputLimitCounter(); }
+        void setOutputBottomLimit(size_t limit) { _outBottomLimit = limit; resetOutputLimitCounter(); }
         void setOutputSkip(size_t numRowsToSkip) { _outSkipRows = numRowsToSkip; }
 
         void execute() override;
@@ -250,6 +251,8 @@ namespace tuplex {
         size_t output_rows_written() const { return _numOutputRowsWritten; }
         size_t output_limit() const { return _outLimit; }
 
+        size_t output_bottom_limit() const { return _outBottomLimit; }
+
     private:
         void resetSinks();
         void resetSources();
@@ -277,6 +280,7 @@ namespace tuplex {
         std::unordered_map<std::string, std::string> _outOptions;
 
         size_t _outLimit; // limits how many rows to write at max
+        size_t _outBottomLimit; // limits how many last rows to write at max
         size_t _outSkipRows; // how many rows at start to skip
 
         // memory source variables
diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc
index bed96ec5a..47da4dc23 100644
--- a/tuplex/core/src/ee/local/LocalBackend.cc
+++ b/tuplex/core/src/ee/local/LocalBackend.cc
@@ -486,6 +486,7 @@ namespace tuplex {
 
         // check what type of input the pipeline has (memory or files)
         if(tstage->fileInputMode()) {
+            // TODO(march): deal with file input
             // files
             // input is multiple files, use split file strategy here.
             // and issue tasks to executor workqueue!
@@ -550,7 +551,7 @@ namespace tuplex {
 
                     task->sinkExceptionsToMemory(inputSchema);
                     task->setStageID(tstage->getID());
-                    task->setOutputLimit(tstage->outputLimit());
+                    task->setOutputLimit(tstage->outputTopLimit());
                     // add to tasks
                     tasks.emplace_back(std::move(task));
                 } else {
@@ -584,7 +585,7 @@ namespace tuplex {
                     }
                     task->sinkExceptionsToMemory(inputSchema);
                     task->setStageID(tstage->getID());
-                    task->setOutputLimit(tstage->outputLimit());
+                    task->setOutputLimit(tstage->outputTopLimit());
                     // add to tasks
                     tasks.emplace_back(std::move(task));
                     num_parts++;
@@ -621,7 +622,7 @@ namespace tuplex {
                 }
                 task->sinkExceptionsToMemory(inputSchema);
                 task->setStageID(tstage->getID());
-                task->setOutputLimit(tstage->outputLimit());
+                task->setOutputLimit(tstage->outputTopLimit());
 
                 // add to tasks
                 tasks.emplace_back(std::move(task));
@@ -683,7 +684,11 @@ namespace tuplex {
                 task->setInputExceptions(tstage->inputExceptions());
                 task->sinkExceptionsToMemory(inputSchema);
                 task->setStageID(tstage->getID());
-                task->setOutputLimit(tstage->outputLimit());
+                task->setOutputLimit(tstage->outputTopLimit());
+                if (tstage->outputBottomLimit()) {
+                    // TODO(march): work here
+                    task->setOutputBottomLimit(tstage->outputBottomLimit());
+                }
 
                 tasks.emplace_back(std::move(task));
                 numInputRows += partition->getNumRows();
@@ -837,6 +842,7 @@ namespace tuplex {
     }
 
     void LocalBackend::executeTransformStage(tuplex::TransformStage *tstage) {
+        // TODO(march): work here
 
         Timer stageTimer;
         Timer timer; // for detailed measurements.
@@ -1529,6 +1535,7 @@ namespace tuplex {
 #endif
 
         // add all tasks to queue
+        // TODO(march): question here
         for(auto& task : tasks)
            wq.addTask(task);
        // clear
        tasks.clear();
@@ -1955,7 +1962,7 @@ namespace tuplex {
         // now simply go over the partitions and write the full buffers out
 
         // check all the params from TrafoStage
-        size_t limit = tstage->outputLimit();
+        size_t limit = tstage->outputTopLimit();
         size_t splitSize = tstage->splitSize();
         size_t numOutputFiles = tstage->numOutputFiles();
         URI uri = tstage->outputURI();
diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc
index 3985fe1ab..9c22837ad 100644
--- a/tuplex/core/src/physical/PhysicalPlan.cc
+++ b/tuplex/core/src/physical/PhysicalPlan.cc
@@ -382,9 +382,7 @@ namespace tuplex {
         // set limit if output node has a limit (currently only TakeOperator)
         if(outputNode->type() == LogicalOperatorType::TAKE) {
             auto top = static_cast<TakeOperator*>(outputNode);
-            builder.setOutputLimit(top->limit());
-            // TODO: work here
-            ...
+            builder.setOutputLimit(top->limit(), top->limitBottom());
         }
 
         // @TODO: add slowPip builder to this process...
diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc
index 0f7bf7319..5e15867f7 100644
--- a/tuplex/core/src/physical/ResultSet.cc
+++ b/tuplex/core/src/physical/ResultSet.cc
@@ -98,7 +98,7 @@ namespace tuplex {
         Partition *first = _partitions.front();
         assert(_schema == first->schema());
 
-        auto numRows = first->getNumRows();
+        auto numRows = first->getNumRows() - first->getNumSkip();
         _rowsRetrieved += numRows;
 
         _partitions.pop_front();
@@ -183,6 +183,7 @@ namespace tuplex {
     }
 
     Row ResultSet::getNextRow() {
+        // TODO(march): logic in skip row count here
         // merge rows from objects
         if(!_pyobjects.empty()) {
             auto row_number = std::get<0>(_pyobjects.front());
diff --git a/tuplex/core/src/physical/StageBuilder.cc b/tuplex/core/src/physical/StageBuilder.cc
index 0bf509ed1..bc814182b 100644
--- a/tuplex/core/src/physical/StageBuilder.cc
+++ b/tuplex/core/src/physical/StageBuilder.cc
@@ -50,7 +50,7 @@ namespace tuplex {
             : _stageNumber(stage_number), _isRootStage(rootStage), _allowUndefinedBehavior(allowUndefinedBehavior),
               _generateParser(generateParser), _normalCaseThreshold(normalCaseThreshold), _sharedObjectPropagation(sharedObjectPropagation),
               _nullValueOptimization(nullValueOptimization), _updateInputExceptions(updateInputExceptions),
-              _inputNode(nullptr), _outputLimit(std::numeric_limits<size_t>::max()) {
+              _inputNode(nullptr), _outputTopLimit(std::numeric_limits<size_t>::max()), _outputBottomLimit(0) {
     }
 
     void StageBuilder::generatePythonCode() {
@@ -1426,7 +1426,8 @@ namespace tuplex {
         // no limit operator yet...
 
         // get limit
-        stage->_outputLimit = _outputLimit;
+        stage->_outputTopLimit = _outputTopLimit;
+        stage->_outputBottomLimit = _outputBottomLimit;
 
         // copy input/output configurations
         stage->_fileInputParameters = _fileInputParameters;
diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc
index 6eb3f2e1f..ff54c2db3 100644
--- a/tuplex/core/src/physical/TransformStage.cc
+++ b/tuplex/core/src/physical/TransformStage.cc
@@ -48,7 +48,8 @@ namespace tuplex {
                                    int64_t number,
                                    bool allowUndefinedBehavior) : PhysicalStage::PhysicalStage(plan, backend, number),
                                                                   _inputLimit(std::numeric_limits<size_t>::max()),
-                                                                  _outputLimit(std::numeric_limits<size_t>::max()),
+                                                                  _outputTopLimit(std::numeric_limits<size_t>::max()),
+                                                                  _outputBottomLimit(0),
                                                                   _aggMode(AggregateType::AGG_NONE) {
 
         // TODO: is this code out of date? + is allowUndefinedBehavior needed here?
@@ -129,7 +130,7 @@ namespace tuplex {
         if (partitions.empty() && interpreterRows.empty() && generalCase.empty())
             _rs = emptyResultSet();
         else {
-            std::vector<Partition*> limitedPartitions;
+            std::vector<Partition*> limitedPartitions, limitedTailPartitions;
             auto schema = Schema::UNKNOWN;
 
             if(!partitions.empty()) {
@@ -138,31 +139,91 @@ namespace tuplex {
                 assert(schema == partition->schema());
             }
 
-            // check output limit, adjust partitions if necessary
-            size_t numOutputRows = 0;
-            for (auto partition : partitions) {
-                numOutputRows += partition->getNumRows();
-                // TODO(march): work here
-                ...
-                if (numOutputRows >= outputLimit()) {
-                    // clip last partition & leave loop
-                    auto clipped = outputLimit() - (numOutputRows - partition->getNumRows());
-                    assert(clipped <= partition->getNumRows());
-                    partition->setNumRows(clipped);
-                    if (clipped > 0)
-                        limitedPartitions.push_back(partition);
-                    break;
-                } else {
-                    // put full partition to output set
-                    limitedPartitions.push_back(partition);
-                }
-            }
+            // check top output limit, adjust partitions if necessary
+            size_t numTopOutputRows = 0;
+            Partition* lastTopPart = nullptr;
+            size_t clippedTop = 0;
+            for (auto partition : partitions) {
+                numTopOutputRows += partition->getNumRows();
+                lastTopPart = partition;
+                if (numTopOutputRows >= outputTopLimit()) {
+                    // clip last partition & leave loop
+                    clippedTop = outputTopLimit() - (numTopOutputRows - partition->getNumRows());
+                    assert(clippedTop <= partition->getNumRows());
+                    break;
+                } else if (partition == partitions.back()) {
+                    // last partition, mark full row, but don't put to output set yet to avoid double put
+                    clippedTop = partition->getNumRows();
+                    break;
+                } else {
+                    // put full partition to output set
+                    limitedPartitions.push_back(partition);
+                }
+            }
+
+            // check the bottom output limit, adjust partitions if necessary
+            size_t numBottomOutputRows = 0;
+            size_t clippedBottom = 0;
+            for (auto it = partitions.rbegin(); it != partitions.rend(); it++) {
+                auto partition = *it;
+                numBottomOutputRows += partition->getNumRows();
+
+                if (partition == lastTopPart) {
+                    // the bottom and the top partitions are overlapping
+                    clippedBottom = outputBottomLimit() - (numBottomOutputRows - partition->getNumRows());
+                    if (clippedTop + clippedBottom >= partition->getNumRows()) {
+                        // if top and bottom range intersect, use full partitions
+                        clippedTop = partition->getNumRows();
+                        clippedBottom = 0;
+                    }
+                    break;
+                } else if (numBottomOutputRows >= outputBottomLimit()) {
+                    // clip last partition & leave loop
+                    auto clipped = outputBottomLimit() - (numBottomOutputRows - partition->getNumRows());
+                    assert(clipped <= partition->getNumRows());
+                    partition->setNumSkip(partition->getNumRows() - clipped);
+                    partition->setNumRows(clipped);
+                    if (clipped > 0)
+                        limitedTailPartitions.push_back(partition);
+                    break;
+                } else {
+                    // put full partition to output set
+                    limitedTailPartitions.push_back(partition);
+                }
+            }
+
+            // push the middle partition
+            if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) {
+                assert(clippedTop + clippedBottom <= lastTopPart->getNumRows());
+
+                // TODO(march): to work on this (split into two partitions)
+                // split into two partitions with both top and bottom are in the same partition
+                Partition* lastBottomPart = nullptr;
+                if (clippedBottom != 0) {
+                    lastBottomPart = new Partition(lastTopPart);
+                    lastBottomPart->setNumSkip(lastBottomPart->getNumRows() - clippedBottom);
+                    lastBottomPart->setNumRows(clippedBottom);
+                }
+
+                lastTopPart->setNumRows(clippedTop);
+
+                limitedPartitions.push_back(lastTopPart);
+
+                if (lastBottomPart != nullptr) {
+                    limitedPartitions.push_back(lastBottomPart);
+                }
+            }
+
+            // merge the head and tail partitions
+            std::reverse(limitedTailPartitions.begin(), limitedTailPartitions.end());
+            limitedPartitions.insert(limitedPartitions.end(), limitedTailPartitions.begin(), limitedTailPartitions.end());
         }
 
         // put ALL partitions to result set
+        // TODO(march): handle overlapping case
         _rs = std::make_shared<ResultSet>(schema, limitedPartitions, generalCase, partitionToExceptionsMap, interpreterRows,
-                                          outputLimit());
+                                          outputTopLimit() + outputBottomLimit());
     }
 
diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc
index c560c4af4..d05e7ce50 100644
--- a/tuplex/core/src/physical/TransformTask.cc
+++ b/tuplex/core/src/physical/TransformTask.cc
@@ -514,6 +514,7 @@ namespace tuplex {
         _outFile.reset(nullptr);
         _outPrefix.reset();
         _outLimit = std::numeric_limits<size_t>::max(); // write all rows
+        _outBottomLimit = 0;
         _outSkipRows = 0; // skip no rows
 
         // reset memory sink
@@ -619,6 +620,7 @@ namespace tuplex {
 
         auto functor = reinterpret_cast(_functor);
 
+        // TODO(march): question here?
         // go over all input partitions.
         for(const auto &inputPartition : _inputPartitions) {
             // lock ptr, extract number of rows ==> store them

From 89cee2ee24d6f45f6d92ef92187e9eb0ee733846 Mon Sep 17 00:00:00 2001
From: korlamarch
Date: Fri, 4 Mar 2022 11:02:22 -0500
Subject: [PATCH 09/56] Quick push

---
 tuplex/core/include/Executor.h                | 21 ++++-
 tuplex/core/include/Partition.h               | 18 ----
 tuplex/core/include/physical/ResultSet.h      |  2 +
 tuplex/core/include/physical/TransformTask.h  |  6 +-
 tuplex/core/src/Executor.cc                   | 97 +++++++++++---------
 tuplex/core/src/ee/local/LocalBackend.cc      | 38 ++++----
 tuplex/core/src/physical/TransformStage.cc    |  1 +
 tuplex/core/src/physical/TransformTask.cc     | 17 ++--
 tuplex/utils/include/mt/ITask.h               | 26 ++----
 9 files changed, 112 insertions(+), 114 deletions(-)

diff --git a/tuplex/core/include/Executor.h b/tuplex/core/include/Executor.h
index 0bca412be..3631f7e7d 100644
--- a/tuplex/core/include/Executor.h
+++ b/tuplex/core/include/Executor.h
@@ -44,12 +44,19 @@ namespace tuplex {
     */
    class WorkQueue {
    private:
-        std::atomic_bool _done; // protects against data races
+        std::atomic_bool _done{}; // protects against data races
        ExecutorTaskQueueType _queue;
        std::mutex _completedTasksMutex;
        std::vector<IExecutorTask*> _completedTasks;
-        std::atomic_int _numPendingTasks;
-        std::atomic_int _numCompletedTasks;
+        std::atomic_int _numPendingTasks{};
+        std::atomic_int _numCompletedTasks{};
+
+        // mapping from order number -> row count if the task is finished
+        std::mutex _rowsDoneMutex;
+        std::map<size_t, size_t> _rowsDone;
+
+        std::atomic_int _frontRowsLimit{};
+        std::atomic_int _bottomRowsLimit{};
 
    public:
        WorkQueue();
@@ -74,6 +81,14 @@ namespace tuplex {
 
        size_t numCompletedTasks() const { return _numCompletedTasks; }
 
+        size_t frontRowsLimit() const {
+            return _frontRowsLimit;
+        };
+
+        size_t bottomRowsLimit() const {
+            return _bottomRowsLimit;
+        };
+
        /*!
         * stop working on this queue & dump all tasks
         */
diff --git a/tuplex/core/include/Partition.h b/tuplex/core/include/Partition.h
index 2eba22764..24b79cc8f 100644
--- a/tuplex/core/include/Partition.h
+++ b/tuplex/core/include/Partition.h
@@ -111,24 +111,6 @@ namespace tuplex {
            setNumRows(0);
        }
 
-        explicit Partition(Partition* part) :
-                _owner(part->_owner),
-                _arena(part->_arena),
-                _size(part->_size),
-                _uuid(part->_uuid),
-                _active(false),
-                _immortal(false),
-                _locked(false),
-                _numRows(part->_numRows),
-                _bytesWritten(part->_bytesWritten),
-                _schema(part->_schema),
-                _dataSetID(part->_dataSetID),
-                _contextID(part->_contextID),
-                _swappedToFile(part->_swappedToFile) {
-
-            // TODO(march): to actually allocate memory here?
- } - ~Partition() { assert(!_locked); } diff --git a/tuplex/core/include/physical/ResultSet.h b/tuplex/core/include/physical/ResultSet.h index e94b8f1ae..5e69fef3a 100644 --- a/tuplex/core/include/physical/ResultSet.h +++ b/tuplex/core/include/physical/ResultSet.h @@ -36,6 +36,8 @@ namespace tuplex { size_t _rowsRetrieved; size_t _totalRowCounter; // used for merging in rows! size_t _maxRows; + size_t _maxRowsTop; + size_t _maxRowsBottom; Schema _schema; void removeFirstPartition(); diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index c3b9dbeb4..d065e86d3 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -182,7 +182,7 @@ namespace tuplex { void sinkOutputToHashTable(HashTableFormat fmt, int64_t outputDataSetID); HashTableSink hashTableSink() const { return _htable; } // needs to be freed manually! - void setOutputLimit(size_t limit) { _outLimit = limit; resetOutputLimitCounter(); } + void setOutputTopLimit(size_t limit) { _outTopLimit = limit; resetOutputLimitCounter(); } void setOutputBottomLimit(size_t limit) { _outBottomLimit = limit; resetOutputLimitCounter(); } void setOutputSkip(size_t numRowsToSkip) { _outSkipRows = numRowsToSkip; } void execute() override; @@ -250,7 +250,7 @@ namespace tuplex { double wallTime() const override { return _wallTime; } size_t output_rows_written() const { return _numOutputRowsWritten; } - size_t output_limit() const { return _outLimit; } + size_t output_top_limit() const { return _outTopLimit; } size_t output_bottom_limit() const { return _outBottomLimit; } private: @@ -279,7 +279,7 @@ namespace tuplex { Buffer _outPrefix; std::unordered_map _outOptions; - size_t _outLimit; // limits how many rows to write at max + size_t _outTopLimit; // limits how many rows to write at max size_t _outBottomLimit; // limits how many last rows to write at max size_t _outSkipRows; // how many rows at start to skip diff --git a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc index 845b78e6a..1cc818010 100644 --- a/tuplex/core/src/Executor.cc +++ b/tuplex/core/src/Executor.cc @@ -32,8 +32,12 @@ namespace tuplex { std::vector WorkQueue::popCompletedTasks() { TRACE_LOCK("workQueue"); - std::lock_guard lock(_completedTasksMutex); + _taskDoneMutex.lock(); + _taskDone.clear(); + _taskDoneMutex.unlock(); + + std::lock_guard lock(_completedTasksMutex); // move leads to circular dependency in gcc and thus a bug on travis-ci. Therefore, just // use the below hack to fool the compiler into actually copying the vectors // // move to reset completed tasks and return array @@ -78,59 +82,66 @@ namespace tuplex { bool WorkQueue::workTask(Executor& executor, bool nonBlocking) { IExecutorTask *task = nullptr; - if(nonBlocking) { - // @Todo: This should be put into a function "work" on the workQueue... - // dequeue from general working queue - if(_queue.try_dequeue(task)) { - if(!task) - return false; - task->setOwner(&executor); - task->setThreadNumber(executor.threadNumber()); // redundant? + // dequeue from general working queue + // Note: is this TODO: outdated? + // @Todo: This should be put into a function "work" on the workQueue... 
+ if (nonBlocking) { + if(!_queue.try_dequeue(task)) { + return false; + } + } else { + _queue.wait_dequeue(task); + } - //executor.logger().info("started task..."); - // process task - task->execute(); - // save which thread executed this task - task->setID(std::this_thread::get_id()); + if(!task) { + return false; + } + // if reach the top limit already, then don't compute the rest + size_t numTopCompleted; + TRACE_LOCK("rowsDone"); + _rowsDoneMutex.lock(); + size_t frontRowsDone = 0; + for (size_t i = 0; _rowsDone.count(i) != 0; i++) { + frontRowsDone += _rowsDone[i]; + if (frontRowsDone >= _queue.frontRowsLimit()) { + // skip execution _numPendingTasks.fetch_add(-1, std::memory_order_release); - - // add task to done list - TRACE_LOCK("completedTasks"); - _completedTasksMutex.lock(); - _completedTasks.push_back(std::move(task)); - _completedTasksMutex.unlock(); - _numCompletedTasks.fetch_add(1, std::memory_order_release); - TRACE_UNLOCK("completedTasks"); + _rowsDoneMutex.unlock(); + TRACE_UNLOCK("rowsDone"); return true; } - } else { - _queue.wait_dequeue(task); + } + _rowsDoneMutex.unlock(); + TRACE_UNLOCK("rowsDone"); - if(!task) - return false; + task->setOwner(&executor); + task->setThreadNumber(executor.threadNumber()); // redundant? - task->setOwner(&executor); - task->setThreadNumber(executor.threadNumber()); // redundant? + // executor.logger().info("started task..."); + // process task + task->execute(); + // save which thread executed this task + task->setID(std::this_thread::get_id()); - // process task - task->execute(); - // save which thread executed this task - task->setID(std::this_thread::get_id()); + _numPendingTasks.fetch_add(-1, std::memory_order_release); - // add task to done list - TRACE_LOCK("completedTasks"); - _completedTasksMutex.lock(); - _completedTasks.push_back(std::move(task)); - _completedTasksMutex.unlock(); - _numCompletedTasks.fetch_add(1, std::memory_order_release); - TRACE_UNLOCK("completedTasks"); + // add task to done list + TRACE_LOCK("completedTasks"); + _completedTasksMutex.lock(); + _completedTasks.push_back(std::move(task)); + _completedTasksMutex.unlock(); + _numCompletedTasks.fetch_add(1, std::memory_order_release); + TRACE_UNLOCK("completedTasks"); - _numPendingTasks.fetch_add(-1, std::memory_order_release); - return true; - } - return false; + TRACE_LOCK("rowsDone"); + _rowsDoneMutex.lock(); + _rowsDone[task->getOrder()] += task->getNumOutputRows(); + _rowsDoneMutex.unlock(); + TRACE_UNLOCK("rowsDone"); + + return true; } void WorkQueue::workUntilAllTasksFinished(tuplex::Executor &executor, bool flushPeriodicallyToPython) { diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 47da4dc23..5a1311436 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -551,7 +551,7 @@ namespace tuplex { task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - task->setOutputLimit(tstage->outputTopLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); // add to tasks tasks.emplace_back(std::move(task)); } else { @@ -585,7 +585,7 @@ namespace tuplex { } task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - task->setOutputLimit(tstage->outputTopLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); // add to tasks tasks.emplace_back(std::move(task)); num_parts++; @@ -622,7 +622,7 @@ namespace tuplex { } task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - 
task->setOutputLimit(tstage->outputTopLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); // add to tasks tasks.emplace_back(std::move(task)); @@ -684,10 +684,10 @@ namespace tuplex { task->setInputExceptions(tstage->inputExceptions()); task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - task->setOutputLimit(tstage->outputTopLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); + task->setOutputBottomLimit(tstage->outputBottomLimit()); if (tstage->outputBottomLimit()) { - // TODO(march): work here - task->setOutputBottomLimit(tstage->outputBottomLimit()); + // TODO(march): work here (task output limit generation) } tasks.emplace_back(std::move(task)); numInputRows += partition->getNumRows(); @@ -842,8 +842,6 @@ namespace tuplex { } void LocalBackend::executeTransformStage(tuplex::TransformStage *tstage) { - // TODO(march): work here - Timer stageTimer; Timer timer; // for detailed measurements. @@ -943,6 +941,7 @@ namespace tuplex { } } + // TODO(march): work here (transform stage) auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms); auto completedTasks = performTasks(tasks); @@ -1519,24 +1518,21 @@ namespace tuplex { WorkQueue& wq = LocalEngine::instance().getQueue(); wq.clear(); - // check if ord is set, if not issue warning & add - bool orderlessTaskFound = false; + // assign the order for all tasks for(int i = 0; i < tasks.size(); ++i) { - if(tasks[i]->getOrder().size() == 0) { - tasks[i]->setOrder(i); - orderlessTaskFound = true; - } + tasks[i]->setOrder(i); } -#ifndef NDEBUG - if(orderlessTaskFound) { - logger().debug("task without order found, please fix in code."); + // add all tasks to queue + // TODO(march): add task stage (to do striping) + for(size_t i = 0; i <= tasks.size() - i - 1; i++) { + const size_t revI = tasks.size()- i - 1 + wq.addTask(&tasks[i]); + if (revI > i) { + wq.addTask(&tasks[revI]); + } } -#endif - // add all tasks to queue - // TODO(march): question here - for(auto& task : tasks) wq.addTask(task); // clear tasks.clear(); diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index ff54c2db3..af58866dc 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -221,6 +221,7 @@ namespace tuplex { // put ALL partitions to result set // TODO(march): handle overlapping case + // TODO(march): maybe do top/bottom limit at the level instead? _rs = std::make_shared(schema, limitedPartitions, generalCase, partitionToExceptionsMap, interpreterRows, outputTopLimit() + outputBottomLimit()); diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index d05e7ce50..a65aa7f11 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -19,10 +19,12 @@ namespace tuplex { // atomic var to count output rows! - static std::atomic_int64_t g_totalOutputRows; + static std::atomic_int64_t g_totalTopOutputRows; + static std::atomic_int64_t g_totalBottomOutputRows; void TransformTask::resetOutputLimitCounter() { - g_totalOutputRows = 0; + g_totalTopOutputRows = 0; + g_totalBottomOutputRows = 0; } } @@ -41,7 +43,8 @@ extern "C" { static int64_t limited_w2mCallback(tuplex::TransformTask* task, uint8_t* buf, int64_t bufSize) { // i.e. check here how many output rows, if already limit reached - jump to goto! 
- if(tuplex::g_totalOutputRows >= task->output_limit()) { + // TODO(march): comment this out + if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); } @@ -49,10 +52,10 @@ extern "C" { assert(dynamic_cast(task)); auto rc = task->writeRowToMemory(buf, bufSize); if(0 == rc) - tuplex::g_totalOutputRows++; + tuplex::g_totalTopOutputRows++; // i.e. check here how many output rows, if already limit reached - jump to goto! - if(tuplex::g_totalOutputRows >= task->output_limit()) { + if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); } return rc; @@ -513,7 +516,7 @@ namespace tuplex { _outputFilePath = URI::INVALID; _outFile.reset(nullptr); _outPrefix.reset(); - _outLimit = std::numeric_limits::max(); // write all rows + _outTopLimit = std::numeric_limits::max(); // write all rows _outBottomLimit = 0; _outSkipRows = 0; // skip no rows @@ -680,7 +683,7 @@ namespace tuplex { // skip rows? limit rows?? - if(_numOutputRowsWritten >= _outSkipRows && _numOutputRowsWritten < (_outLimit - _outSkipRows)) { + if(_numOutputRowsWritten >= _outSkipRows && _numOutputRowsWritten < (_outTopLimit - _outSkipRows)) { if(_outFile->write(buf, bufSize) != VirtualFileSystemStatus::VFS_OK) return ecToI32(ExceptionCode::IOERROR); } diff --git a/tuplex/utils/include/mt/ITask.h b/tuplex/utils/include/mt/ITask.h index 8434896a7..01f7137f1 100644 --- a/tuplex/utils/include/mt/ITask.h +++ b/tuplex/utils/include/mt/ITask.h @@ -29,7 +29,7 @@ namespace tuplex { std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread. //! Per default object is constructed that does not represent a thread - std::vector _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages + size_t _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages public: ITask() {}; @@ -51,33 +51,21 @@ namespace tuplex { _id = id; } - void setOrder(size_t order) { _orderNumbers = std::vector{order}; } - -// size_t getOrder(const size_t nth = 0) const { -// return _orderNumbers[nth]; -// } - std::vector getOrder() const { return _orderNumbers; } - - void setOrder(const std::vector& order) { + void setOrder(size_t order) { _orderNumbers = order; } + size_t getOrder() const { + return _orderNumbers; + } + /*! * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones * @param other * @return */ bool compareAscOrder(const ITask& other) const { - // make sure they have the same length - assert(_orderNumbers.size() == other._orderNumbers.size()); - - // this < other? 
- // compare one by one - for(int i = 0; i < other._orderNumbers.size(); ++i) { - if(_orderNumbers[i] >= other._orderNumbers[i]) - return false; - } - return true; + return _orderNumbers[i] < other._orderNumbers[i]; } }; } From 3e1d243c9d1d29b9ea354b0b5c98bd58e59f2d3d Mon Sep 17 00:00:00 2001 From: korlamarch Date: Wed, 9 Mar 2022 13:00:17 -0500 Subject: [PATCH 10/56] Rework LocalBackend and TransformTask to support top and bottom limit --- tuplex/core/include/Executor.h | 6 - tuplex/core/include/Partition.h | 14 -- tuplex/core/include/ee/local/LocalBackend.h | 3 + tuplex/core/include/physical/TransformStage.h | 12 +- tuplex/core/src/Executor.cc | 24 --- tuplex/core/src/ee/local/LocalBackend.cc | 182 ++++++++++++++++-- tuplex/core/src/physical/PhysicalPlan.cc | 2 +- tuplex/core/src/physical/ResultSet.cc | 1 - tuplex/core/src/physical/TransformStage.cc | 86 +-------- tuplex/core/src/physical/TransformTask.cc | 54 ++++-- tuplex/utils/include/mt/ITask.h | 85 ++++---- 11 files changed, 270 insertions(+), 199 deletions(-) diff --git a/tuplex/core/include/Executor.h b/tuplex/core/include/Executor.h index 3631f7e7d..7eaaee244 100644 --- a/tuplex/core/include/Executor.h +++ b/tuplex/core/include/Executor.h @@ -51,12 +51,6 @@ namespace tuplex { std::atomic_int _numPendingTasks{}; std::atomic_int _numCompletedTasks{}; - // mapping from order number -> row count if the task is finished - std::mutex _rowsDoneMutex; - std::map _rowsDone; - - std::atomic_int _frontRowsLimit{}; - std::atomic_int _bottomRowsLimit{}; public: WorkQueue(); diff --git a/tuplex/core/include/Partition.h b/tuplex/core/include/Partition.h index 24b79cc8f..8bf112051 100644 --- a/tuplex/core/include/Partition.h +++ b/tuplex/core/include/Partition.h @@ -69,7 +69,6 @@ namespace tuplex { void loadFromFile(const URI& uri); int64_t _numRows; - int64_t _numSkip; // number of rows to skip, currently only used at the output (Result set) uint64_t _bytesWritten; Schema _schema; //! Schema of the partition. May be optimized away later. @@ -249,19 +248,6 @@ namespace tuplex { _mutex.unlock(); } - size_t getNumSkip() { - size_t res = 0; - _mutex.lock(); - res = num_skip; - _mutex.unlock(); - return res; - } - - void setNumSkip(const size_t numSkip) { - _mutex.lock(); - _numSkip = numSkip; - _mutex.unlock(); - } int64_t getDataSetID() const { return _dataSetID; } diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index 77d375aed..0dbfafdc9 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -88,6 +88,9 @@ namespace tuplex { MessageHandler& logger() const { return Logger::instance().logger("local ee"); } + void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, TransformStage* tstage); + Partition* newPartitionWithSkipRows(Partition* p_in, int numToSkip, TransformStage* tstage); + // write output (may be already in correct format!) void writeOutput(TransformStage* tstage, std::vector& sortedTasks); diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index e63eaec31..f489f1f6c 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -107,11 +107,13 @@ namespace tuplex { std::unordered_map partitionToExceptionsMap() { return _partitionToExceptionsMap; } /*! 
- * sets maximum number of rows this pipeline will produce - * @param outputLimit + * sets maximum number of top rows this pipeline will produce + * @param topLimit + * @param bottomLimit */ - void setOutputLimit(size_t outputLimit) { - _outputTopLimit = outputLimit; + inline void setOutputLimit(size_t topLimit, size_t bottomLimit) { + _outputTopLimit = topLimit; + _outputBottomLimit = bottomLimit; // @TODO: move this logic to physical plan! // pushdown limit @@ -481,7 +483,7 @@ namespace tuplex { python::Type _hashOutputBucketType; bool hasOutputLimit() const { - return _outputTopLimit < std::numeric_limits::max(); + return _outputTopLimit < std::numeric_limits::max() && _outputBottomLimit != 0; } }; } diff --git a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc index 1cc818010..388199e4d 100644 --- a/tuplex/core/src/Executor.cc +++ b/tuplex/core/src/Executor.cc @@ -98,24 +98,6 @@ namespace tuplex { return false; } - // if reach the top limit already, then don't compute the rest - size_t numTopCompleted; - TRACE_LOCK("rowsDone"); - _rowsDoneMutex.lock(); - size_t frontRowsDone = 0; - for (size_t i = 0; _rowsDone.count(i) != 0; i++) { - frontRowsDone += _rowsDone[i]; - if (frontRowsDone >= _queue.frontRowsLimit()) { - // skip execution - _numPendingTasks.fetch_add(-1, std::memory_order_release); - _rowsDoneMutex.unlock(); - TRACE_UNLOCK("rowsDone"); - return true; - } - } - _rowsDoneMutex.unlock(); - TRACE_UNLOCK("rowsDone"); - task->setOwner(&executor); task->setThreadNumber(executor.threadNumber()); // redundant? @@ -135,12 +117,6 @@ namespace tuplex { _numCompletedTasks.fetch_add(1, std::memory_order_release); TRACE_UNLOCK("completedTasks"); - TRACE_LOCK("rowsDone"); - _rowsDoneMutex.lock(); - _rowsDone[task->getOrder()] += task->getNumOutputRows(); - _rowsDoneMutex.unlock(); - TRACE_UNLOCK("rowsDone"); - return true; } diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 5a1311436..dbceaa1b9 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -486,7 +486,6 @@ namespace tuplex { // check what type of input the pipeline has (memory or files) if(tstage->fileInputMode()) { - // TODO(march): deal with file input // files // input is multiple files, use split file strategy here. // and issue tasks to executor workqueue! 
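Context for the scheduling change ahead: when both a top and a bottom limit are set, the next hunk interleaves tasks from the two ends of the input so the partitions that can satisfy the limits run first; with a bottom limit only, the task list is simply reversed. A standalone sketch of that striping order (illustrative names, not the patch's exact code):

    // Sketch only: reorder [t0, t1, ..., tn-1] into [t0, tn-1, t1, tn-2, ...].
    #include <cstddef>
    #include <vector>

    std::vector<int> stripe(const std::vector<int>& tasks) {
        std::vector<int> out;
        out.reserve(tasks.size());
        for (size_t i = 0, j = tasks.size(); i < j; ++i, --j) {
            out.push_back(tasks[i]);                    // next task from the front
            if (i + 1 < j) out.push_back(tasks[j - 1]); // next task from the back
        }
        return out;
    }

For five tasks this produces the order 0, 4, 1, 3, 2.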
@@ -686,9 +685,6 @@ namespace tuplex { task->setStageID(tstage->getID()); task->setOutputTopLimit(tstage->outputTopLimit()); task->setOutputBottomLimit(tstage->outputBottomLimit()); - if (tstage->outputBottomLimit()) { - // TODO(march): work here (task output limit generation) - } tasks.emplace_back(std::move(task)); numInputRows += partition->getNumRows(); @@ -698,6 +694,31 @@ namespace tuplex { } } + // assign the order for all tasks + for(size_t i = 0; i < tasks.size(); ++i) { + tasks[i]->setOrder(i); + } + + if (tstage->hasOutputLimit()) { + if (tstage->outputTopLimit() > 0 && tstage->outputBottomLimit() > 0) { + // do task striping for output limit on both ends + vector newTasks; + for(size_t i = 0; i < tasks.size() - i; i++) { + const size_t rev_i = tasks.size() - 1 - i; + newTasks.push_back(tasks[i]); + if (i < rev_i) { + newTasks.push_back(tasks[rev_i]); + } + } + assert(tasks.size() == newTasks.size()); + tasks.swap(newTasks); + } else if (tstage->outputBottomLimit() > 0) { + // bottom limit only, just reverse the task order + std::reverse(tasks.begin(), tasks.end()); + } + // if top limit only, do nothing since the order is already good + } + return tasks; } @@ -941,8 +962,8 @@ namespace tuplex { } } - // TODO(march): work here (transform stage) auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms); + auto completedTasks = performTasks(tasks); // Note: this doesn't work yet because of the globals. @@ -1175,6 +1196,10 @@ namespace tuplex { rowDelta += taskNonConformingRows.size(); } + if (tstage->hasOutputLimit()) { + trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit()); + } + tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts); break; } @@ -1518,21 +1543,29 @@ namespace tuplex { WorkQueue& wq = LocalEngine::instance().getQueue(); wq.clear(); - // assign the order for all tasks + // check if ord is set, if not issue warning & add + bool orderlessTaskFound = false; for(int i = 0; i < tasks.size(); ++i) { - tasks[i]->setOrder(i); + if(tasks[i]->getOrder().size() == 0) { + tasks[i]->setOrder(i); + orderlessTaskFound = true; + } } - // add all tasks to queue - // TODO(march): add task stage (to do striping) - for(size_t i = 0; i <= tasks.size() - i - 1; i++) { - const size_t revI = tasks.size()- i - 1 - wq.addTask(&tasks[i]); - if (revI > i) { - wq.addTask(&tasks[revI]); - } +#ifndef NDEBUG + if(orderlessTaskFound) { + logger().debug("task without order found, please fix in code."); + } +#endif + + for (int i = 0; i < tasks.size(); i++) { + // take limit only work with uniform order + assert(task.getOrder(0) == i); } + // add all tasks to queue + for(auto& task : tasks) wq.addTask(task); + // clear tasks.clear(); @@ -2083,4 +2116,123 @@ namespace tuplex { Logger::instance().defaultLogger().info("writing output took " + std::to_string(timer.time()) + "s"); tstage->setFileResult(ecounts); } + + void LocalBackend::trimPartitionsToLimit(std::vector &partitions, + size_t topLimit, + size_t bottomLimit, + TransformStage* tstage) { + std::vector limitedPartitions, limitedTailPartitions; + + // check top output limit, adjust partitions if necessary + size_t numTopOutputRows = 0; + Partition* lastTopPart = nullptr; + size_t clippedTop = 0; + for (auto partition : partitions) { + numTopOutputRows += partition->getNumRows(); + lastTopPart = partition; + if (numTopOutputRows >= topLimit) { + // clip last partition & leave loop + clippedTop = topLimit - (numTopOutputRows - 
partition->getNumRows());
+                assert(clippedTop <= partition->getNumRows());
+                break;
+            } else if (partition == partitions.back()) {
+                // last partition: keep all rows, but don't add it to the output set yet to avoid adding it twice
+                clippedTop = partition->getNumRows();
+                break;
+            } else {
+                // put full partition to output set
+                limitedPartitions.push_back(partition);
+            }
+        }
+
+        // check the bottom output limit, adjust partitions if necessary
+        size_t numBottomOutputRows = 0;
+        size_t clippedBottom = 0;
+        for (auto it = partitions.rbegin(); it != partitions.rend(); it++) {
+            auto partition = *it;
+            numBottomOutputRows += partition->getNumRows();
+
+            if (partition == lastTopPart) {
+                // the bottom and the top partitions are overlapping
+                clippedBottom = bottomLimit - (numBottomOutputRows - partition->getNumRows());
+                if (clippedTop + clippedBottom >= partition->getNumRows()) {
+                    // if top and bottom range intersect, use full partitions
+                    clippedTop = partition->getNumRows();
+                    clippedBottom = 0;
+                }
+                break;
+            } else if (numBottomOutputRows >= bottomLimit) {
+                // clip last partition & leave loop
+                auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows());
+                assert(clipped <= partition->getNumRows());
+                Partition newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage);
+                partition->invalidate();
+                parition = newPart;
+                assert(partition->getNumRows() == clipped);
+                if (clipped > 0)
+                    limitedTailPartitions.push_back(partition);
+                break;
+            } else {
+                // put full partition to output set
+                limitedTailPartitions.push_back(partition);
+            }
+        }
+
+        // push the middle partition
+        if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) {
+            assert(clippedTop + clippedBottom <= lastTopPart->getNumRows());
+
+            // split into two partitions when both the top and the bottom ranges land in the same partition
+            Partition* lastBottomPart = nullptr;
+
+            if (clippedBottom != 0) {
+                lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, tstage);
+            }
+
+            lastTopPart->setNumRows(clippedTop);
+
+            limitedPartitions.push_back(lastTopPart);
+
+            if (lastBottomPart != nullptr) {
+                limitedPartitions.push_back(lastBottomPart);
+            }
+        }
+
+        // merge the head and tail partitions
+        partitions.clear()
+        partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end());
+        partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend());
+    }
+
+    Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, int numToSkip, TransformStage* tstage) {
+        if(!numToSkip)
+            return nullptr;
+
+        auto ptr = p_in->lockRaw();
+        auto num_rows = *((int64_t*) ptr);
+        assert(numToSkip < num_rows);
+
+        Partition *p_out = _driver->allocWritablePartition(num_rows - numToSkip + sizeof(int64_t),
+                                                           tstage->outputSchema(), tstage->outputDataSetID(),
+                                                           tstage->context().id());
+
+        ptr += sizeof(int64_t);
+        size_t numBytesToSkip = 0;
+
+        for(unsigned i = 0; i < numToSkip; ++i) {
+            Rows r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip);
+            ptr += r.serializedLength();
+            numBytesToSkip += r.serializedLength();
+        }
+
+        auto ptr_out = p_out->lockRaw();
+        *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip;
+        ptr_out += sizeof(int64_t);
+        memcpy(ptr_out, ptr, p_in->size() - numBytesToSkip);
+        p_out->unlock();
+
+        p_in->unlock();
+
+        return p_out;
+    }
 } // namespace tuplex
\ No newline at end of file
diff --git a/tuplex/core/src/physical/PhysicalPlan.cc
b/tuplex/core/src/physical/PhysicalPlan.cc
index 9c22837ad..ff67e4add 100644
--- a/tuplex/core/src/physical/PhysicalPlan.cc
+++ b/tuplex/core/src/physical/PhysicalPlan.cc
@@ -240,7 +240,7 @@ namespace tuplex {
         // user wants to merge exceptions in order.
         bool updateInputExceptions = hasFilter && hasInputExceptions && _context.getOptions().OPT_MERGE_EXCEPTIONS_INORDER();
 
-        // create trafostage via builder pattern
+        // create transform stage via builder pattern
         auto builder = codegen::StageBuilder(_num_stages++,
                                              isRootStage,
                                              _context.getOptions().UNDEFINED_BEHAVIOR_FOR_OPERATORS(),
diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc
index 5e15867f7..e31e78cec 100644
--- a/tuplex/core/src/physical/ResultSet.cc
+++ b/tuplex/core/src/physical/ResultSet.cc
@@ -183,7 +183,6 @@ namespace tuplex {
     }
 
     Row ResultSet::getNextRow() {
-        // TODO(march): logic in skip row count here
         // merge rows from objects
         if(!_pyobjects.empty()) {
            auto row_number = std::get<0>(_pyobjects.front());
diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc
index af58866dc..6e216ac5c 100644
--- a/tuplex/core/src/physical/TransformStage.cc
+++ b/tuplex/core/src/physical/TransformStage.cc
@@ -130,99 +130,23 @@ namespace tuplex {
         if (partitions.empty() && interpreterRows.empty() && generalCase.empty())
             _rs = emptyResultSet();
         else {
-            std::vector limitedPartitions, limitedTailPartitions;
             auto schema = Schema::UNKNOWN;
 
             if(!partitions.empty()) {
+                size_t totalRowsCount = 0;
                 schema = partitions.front()->schema();
                 for (auto partition : partitions) {
                     assert(schema == partition->schema());
+                    totalRowsCount += partition->getNumRows();
                 }
 
-                // check top output limit, adjust partitions if necessary
-                size_t numTopOutputRows = 0;
-                Partition* lastTopPart = nullptr;
-                size_t clippedTop = 0;
-                for (auto partition : partitions) {
-                    numTopOutputRows += partition->getNumRows();
-                    lastTopPart = partition;
-                    if (numTopOutputRows >= outputTopLimit()) {
-                        // clip last partition & leave loop
-                        clippedTop = outputTopLimit() - (numTopOutputRows - partition->getNumRows());
-                        assert(clippedTop <= partition->getNumRows());
-                        break;
-                    } else if (partition == *partitions.end()) {
-                        // last partition, mark full row, but don't put to output set yet to avoid double put
-                        clippedTop = partition->getNumRows();
-                        break;
-                    } else {
-                        // put full
partition to output set - limitedTailPartitions.push_back(partition); - } - } - - // push the middle partition - if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) { - assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); - - // TODO(march): to work on this (split into two partitions) - // split into two partitions with both top and bottom are in the same partition - Partition* lastBottomPart = nullptr; - if (clippedBottom != 0) { - lastBottomPart = new Partition(lastTopPart); - lastBottomPart->setNumSkip(lastBottomPart->getNumRows() - clippedBottom); - lastBottomPart->setNumRows(clippedBottom); - } - - lastTopPart->setNumRows(clippedTop); - - limitedPartitions.push_back(lastTopPart); - - if (lastBottomPart != nullptr) { - limitedPartitions.push_back(lastBottomPart); - } + if (hasOutputLimit()) { + assert(totalRowsCount == _outputTopLimit + _outputBottomLimit); } - - // merge the head and tail partitions - std::reverse(limitedTailPartitions.begin(), limitedTailPartitions.end()); - limitedPartitions.insert(limitedPartitions.end(), limitedTailPartitions.begin(), limitedTailPartitions.end()); } // put ALL partitions to result set - // TODO(march): handle overlapping case - // TODO(march): maybe do top/bottom limit at the level instead? - _rs = std::make_shared(schema, limitedPartitions, + _rs = std::make_shared(schema, partitions, generalCase, partitionToExceptionsMap, interpreterRows, outputTopLimit() + outputBottomLimit()); } diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index a65aa7f11..49d104bcc 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -22,9 +22,14 @@ namespace tuplex { static std::atomic_int64_t g_totalTopOutputRows; static std::atomic_int64_t g_totalBottomOutputRows; + // mapping from order number -> row count if the task is finished + static std::mutex g_rowsDoneMutex; + static std::map g_rowsDone; + void TransformTask::resetOutputLimitCounter() { g_totalTopOutputRows = 0; g_totalBottomOutputRows = 0; + g_rowsDone.clear(); } } @@ -42,23 +47,9 @@ extern "C" { } static int64_t limited_w2mCallback(tuplex::TransformTask* task, uint8_t* buf, int64_t bufSize) { - // i.e. check here how many output rows, if already limit reached - jump to goto! - // TODO(march): comment this out - if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { - return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); - } - assert(task); assert(dynamic_cast(task)); - auto rc = task->writeRowToMemory(buf, bufSize); - if(0 == rc) - tuplex::g_totalTopOutputRows++; - - // i.e. check here how many output rows, if already limit reached - jump to goto! - if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { - return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); - } - return rc; + return task->writeRowToMemory(buf, bufSize); } static int64_t limited_w2fCallback(tuplex::TransformTask* task, uint8_t* buf, int64_t bufSize) { @@ -623,9 +614,36 @@ namespace tuplex { auto functor = reinterpret_cast(_functor); - // TODO(march): question here? // go over all input partitions. 
for(const auto &inputPartition : _inputPartitions) {
+            size_t numTopCompleted = 0;
+            size_t numBottomCompleted = 0;
+            bool isTopLimitReached = false;
+            bool isBottomLimitReached = false;
+
+            tuplex::g_rowsDoneMutex.lock();
+            for (size_t i = 0; tuplex::g_rowsDone.count(i) != 0; i++) {
+                numTopCompleted += tuplex::g_rowsDone[i];
+                if (numTopCompleted >= _outTopLimit) {
+                    isTopLimitReached = true;
+                    break;
+                }
+            }
+            // TODO: what is the max task number here
+            for (size_t i = 100; tuplex::g_rowsDone.count(i) != 0; i--) {
+                numBottomCompleted += tuplex::g_rowsDone[i];
+                if (numBottomCompleted >= _outBottomLimit) {
+                    isBottomLimitReached = true;
+                    break;
+                }
+            }
+            tuplex::g_rowsDoneMutex.unlock();
+
+            if (isTopLimitReached && isBottomLimitReached) {
+                // skip the execution, enough is done
+                break;
+            }
+
             // lock ptr, extract number of rows ==> store them
             // lock raw & call functor!
             int64_t inSize = inputPartition->size();
@@ -647,6 +665,10 @@
             // delete partition if desired...
             if(_invalidateSourceAfterUse)
                 inputPartition->invalidate();
+
+            tuplex::g_rowsDoneMutex.lock();
+            tuplex::g_rowsDone[getOrder(0)] += getNumOutputRows();
+            tuplex::g_rowsDoneMutex.unlock();
         }
 
 #ifndef NDEBUG
diff --git a/tuplex/utils/include/mt/ITask.h b/tuplex/utils/include/mt/ITask.h
index 01f7137f1..a5ca4058f 100644
--- a/tuplex/utils/include/mt/ITask.h
+++ b/tuplex/utils/include/mt/ITask.h
@@ -21,52 +21,65 @@
 namespace tuplex {
 
+/*!
+ * interface for defining tasks that can be run via a threadpool
+ */
+class ITask {
+private:
+    std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread.
+//! Per default object is constructed that does not represent a thread
+
+    std::vector _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages
+
+public:
+    ITask() {};
+    ITask(const ITask& other) : _id(other._id), _orderNumbers(other._orderNumbers) {}
+    virtual ~ITask() = default;
+    ITask(ITask&& other) = default;
+    ITask& operator = (ITask&& other) = default;
+
+    /*!
+ * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones + * @param other + * @return + */ + bool compareAscOrder(const ITask& other) const { + // make sure they have the same length + assert(_orderNumbers.size() == other._orderNumbers.size()); - /*! - * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones - * @param other - * @return - */ - bool compareAscOrder(const ITask& other) const { - return _orderNumbers[i] < other._orderNumbers[i]; + // this < other? + // compare one by one + for(int i = 0; i < other._orderNumbers.size(); ++i) { + if(_orderNumbers[i] >= other._orderNumbers[i]) + return false; } - }; + return true; + } +}; } #endif //TUPLEX_ITASK_H \ No newline at end of file From 3bf283fb003dfa54bc82396bc750a65464969c55 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 25 Mar 2022 00:16:11 -0400 Subject: [PATCH 11/56] Address Review Comments --- tuplex/core/include/DataSet.h | 5 +- tuplex/core/include/EmptyDataset.h | 4 +- tuplex/core/include/ErrorDataSet.h | 4 +- tuplex/core/include/Executor.h | 10 +- tuplex/core/include/ee/local/LocalBackend.h | 2 +- tuplex/core/include/logical/TakeOperator.h | 12 +-- tuplex/core/include/physical/ResultSet.h | 2 - tuplex/core/include/physical/StageBuilder.h | 5 +- tuplex/core/include/physical/TransformStage.h | 8 +- tuplex/core/src/DataSet.cc | 12 +-- tuplex/core/src/EmptyDataset.cc | 6 +- tuplex/core/src/ErrorDataSet.cc | 6 +- tuplex/core/src/Executor.cc | 4 - tuplex/core/src/ee/local/LocalBackend.cc | 17 ++-- tuplex/core/src/logical/TakeOperator.cc | 6 +- tuplex/core/src/physical/PhysicalPlan.cc | 2 +- tuplex/core/src/physical/ResultSet.cc | 2 +- tuplex/core/src/physical/StageBuilder.cc | 2 +- tuplex/python/src/PythonDataSet.cc | 2 +- tuplex/python/tuplex/dataset.py | 15 +-- tuplex/utils/include/mt/ITask.h | 94 +++++++++---------- 21 files changed, 102 insertions(+), 118 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 65a766a87..f6bb97f2c 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -263,13 +263,12 @@ namespace tuplex { // these are actions that cause execution virtual std::shared_ptr collect(std::ostream &os = std::cout); - virtual std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream &os = std::cout); + virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); virtual std::vector collectAsVector(std::ostream &os = std::cout); - virtual std::vector takeAsVector(int64_t numElements, std::ostream &os = std::cout); + virtual std::vector takeAsVector(size_t numElements, std::ostream &os = std::cout); - /*! * saves dataset to file. There are multiple options to control the behavior * ==> 1.) files can be split across multiple ones. 
Specify number of files to split rows to diff --git a/tuplex/core/include/EmptyDataset.h b/tuplex/core/include/EmptyDataset.h index 0f8a1f52c..6fc3219a4 100644 --- a/tuplex/core/include/EmptyDataset.h +++ b/tuplex/core/include/EmptyDataset.h @@ -70,13 +70,13 @@ namespace tuplex { virtual std::shared_ptr collect(std::ostream& os) override; // take / collect will print out the error only - virtual std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream& os) override; + virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream& os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; virtual std::vector collectAsVector(std::ostream& os) override; // take / collect will print out the error only - virtual std::vector takeAsVector(int64_t numElements, std::ostream& os) override; + virtual std::vector takeAsVector(size_t numElements, std::ostream& os) override; DataSet& cache(const Schema::MemoryLayout& memoryLayout, bool storeSpecialized) override { return *this; diff --git a/tuplex/core/include/ErrorDataSet.h b/tuplex/core/include/ErrorDataSet.h index 34fc60685..cf283ebd1 100644 --- a/tuplex/core/include/ErrorDataSet.h +++ b/tuplex/core/include/ErrorDataSet.h @@ -90,13 +90,13 @@ namespace tuplex { std::shared_ptr collect(std::ostream& os) override; // take / collect will print out the error only - std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream& os) override; + std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream& os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; std::vector collectAsVector(std::ostream& os) override; // take / collect will print out the error only - std::vector takeAsVector(int64_t numElements, std::ostream& os) override; + std::vector takeAsVector(size_t numElements, std::ostream& os) override; }; } diff --git a/tuplex/core/include/Executor.h b/tuplex/core/include/Executor.h index 7eaaee244..b6b7edac1 100644 --- a/tuplex/core/include/Executor.h +++ b/tuplex/core/include/Executor.h @@ -44,7 +44,7 @@ namespace tuplex { */ class WorkQueue { private: - std::atomic_bool _done{}; // protects against data races + std::atomic_bool _done; // protects against data races ExecutorTaskQueueType _queue; std::mutex _completedTasksMutex; std::vector _completedTasks; @@ -75,14 +75,6 @@ namespace tuplex { size_t numCompletedTasks() const { return _numCompletedTasks; } - size_t frontRowsLimit() const { - return _frontRowsLimit; - }; - - size_t bottomRowsLimit() const { - return _bottomRowsLimit; - }; - /*! * stop working on this queue & dump all tasks */ diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index 0dbfafdc9..d7a5ec25b 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -89,7 +89,7 @@ namespace tuplex { MessageHandler& logger() const { return Logger::instance().logger("local ee"); } void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, TransformStage* tstage); - Partition* newPartitionWithSkipRows(Partition* p_in, int numToSkip, TransformStage* tstage); + Partition* newPartitionWithSkipRows(Partition* p_in, size_t numToSkip, TransformStage* tstage); // write output (may be already in correct format!) 
void writeOutput(TransformStage* tstage, std::vector& sortedTasks);
diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h
index b5dd5db6e..b7c4892dc 100644
--- a/tuplex/core/include/logical/TakeOperator.h
+++ b/tuplex/core/include/logical/TakeOperator.h
@@ -17,16 +17,16 @@ namespace tuplex {
     class TakeOperator : public LogicalOperator {
     private:
-        int64_t _limitTop;
-        int64_t _limitBottom;
+        size_t _topLimit;
+        size_t _bottomLimit;
     public:
         LogicalOperator *clone() override;
     public:
-        TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom);
+        TakeOperator(LogicalOperator *parent, size_t topLimit, size_t bottomLimit);
 
         std::string name() override {
-            if(_limitTop < 0 || std::numeric_limits::max() == _limitTop)
+            if(_topLimit == 0 && _bottomLimit == 0)
                 return "collect";
             return "take";
         }
@@ -38,9 +38,9 @@ namespace tuplex {
 
         bool good() const override;
 
-        int64_t limit() { return _limitTop; }
+        size_t topLimit() const { return _topLimit; }
 
-        int64_t limitBottom() { return _limitBottom; }
+        size_t bottomLimit() const { return _bottomLimit; }
 
         std::vector getSample(const size_t num) const override;
 
diff --git a/tuplex/core/include/physical/ResultSet.h b/tuplex/core/include/physical/ResultSet.h
index 5e69fef3a..e94b8f1ae 100644
--- a/tuplex/core/include/physical/ResultSet.h
+++ b/tuplex/core/include/physical/ResultSet.h
@@ -36,8 +36,6 @@ namespace tuplex {
         size_t _rowsRetrieved;
         size_t _totalRowCounter; // used for merging in rows!
         size_t _maxRows;
-        size_t _maxRowsTop;
-        size_t _maxRowsBottom;
         Schema _schema;
 
         void removeFirstPartition();
diff --git a/tuplex/core/include/physical/StageBuilder.h b/tuplex/core/include/physical/StageBuilder.h
index e678ead3d..83e63208a 100644
--- a/tuplex/core/include/physical/StageBuilder.h
+++ b/tuplex/core/include/physical/StageBuilder.h
@@ -76,7 +76,7 @@ namespace tuplex {
             void addFileInput(FileInputOperator* csvop);
             void addFileOutput(FileOutputOperator* fop);
 
-            inline void setOutputLimit(size_t topLimit, size_t bottomLimit) {
+            inline void setOutputLimit(size_t topLimit, size_t bottomLimit = 0) {
                 _outputTopLimit = topLimit;
                 _outputBottomLimit = bottomLimit;
             }
@@ -158,8 +158,9 @@ namespace tuplex {
             size_t number() const { return _stageNumber; }
             int64_t outputDataSetID() const;
 
+            // default case: both _outputTopLimit and _outputBottomLimit are zero = take everything
             inline bool hasOutputLimit() const {
-                return _outputTopLimit < std::numeric_limits::max() || _outputBottomLimit > 0;
+                return _outputTopLimit != 0 || _outputBottomLimit != 0;
             }
 
             inline char csvOutputDelimiter() const {
diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h
index f489f1f6c..f4efeebeb 100644
--- a/tuplex/core/include/physical/TransformStage.h
+++ b/tuplex/core/include/physical/TransformStage.h
@@ -393,6 +393,10 @@ namespace tuplex {
          */
         void setDataAggregationMode(const AggregateType& t) { _aggMode = t; }
 
+        // default case: both _outputTopLimit and _outputBottomLimit are zero = take everything
+        bool hasOutputLimit() const {
+            return _outputTopLimit > 0 || _outputBottomLimit > 0;
+        }
     private:
 
         /*!
* creates a new TransformStage with generated code @@ -481,10 +485,6 @@ namespace tuplex { // for hash output, the key and bucket type python::Type _hashOutputKeyType; python::Type _hashOutputBucketType; - - bool hasOutputLimit() const { - return _outputTopLimit < std::numeric_limits::max() && _outputBottomLimit != 0; - } }; } #endif //TUPLEX_TRANSFORMSTAGE_H \ No newline at end of file diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index 3de903d1c..c11482f86 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -38,21 +38,17 @@ namespace tuplex { } std::shared_ptr DataSet::collect(std::ostream &os) { - return take(-1, false, os); + return take(0, 0, os); } - std::shared_ptr DataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) { + std::shared_ptr DataSet::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { // error dataset? if (isError()) throw std::runtime_error("is error dataset!"); - // negative numbers mean get all elements! - if (numTop < 0) - numTop = std::numeric_limits::max(); - // create a take node assert(_context); - LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, numTop, numBottom)); + LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, topLimit, bottomLimit)); DataSet *dsptr = _context->createDataSet(op->getOutputSchema()); dsptr->_operator = op; op->setDataSet(dsptr); @@ -70,7 +66,7 @@ namespace tuplex { } // -1 means to retrieve all elements - std::vector DataSet::takeAsVector(int64_t numElements, std::ostream &os) { + std::vector DataSet::takeAsVector(size_t numElements, std::ostream &os) { auto rs = take(numElements, false, os); Timer timer; diff --git a/tuplex/core/src/EmptyDataset.cc b/tuplex/core/src/EmptyDataset.cc index 7504e8499..3664a591a 100644 --- a/tuplex/core/src/EmptyDataset.cc +++ b/tuplex/core/src/EmptyDataset.cc @@ -11,16 +11,16 @@ #include namespace tuplex { - std::shared_ptr EmptyDataset::take(int64_t numTop, int64_t numBottom, std::ostream &os) { + std::shared_ptr EmptyDataset::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { return std::make_shared(); } - std::vector EmptyDataset::takeAsVector(int64_t numElements, std::ostream &os) { + std::vector EmptyDataset::takeAsVector(size_t numElements, std::ostream &os) { return std::vector{}; } std::shared_ptr EmptyDataset::collect(std::ostream &os) { - return take(0, false, os); + return take(0, 0, os); } std::vector EmptyDataset::collectAsVector(std::ostream &os) { diff --git a/tuplex/core/src/ErrorDataSet.cc b/tuplex/core/src/ErrorDataSet.cc index 9d19594f2..c87999e5f 100644 --- a/tuplex/core/src/ErrorDataSet.cc +++ b/tuplex/core/src/ErrorDataSet.cc @@ -12,7 +12,7 @@ namespace tuplex { - std::vector ErrorDataSet::takeAsVector(int64_t numElements, std::ostream &os) { + std::vector ErrorDataSet::takeAsVector(size_t numElements, std::ostream &os) { // return empty vector and print err message Logger::instance().logger("core").error(this->_error); @@ -23,7 +23,7 @@ namespace tuplex { return takeAsVector(0, os); } - std::shared_ptr ErrorDataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) { + std::shared_ptr ErrorDataSet::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { // return empty vector and print err message Logger::instance().logger("core").error(this->_error); @@ -31,7 +31,7 @@ namespace tuplex { } std::shared_ptr ErrorDataSet::collect(std::ostream &os) { - return take(0, false, os); + return take(0, 0, os); } void diff --git 
a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc
index 388199e4d..acfdd0aa6 100644
--- a/tuplex/core/src/Executor.cc
+++ b/tuplex/core/src/Executor.cc
@@ -33,10 +33,6 @@ namespace tuplex {
 
     std::vector WorkQueue::popCompletedTasks() {
         TRACE_LOCK("workQueue");
-        _taskDoneMutex.lock();
-        _taskDone.clear();
-        _taskDoneMutex.unlock();
-
        std::lock_guard lock(_completedTasksMutex);
        // move leads to circular dependency in gcc and thus a bug on travis-ci. Therefore, just
        // use the below hack to fool the compiler into actually copying the vectors
diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc
index dbceaa1b9..e477b653b 100644
--- a/tuplex/core/src/ee/local/LocalBackend.cc
+++ b/tuplex/core/src/ee/local/LocalBackend.cc
@@ -551,6 +551,7 @@ namespace tuplex {
                     task->sinkExceptionsToMemory(inputSchema);
                     task->setStageID(tstage->getID());
                     task->setOutputTopLimit(tstage->outputTopLimit());
+                    task->setOutputBottomLimit(tstage->outputBottomLimit());
                     // add to tasks
                     tasks.emplace_back(std::move(task));
                 } else {
@@ -1197,7 +1198,7 @@
                 }
 
                 if (tstage->hasOutputLimit()) {
-                    trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit());
+                    trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage);
                 }
 
                 tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts);
                 break;
             }
@@ -1560,7 +1561,7 @@
 
         for (int i = 0; i < tasks.size(); i++) {
             // take limit only work with uniform order
-            assert(task.getOrder(0) == i);
+            assert(tasks[i]->getOrder(0) == i);
         }
 
         // add all tasks to queue
@@ -2165,9 +2166,9 @@
                 // clip last partition & leave loop
                 auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows());
                 assert(clipped <= partition->getNumRows());
-                Partition newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage);
-                partition->invalidate();
-                parition = newPart;
+                Partition* newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage);
+                partition->invalidate();
+                partition = newPart;
                 assert(partition->getNumRows() == clipped);
                 if (clipped > 0)
                     limitedTailPartitions.push_back(partition);
@@ -2199,12 +2200,12 @@
         }
 
         // merge the head and tail partitions
-        partitions.clear()
+        partitions.clear();
         partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end());
         partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend());
     }
 
-    Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, int numToSkip, TransformStage* tstage) {
+    Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage* tstage) {
         if(!numToSkip)
             return nullptr;
 
@@ -2220,7 +2221,7 @@
         size_t numBytesToSkip = 0;
 
         for(unsigned i = 0; i < numToSkip; ++i) {
-            Rows r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip);
+            Row r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip);
             ptr += r.serializedLength();
             numBytesToSkip += r.serializedLength();
         }
@@ -2228,7 +2229,7 @@
         auto ptr_out = p_out->lockRaw();
         *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip;
         ptr_out += sizeof(int64_t);
-        memcpy(ptr_out, ptr, p_in->size() - numBytesToSkip);
+        memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip);
         p_out->unlock();
 
         p_in->unlock();
diff --git a/tuplex/core/src/logical/TakeOperator.cc
b/tuplex/core/src/logical/TakeOperator.cc index e588b5e97..49a4452b4 100644 --- a/tuplex/core/src/logical/TakeOperator.cc +++ b/tuplex/core/src/logical/TakeOperator.cc @@ -12,13 +12,13 @@ #include namespace tuplex { - TakeOperator::TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom) : LogicalOperator::LogicalOperator(parent), _limitTop(numTop), _limitBottom(numBottom) { + TakeOperator::TakeOperator(LogicalOperator *parent, size_t topLimit, size_t bottomLimit) : LogicalOperator::LogicalOperator(parent), _topLimit(topLimit), _bottomLimit(bottomLimit) { // take schema from parent node setSchema(this->parent()->getOutputSchema()); } bool TakeOperator::good() const { - return _limitTop >= -1 && _limitBottom >= -1; + return _topLimit >= 0 && _bottomLimit >= 0; } std::vector TakeOperator::getSample(const size_t num) const { @@ -33,7 +33,7 @@ namespace tuplex { LogicalOperator *TakeOperator::clone() { // create clone of this operator - auto copy = new TakeOperator(parent()->clone(), _limitTop, _limitBottom); + auto copy = new TakeOperator(parent()->clone(), _topLimit, _bottomLimit); copy->setDataSet(getDataSet()); // weak ptr to old dataset... copy->copyMembers(this); diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index ff67e4add..f289064d5 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -382,7 +382,7 @@ namespace tuplex { // set limit if output node has a limit (currently only TakeOperator) if(outputNode->type() == LogicalOperatorType::TAKE) { auto top = static_cast(outputNode); - builder.setOutputLimit(top->limit(), top->limitBottom()); + builder.setOutputLimit(top->topLimit(), top->bottomLimit()); } // @TODO: add slowPip builder to this process... diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index e31e78cec..0f7bf7319 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -98,7 +98,7 @@ namespace tuplex { Partition *first = _partitions.front(); assert(_schema == first->schema()); - auto numRows = first->getNumRows() - first->getNumSkip(); + auto numRows = first->getNumRows(); _rowsRetrieved += numRows; _partitions.pop_front(); diff --git a/tuplex/core/src/physical/StageBuilder.cc b/tuplex/core/src/physical/StageBuilder.cc index bc814182b..78bc8dea4 100644 --- a/tuplex/core/src/physical/StageBuilder.cc +++ b/tuplex/core/src/physical/StageBuilder.cc @@ -458,7 +458,7 @@ namespace tuplex { } case LogicalOperatorType::TAKE: { auto takeOp = dynamic_cast(node); - opt_ops.push_back(new TakeOperator(lastParent, takeOp->limit(), takeOp->limitBottom())); + opt_ops.push_back(new TakeOperator(lastParent, takeOp->topLimit(), takeOp->bottomLimit())); opt_ops.back()->setID(node->getID()); break; } diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 853b910db..66f94e33f 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -130,7 +130,7 @@ namespace tuplex { std::shared_ptr rs; std::string err_message = ""; try { - rs = _dataset->take(numRows, ss); + rs = _dataset->take(numTop, numBottom, ss); if(!rs) throw std::runtime_error("invalid result set"); // if there are more than 1 million (100k in debug mode) elements print message... 
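A minimal usage sketch of the two-sided take that these call-site changes wire up: it assumes a tuplex::Context built from test options as in TakeTest.cc, and the header path and function name are illustrative, not part of the tree.

// --- editorial sketch, not part of the patch -------------------------------
// take(top, bottom) keeps the first `top` and the last `bottom` rows in
// their original order; at this point in the series, collect() routes to
// take(0, 0), which is treated as "no limit".
#include "Context.h"  // assumed project header providing tuplex::Context, Row

void takeSemanticsExample(tuplex::Context &context) {
    using tuplex::Row;
    // yields rows 1, 2 (top) and 6 (bottom), cf. takeBothTest further down
    auto rs = context.parallelize({Row(1), Row(2), Row(3),
                                   Row(4), Row(5), Row(6)}).take(2, 1);
    // yields all six rows
    auto all = context.parallelize({Row(1), Row(2), Row(3),
                                    Row(4), Row(5), Row(6)}).collect();
}
// ----------------------------------------------------------------------------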
diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 1046505f2..850b4ed83 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -191,24 +191,25 @@ def collect(self): assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' return self._dataSet.collect() - def take(self, nrows=5, nbottom=0): + def take(self, limitTop=5, limitBottom=0): """ action that generates a physical plan, processes data and collects the top results then as list of tuples. Args: - nrows (int): number of rows to collect. Per default ``5``. + limitTop (int): number of top rows to collect. Per default ``5``. + limitBottom (int): number of bottom rows to collect. Per default ``0``. Returns: (list): A list of tuples """ - assert isinstance(nrows, int), 'num rows must be an integer' - assert nrows > 0, 'please specify a number greater than zero' - assert isinstance(nbottom, int), 'num bottom last must be an integer' - assert nbottom >= 0, 'please specify a number greater or equal to zero' + assert isinstance(limitTop, int), 'num rows must be an integer' + assert limitTop > 0, 'please specify a number greater than zero' + assert isinstance(limitBottom, int), 'num bottom last must be an integer' + assert limitBottom >= 0, 'please specify a number greater or equal to zero' assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - return self._dataSet.take(nrows, nbottom) + return self._dataSet.take(limitTop, limitBottom) def show(self, nrows=None): """ action that generates a physical plan, processes data and prints results as nicely formatted diff --git a/tuplex/utils/include/mt/ITask.h b/tuplex/utils/include/mt/ITask.h index a5ca4058f..6c85d2d36 100644 --- a/tuplex/utils/include/mt/ITask.h +++ b/tuplex/utils/include/mt/ITask.h @@ -21,65 +21,65 @@ namespace tuplex { -/*! - * interface for defining tasks that can be run via a threadpool - */ -class ITask { -private: - std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread. + /*! + * interface for defining tasks that can be run via a threadpool + */ + class ITask { + private: + std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread. //! Per default object is constructed that does not represent a thread - std::vector _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages + std::vector _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages -public: - ITask() {}; - ITask(const ITask& other) : _id(other._id), _orderNumbers(other._orderNumbers) {} - virtual ~ITask() = default; - ITask(ITask&& other) = default; - ITask& operator = (ITask&& other) = default; + public: + ITask() {}; + ITask(const ITask& other) : _id(other._id), _orderNumbers(other._orderNumbers) {} + virtual ~ITask() = default; + ITask(ITask&& other) = default; + ITask& operator = (ITask&& other) = default; - /*! - * interface to run a task - */ - virtual void execute() = 0; + /*! 
+ * interface to run a task + */ + virtual void execute() = 0; - std::thread::id getID() { - return _id; - } + std::thread::id getID() { + return _id; + } - void setID(const std::thread::id& id) { - _id = id; - } + void setID(const std::thread::id& id) { + _id = id; + } - void setOrder(size_t order) { _orderNumbers = std::vector{order}; } + void setOrder(size_t order) { _orderNumbers = std::vector{order}; } - size_t getOrder(const size_t nth) const { - return _orderNumbers[nth]; - } + size_t getOrder(size_t nth) const { + return _orderNumbers[nth]; + } - std::vector getOrder() const { return _orderNumbers; } + std::vector getOrder() const { return _orderNumbers; } - void setOrder(const std::vector& order) { - _orderNumbers = order; - } + void setOrder(const std::vector& order) { + _orderNumbers = order; + } - /*! - * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones - * @param other - * @return - */ - bool compareAscOrder(const ITask& other) const { - // make sure they have the same length - assert(_orderNumbers.size() == other._orderNumbers.size()); + /*! + * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones + * @param other + * @return + */ + bool compareAscOrder(const ITask& other) const { + // make sure they have the same length + assert(_orderNumbers.size() == other._orderNumbers.size()); - // this < other? - // compare one by one - for(int i = 0; i < other._orderNumbers.size(); ++i) { - if(_orderNumbers[i] >= other._orderNumbers[i]) - return false; + // this < other? + // compare one by one + for(int i = 0; i < other._orderNumbers.size(); ++i) { + if(_orderNumbers[i] >= other._orderNumbers[i]) + return false; + } + return true; } - return true; - } -}; + }; } #endif //TUPLEX_ITASK_H \ No newline at end of file From fb90aefd563e1469f97b3bce6c75204df3aff861 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 25 Mar 2022 00:23:41 -0400 Subject: [PATCH 12/56] Address Review Comments (2) --- tuplex/core/include/logical/TakeOperator.h | 2 +- tuplex/core/include/physical/TransformStage.h | 2 +- tuplex/core/src/ee/local/LocalBackend.cc | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h index b7c4892dc..fe5f1b0f2 100644 --- a/tuplex/core/include/logical/TakeOperator.h +++ b/tuplex/core/include/logical/TakeOperator.h @@ -26,7 +26,7 @@ namespace tuplex { TakeOperator(LogicalOperator *parent, size_t topLimit, size_t bottomLimit); std::string name() override { - if(_topLimit < 0 || std::numeric_limits::max() == _topLimit) + if(_topLimit == 0 && _bottomLimit == 0) return "collect"; return "take"; } diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index f4efeebeb..05c7df448 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -395,7 +395,7 @@ namespace tuplex { // default case: both _outputTopLimit and _outputBottomLimit is zero = take everything bool hasOutputLimit() const { - return _outputTopLimit > 0 || _outputBottomLimit > 0; + return _outputTopLimit != 0 || _outputBottomLimit != 0; } private: /*! 
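The ITask block above is only re-indented, so it is easy to miss what compareAscOrder actually computes. A standalone restatement as a free function (illustrative only, not code in the tree):

// --- editorial sketch, not part of the patch -------------------------------
// Mirrors ITask::compareAscOrder: every component must be strictly smaller,
// so this is NOT a lexicographic order; a pair can compare false both ways.
#include <cassert>
#include <cstddef>
#include <vector>

bool compareAscOrder(const std::vector<size_t> &a, const std::vector<size_t> &b) {
    assert(a.size() == b.size());
    for (size_t i = 0; i < b.size(); ++i)
        if (a[i] >= b[i])
            return false;  // any tie or inversion rejects the pair
    return true;
}

// compareAscOrder({0, 1}, {1, 2}) -> true
// compareAscOrder({0, 3}, {1, 2}) -> false, and so does the reversed call
// ----------------------------------------------------------------------------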
diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index e477b653b..022d5a036 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -586,6 +586,7 @@ namespace tuplex { task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); task->setOutputTopLimit(tstage->outputTopLimit()); + task->setOutputBottomLimit(tstage->outputBottomLimit()); // add to tasks tasks.emplace_back(std::move(task)); num_parts++; @@ -623,6 +624,7 @@ namespace tuplex { task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); task->setOutputTopLimit(tstage->outputTopLimit()); + task->setOutputBottomLimit(tstage->outputBottomLimit()); // add to tasks tasks.emplace_back(std::move(task)); From cb4031325e7c8a3ad70c3a720cca545abf7c1e4d Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 25 Mar 2022 00:52:36 -0400 Subject: [PATCH 13/56] Debugging Tests --- tuplex/core/src/ee/local/LocalBackend.cc | 7 ++++++- tuplex/core/src/physical/ResultSet.cc | 3 +-- tuplex/test/core/TakeTest.cc | 4 +++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 022d5a036..d51ef4523 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -884,7 +884,12 @@ namespace tuplex { // special case: skip stage, i.e. empty code and mem2mem if(tstage->code().empty() && !tstage->fileInputMode() && !tstage->fileOutputMode()) { auto pyObjects = inputExceptionsToPythonObjects(tstage->inputExceptions(), tstage->normalCaseInputSchema()); - tstage->setMemoryResult(tstage->inputPartitions(), std::vector{}, std::unordered_map(), pyObjects); + + auto output_par = tstage->inputPartitions(); + if (tstage->hasOutputLimit()) { + trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage); + } + tstage->setMemoryResult(output_par, std::vector{}, std::unordered_map(), pyObjects); pyObjects.clear(); // skip stage Logger::instance().defaultLogger().info("[Transform Stage] skipped stage " + std::to_string(tstage->number()) + " because there is nothing todo here."); diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index 0f7bf7319..bfd656dc8 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -138,8 +138,7 @@ namespace tuplex { auto num_rows = first->getNumRows(); // how many left to retrieve? auto num_to_retrieve_from_partition = std::min(limit - i, num_rows - _curRowCounter); - if(num_to_retrieve_from_partition <= 0) - break; + assert(num_to_retrieve_from_partition >= 0); // make sure partition schema matches stored schema assert(_schema == first->schema()); diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 08b648f34..2d8f81f2f 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -122,4 +122,6 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v3[2].getString(0), "!"); EXPECT_EQ(v3[3].getString(0), "! :)"); EXPECT_EQ(v3[4].getString(0), "!"); -} \ No newline at end of file +} + +// TODO(march): test empty code when reusing partitions. 
This might not work if user reused dataset \ No newline at end of file From 517f2fcb3730662a1fa2c5abd181539ba87053e7 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 6 Apr 2022 23:55:59 -0400 Subject: [PATCH 14/56] Change definition of take all --- tuplex/core/include/DataSet.h | 2 +- tuplex/core/include/logical/TakeOperator.h | 2 +- tuplex/core/include/physical/StageBuilder.h | 3 +-- tuplex/core/include/physical/TransformStage.h | 2 +- tuplex/core/include/physical/TransformTask.h | 2 -- tuplex/core/src/DataSet.cc | 14 +++++++------- tuplex/core/src/physical/TransformTask.cc | 3 +-- tuplex/python/include/PythonDataSet.h | 2 +- tuplex/python/src/PythonDataSet.cc | 17 ++++++++++++++--- 9 files changed, 27 insertions(+), 20 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index f6bb97f2c..86ca23b6a 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -128,7 +128,7 @@ namespace tuplex { * @param numRows how many rows to print, i.e. top numRows are printed.xs * @param os ostream where to print table to */ - virtual void show(const int64_t numRows = -1, std::ostream &os = std::cout); + virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); // named dataset management functions /*! diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h index fe5f1b0f2..f3841236b 100644 --- a/tuplex/core/include/logical/TakeOperator.h +++ b/tuplex/core/include/logical/TakeOperator.h @@ -26,7 +26,7 @@ namespace tuplex { TakeOperator(LogicalOperator *parent, size_t topLimit, size_t bottomLimit); std::string name() override { - if(_topLimit == 0 && _bottomLimit == 0) + if(_topLimit == std::numeric_limits::max() || _bottomLimit == std::numeric_limits::max()) return "collect"; return "take"; } diff --git a/tuplex/core/include/physical/StageBuilder.h b/tuplex/core/include/physical/StageBuilder.h index 83e63208a..1c322b9a6 100644 --- a/tuplex/core/include/physical/StageBuilder.h +++ b/tuplex/core/include/physical/StageBuilder.h @@ -158,9 +158,8 @@ namespace tuplex { size_t number() const { return _stageNumber; } int64_t outputDataSetID() const; - // default case: both _outputTopLimit and _outputBottomLimit is zero = take everything inline bool hasOutputLimit() const { - return _outputTopLimit != 0 || _outputBottomLimit != 0; + return _outputTopLimit != std::numeric_limits::max() && _outputBottomLimit != std::numeric_limits::max(); } inline char csvOutputDelimiter() const { diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index 05c7df448..e1e45c97b 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -395,7 +395,7 @@ namespace tuplex { // default case: both _outputTopLimit and _outputBottomLimit is zero = take everything bool hasOutputLimit() const { - return _outputTopLimit != 0 || _outputBottomLimit != 0; + return _outputTopLimit != std::numeric_limits::max() && _outputBottomLimit != std::numeric_limits::max(); } private: /*! 
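These hunks flip the sentinel convention: on the C++ side "take everything" is now SIZE_MAX instead of 0, while the Python bindings keep negative numbers for it. Condensed into a hedged sketch (toInternalLimit is an illustrative helper; in the patch the conversion is inlined in PythonDataSet::take):

// --- editorial sketch, not part of the patch -------------------------------
#include <cstddef>
#include <cstdint>
#include <limits>

// Negative Python-side limits mean "unbounded" and map to SIZE_MAX.
size_t toInternalLimit(int64_t pythonLimit) {
    return pythonLimit < 0 ? std::numeric_limits<size_t>::max()
                           : static_cast<size_t>(pythonLimit);
}

// Mirrors TransformStage::hasOutputLimit() as redefined above: trimming only
// applies when *both* ends are bounded; an unbounded end behaves like collect.
bool hasOutputLimit(size_t top, size_t bottom) {
    return top != std::numeric_limits<size_t>::max()
        && bottom != std::numeric_limits<size_t>::max();
}
// ----------------------------------------------------------------------------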
diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index d065e86d3..e2b8bc5b6 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -184,7 +184,6 @@ namespace tuplex { void setOutputTopLimit(size_t limit) { _outTopLimit = limit; resetOutputLimitCounter(); } void setOutputBottomLimit(size_t limit) { _outBottomLimit = limit; resetOutputLimitCounter(); } - void setOutputSkip(size_t numRowsToSkip) { _outSkipRows = numRowsToSkip; } void execute() override; bool hasFileSink() const { return _outputFilePath != URI::INVALID; } @@ -281,7 +280,6 @@ namespace tuplex { size_t _outTopLimit; // limits how many rows to write at max size_t _outBottomLimit; // limits how many last rows to write at max - size_t _outSkipRows; // how many rows at start to skip // memory source variables std::vector _inputPartitions; diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index c11482f86..d54edb567 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -38,7 +38,7 @@ namespace tuplex { } std::shared_ptr DataSet::collect(std::ostream &os) { - return take(0, 0, os); + return take(std::numeric_limits::max(), 0, os); } std::shared_ptr DataSet::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { @@ -62,18 +62,14 @@ namespace tuplex { // collect functions std::vector DataSet::collectAsVector(std::ostream &os) { - return takeAsVector(-1, os); + return takeAsVector(std::numeric_limits::max(), os); } - // -1 means to retrieve all elements std::vector DataSet::takeAsVector(size_t numElements, std::ostream &os) { auto rs = take(numElements, false, os); Timer timer; #warning "limiting should make this hack irrelevant..." - if (numElements < 0) - numElements = std::numeric_limits::max(); - // std::vector v; // while (rs->hasNextRow() && v.size() < numElements) { // v.push_back(rs->getNextRow()); @@ -730,10 +726,14 @@ namespace tuplex { } - void DataSet::show(const int64_t numRows, std::ostream &os) { + void DataSet::show(int64_t numRows, std::ostream &os) { assert(_context); // get rows + if (numRows < 0) { + numRows = std::numeric_limits::max(); + } + auto rows = takeAsVector(numRows, os); if (rows.empty()) { return; diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index 49d104bcc..377385deb 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -509,7 +509,6 @@ namespace tuplex { _outPrefix.reset(); _outTopLimit = std::numeric_limits::max(); // write all rows _outBottomLimit = 0; - _outSkipRows = 0; // skip no rows // reset memory sink _output.reset(); @@ -705,7 +704,7 @@ namespace tuplex { // skip rows? limit rows?? 
- if(_numOutputRowsWritten >= _outSkipRows && _numOutputRowsWritten < (_outTopLimit - _outSkipRows)) { + if(_numOutputRowsWritten < _outTopLimit) { if(_outFile->write(buf, bufSize) != VirtualFileSystemStatus::VFS_OK) return ecToI32(ExceptionCode::IOERROR); } diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 23b09314d..ede482d9c 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -77,7 +77,7 @@ namespace tuplex { PythonDataSet resolve(const int64_t exceptionCode, const std::string& lambda_code, const std::string& pickled_code, const py::object& closure=py::object()); py::object collect(); - py::object take(const int64_t numTop, const int64_t numBottom); + py::object take(const int64_t topLimit, const int64_t bottomLimit); void show(const int64_t numRows=-1); // DataFrame like operations diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 66f94e33f..f6079a143 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -107,7 +107,7 @@ namespace tuplex { } } - py::object PythonDataSet::take(const int64_t numTop, const int64_t numBottom) { + py::object PythonDataSet::take(const int64_t topLimit, const int64_t bottomLimit) { // make sure a dataset is wrapped assert(this->_dataset); @@ -129,8 +129,19 @@ namespace tuplex { std::shared_ptr rs; std::string err_message = ""; + + size_t castedTopLimit = 0; + if (topLimit < 0) { + castedTopLimit = std::numeric_limits::max(); + } + + size_t castedBottomLimit = 0; + if (bottomLimit < 0) { + castedBottomLimit = std::numeric_limits::max(); + } + try { - rs = _dataset->take(numTop, numBottom, ss); + rs = _dataset->take(castedTopLimit, castedBottomLimit, ss); if(!rs) throw std::runtime_error("invalid result set"); // if there are more than 1 million (100k in debug mode) elements print message... 
@@ -162,7 +173,7 @@ namespace tuplex { // new version, directly interact with the interpreter Timer timer; // build python list object from resultset - auto listObj = resultSetToCPython(rs.get(), numTop); + auto listObj = resultSetToCPython(rs.get(), castedTopLimit); Logger::instance().logger("python").info("Data transfer back to python took " + std::to_string(timer.time()) + " seconds"); // Logger::instance().flushAll(); From c33fc23bd1b4a2790b2e49b1985ec37f2d1ce8f6 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 6 Apr 2022 23:56:59 -0400 Subject: [PATCH 15/56] Random take test with some debugging --- tuplex/core/src/ee/local/LocalBackend.cc | 35 ++++----- tuplex/core/src/physical/TransformStage.cc | 6 -- tuplex/test/core/TakeTest.cc | 87 +++++++++++++++++++++- 3 files changed, 101 insertions(+), 27 deletions(-) diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index d51ef4523..0b8157ecc 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -2143,7 +2143,7 @@ namespace tuplex { clippedTop = topLimit - (numTopOutputRows - partition->getNumRows()); assert(clippedTop <= partition->getNumRows()); break; - } else if (partition == *partitions.end()) { + } else if (partition == partitions.back()) { // last partition, mark full row, but don't put to output set yet to avoid double put clippedTop = partition->getNumRows(); break; @@ -2171,14 +2171,14 @@ namespace tuplex { break; } else if (numBottomOutputRows >= bottomLimit) { // clip last partition & leave loop - auto clipped = bottomLimit - (numTopOutputRows - partition->getNumRows()); + auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); assert(clipped <= partition->getNumRows()); - Partition* newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); + if (clipped > 0) { + Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); + assert(newPart->getNumRows() == clipped); + limitedTailPartitions.push_back(newPart); + } partition->invalidate(); - partition = newPart; - assert(partition->getNumRows() == clipped); - if (clipped > 0) - limitedTailPartitions.push_back(partition); break; } else { // put full partition to output set @@ -2197,9 +2197,12 @@ namespace tuplex { lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, tstage); } - lastTopPart->setNumRows(clippedTop); - - limitedPartitions.push_back(lastTopPart); + if (clippedTop != 0) { + lastTopPart->setNumRows(clippedTop); + limitedPartitions.push_back(lastTopPart); + } else { + lastTopPart->invalidate(); + } if (lastBottomPart != nullptr) { limitedPartitions.push_back(lastBottomPart); @@ -2213,17 +2216,10 @@ namespace tuplex { } Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage* tstage) { - if(!numToSkip) - return nullptr; - auto ptr = p_in->lockRaw(); auto num_rows = *((int64_t*) ptr); assert(numToSkip < num_rows); - Partition *p_out = _driver->allocWritablePartition(num_rows - numToSkip + sizeof(int64_t), - tstage->outputSchema(), tstage->outputDataSetID(), - tstage->context().id()); - ptr += sizeof(int64_t); size_t numBytesToSkip = 0; @@ -2233,6 +2229,11 @@ namespace tuplex { numBytesToSkip += r.serializedLength(); } + Partition *p_out = _driver->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), + tstage->outputSchema(), tstage->outputDataSetID(), + tstage->context().id()); 
+ assert(p_out->capacity() >= p_in->size() - numBytesToSkip); + auto ptr_out = p_out->lockRaw(); *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip; ptr_out += sizeof(int64_t); diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index 6e216ac5c..060365697 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -133,15 +133,9 @@ namespace tuplex { auto schema = Schema::UNKNOWN; if(!partitions.empty()) { - size_t totalRowsCount = 0; schema = partitions.front()->schema(); for (auto partition : partitions) { assert(schema == partition->schema()); - totalRowsCount += partition->getNumRows(); - } - - if (hasOutputLimit()) { - assert(totalRowsCount == _outputTopLimit + _outputBottomLimit); } } diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 2d8f81f2f..3990fcd07 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -8,13 +8,51 @@ // License: Apache 2.0 // //--------------------------------------------------------------------------------------------------------------------// +#include + #include #include "TestUtils.h" +using namespace tuplex; +using namespace std; + class TakeTest : public PyTest {}; +/** + * Randomly generate a vector of rows for testing + * @param N the size of vector + * @return a vector of size N, containing the random data + */ +vector generateTestData(size_t N, uint64_t seed) { + mt19937 gen(seed); //Standard mersenne_twister_engine seeded with rd() + uniform_int_distribution<> distrib(1, 100000000); + + vector data; + data.reserve(N); + + for (int i = 0; i < N; i++) { + data.emplace_back(distrib(gen), distrib(gen), distrib(gen)); + } + + return data; +} + +vector generateReferenceData(const vector& input, size_t topLimit, size_t bottomLimit) { + vector output; + for(size_t i = 0; i < topLimit && i < input.size(); i++) { + output.push_back(input[i]); + } + size_t start_bottom = input.size() >= bottomLimit ? input.size() - bottomLimit : 0; + start_bottom = max(topLimit, start_bottom); + + for(size_t i = start_bottom; i < input.size(); i++) { + output.push_back(input[i]); + } + + return output; +} + TEST_F(TakeTest, takeTopTest) { - using namespace tuplex; auto opt = testOptions(); Context context(opt); @@ -51,7 +89,6 @@ TEST_F(TakeTest, takeTopTest) { } TEST_F(TakeTest, takeBottomTest) { - using namespace tuplex; auto opt = testOptions(); Context context(opt); @@ -88,7 +125,6 @@ TEST_F(TakeTest, takeBottomTest) { } TEST_F(TakeTest, takeBothTest) { - using namespace tuplex; auto opt = testOptions(); Context context(opt); @@ -124,4 +160,47 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v3[4].getString(0), "!"); } -// TODO(march): test empty code when reusing partitions. 
This might not work if user reused dataset \ No newline at end of file +TEST_F(TakeTest, takeBigTest) { + mt19937 data_seed_gen(4242); + + const std::vector test_size{1, 10, 100, 1001, 10001}; + const std::vector limit_values{0, 1, 5, 11, 600, 10000}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + + for(auto& part_size : partition_sizes) { + auto opt = testOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for(auto data_size : test_size) { + for (auto top_limit: limit_values) { + for (auto bottom_limit: limit_values) { + std::cout << "testing with partition size:" << part_size << " data size:" + << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + + auto data = generateTestData(data_size, data_seed_gen()); + auto ref_data = generateReferenceData(data, top_limit, bottom_limit); + + auto res = context.parallelize(data).take(top_limit, bottom_limit); + ASSERT_EQ(ref_data.size(), res->rowCount()); + for (Row &r: ref_data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } + } + } +} + +// TODO(march): with map, filter function +//TEST_F(TakeTest, takeMapFilterTest) { +// srand(4242); +//} + +// TODO(march): with file input +// context.csv("../resources/"); + +// TODO(march): collect operator \ No newline at end of file From 38d9ca971363eb028a02f86bc0920110aa6172fb Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Thu, 7 Apr 2022 23:28:54 -0400 Subject: [PATCH 16/56] Polish the python interface --- tuplex/core/include/DataSet.h | 8 ++ tuplex/core/src/DataSet.cc | 109 +++++++++++++++++++++++++ tuplex/python/include/PythonDataSet.h | 1 + tuplex/python/src/PythonBindings.cc | 1 + tuplex/python/src/PythonDataSet.cc | 49 +++++++++++ tuplex/python/tuplex/dataset.py | 112 +++++++------------------- 6 files changed, 196 insertions(+), 84 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 86ca23b6a..9510427e2 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -130,6 +130,14 @@ namespace tuplex { */ virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); + /*! + * Displays a formatted HTML table of a small portion of the data + * @param topLimit how many top rows to print + * @param bottomLimit how many bottom rows to print + * @param os ostream where to print table to + */ + virtual void showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); + // named dataset management functions /*! * map Column using a UDF diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index d54edb567..210b3ec60 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -756,6 +756,115 @@ namespace tuplex { printTable(os, headers, rows); } + void printHTMLRow(std::ostream &os, size_t ind, const Row& r) { + os << " \n"; + os << fmt::format(" {}\n", ind); + for (auto& s : r.getAsStrings()) { + os << fmt::format(" {}\n", s); + } + os << " \n"; + } + + void DataSet::showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os) { + std::string HTML_TEMPLATE = + "
\n" + "\n" + "\n" + " \n" + " \n" + "{}" + " \n" + " \n" + " \n" + "{}" + " \n" + "
\n" + "

{} columns

\n" + "
"; + + assert(_context); + + auto rows = take(topLimit, bottomLimit); + + if (rows->rowCount() == 0) { + os << fmt::format(HTML_TEMPLATE, "\n", "\n", 0); + return; + } + + std::stringstream headers_stream, body_stream; + size_t numColumns = 0; + assert(rows->rowCount() <= topLimit + bottomLimit); + + // construct tables + if (rows->rowCount() < topLimit + bottomLimit) { + // the data is small so we get everything (no need to render ...) + for (size_t i = 0; rows->hasNextRow(); i++) { + Row r = rows->getNextRow(); + if (i == 0) { + // we set num columns based on the first row + numColumns = r.getNumColumns(); + } + + printHTMLRow(body_stream, i, r); + } + } else { + // some data is not processed because of limiting + size_t i; + for (i = 0; rows->hasNextRow() && i < topLimit; i++) { + Row r = rows->getNextRow(); + if (i == 0) { + // we set num columns based on the first row + numColumns = r.getNumColumns(); + } + + printHTMLRow(body_stream, i, r); + } + + // add the ... + body_stream << " \n"; + body_stream << " ...\n"; + for(int j = 0; j < numColumns; j++) { + body_stream << " ...\n"; + body_stream << " \n"; + } + + while (rows->hasNextRow()) { + Row r = rows->getNextRow(); + printHTMLRow(body_stream, i, r); + } + } + + assert(numColumns != 0); + + // construct headers + std::vector headers(numColumns); + if (!_columnNames.empty()) { + assert(numColumns == _columnNames.size()); + for (auto &c_name: _columnNames) { + headers_stream << fmt::format(" {}\n", c_name); + } + } else { + // default to generic name if column name doesn't exist + for (int i = 0; i < numColumns; ++i) { + headers_stream << fmt::format(" Column {}\n", i); + } + } + + os << fmt::format(HTML_TEMPLATE, headers_stream.str(), body_stream.str(), numColumns); + } + Schema DataSet::schema() const { if(!_operator) return Schema::UNKNOWN; diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index ede482d9c..4761ac7f0 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -79,6 +79,7 @@ namespace tuplex { py::object collect(); py::object take(const int64_t topLimit, const int64_t bottomLimit); void show(const int64_t numRows=-1); + std::string showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit); // DataFrame like operations PythonDataSet mapColumn(const std::string& column, const std::string& lambda_code, const std::string& pickled_code, const py::object& closure=py::object()); diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index 6b3683853..ab239a1a2 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -41,6 +41,7 @@ PYMODULE { py::class_(m, "_DataSet") .def("show", &tuplex::PythonDataSet::show) + .def("showHTMLPreview", &tuplex::PythonDataSet::showHTMLPreview) .def("collect", &tuplex::PythonDataSet::collect) .def("take", &tuplex::PythonDataSet::take) .def("map", &tuplex::PythonDataSet::map) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index f6079a143..1f543e5d2 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -880,6 +880,55 @@ namespace tuplex { } } + std::string PythonDataSet::showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit) { + // make sure a dataset is wrapped + assert(this->_dataset); + + // is callee error dataset? 
if so return list with error string + if (this->_dataset->isError()) { + auto errset = dynamic_cast(this->_dataset); + assert(errset); + return "Error: " + errset->getError(); + } else { + // release GIL & hand over everything to Tuplex + assert(PyGILState_Check()); // make sure this thread holds the GIL! + python::unlockGIL(); + + std::stringstream ss; + std::string err_message; + + size_t castedTopLimit = 0; + if (topLimit < 0) { + castedTopLimit = std::numeric_limits::max(); + } + + size_t castedBottomLimit = 0; + if (bottomLimit < 0) { + castedBottomLimit = std::numeric_limits::max(); + } + + try { + this->_dataset->showHTMLPreview(castedTopLimit, castedBottomLimit, ss); + } catch (const std::exception &e) { + err_message = e.what(); + Logger::instance().defaultLogger().error(err_message); + } catch (...) { + err_message = "unknown C++ exception occurred, please change type."; + Logger::instance().defaultLogger().error(err_message); + } + + // reacquire GIL + python::lockGIL(); + Logger::instance().flushToPython(); + + if (!ss.str().empty() && err_message.empty()) { + return ss.str(); + } else { + return "Error occurred: " + err_message; + } + } + } + PyObject* PythonDataSet::anyToCPythonWithPyObjects(ResultSet* rs, size_t maxRowCount) { assert(rs); diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 850b4ed83..4d02cf4d4 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -28,87 +28,8 @@ class DataSet: def __init__(self): self._dataSet = None - def getColumnSize(self): - data = self.collect() - if len(data) == 0: - return 0, 0 - else: - return len(data[0]) - - def revTake(self, nRows = 5): - return self.collect()[-nRows:] - def _repr_html_(self): - rows_list = self.take() - total_col_cnt = self.getColumnSize() - print('rowlist') - print(rows_list) - if len(rows_list) == 0: - header = '\n' - rows = '\n' - else: - header = '\n' - - if self.columns != None: - for x in self.columns: - header += f' {x}\n' - else: - for i in range(len(rows_list[0])): - header += f' column {i + 1}\n' - - rows = '' - for i, r in enumerate(rows_list): - rows += ' \n' - rows += f' {i}\n' - for data in r: - rows += f' {data}\n' - rows += ' \n' - - # add the ... - rows += ' \n' - rows += ' ...\n' - for i in range(total_col_cnt): - rows += ' ...\n' - rows += ' \n' - - lastData = self.revTake() - for i, r in enumerate(lastData): - rows += ' \n' - rows += f' {0 - len(lastData) + i}\n' - for data in r: - rows += f' {data}\n' - rows += ' \n' - - html_template = ( - '
<div>\n' - '<table border="1" class="dataframe">\n' - ' <thead>\n' - ' <tr style="text-align: right;">\n' - f'{header}' - ' </tr>\n' - ' </thead>\n' - ' <tbody>\n' - f'{rows}' - ' </tbody>\n' - '</table>\n' - f'<footer><div>{total_col_cnt} columns</div></footer>\n' - '</div>
' - ) - - return html_template + return self._dataSet.showHTMLPreview() def unique(self): """ removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement. @@ -201,11 +122,14 @@ def take(self, limitTop=5, limitBottom=0): (list): A list of tuples """ + assert limitTop is None or isinstance(limitTop, int), 'num rows must be an integer or None' + assert limitBottom is None or isinstance(limitBottom, int), 'num bottom last must be an integer or None' - assert isinstance(limitTop, int), 'num rows must be an integer' - assert limitTop > 0, 'please specify a number greater than zero' - assert isinstance(limitBottom, int), 'num bottom last must be an integer' - assert limitBottom >= 0, 'please specify a number greater or equal to zero' + if limitTop is None or limitTop < 0: + limitTop = -1 + + if limitBottom is None or limitBottom < 0: + limitBottom = -1 assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' @@ -227,6 +151,26 @@ def show(self, nrows=None): self._dataSet.show(nrows) + def showHTMLPreview(self, topLimit=5, bottomLimit=5): + """ action that generates a physical plan, processes data and return a subset of results as nicely formatted + HTML table to stdout. + + Args: + topLimit (int): number of top rows to collect. If ``None`` all rows will be collected + bottomLimit (int): number of bottom rows to collect. If ``None`` all rows will be collected + + Returns: + string: an HTML table showing a preview of the data + """ + assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + + if topLimit is None or topLimit < 0: + topLimit = -1 + if bottomLimit is None or bottomLimit < 0: + bottomLimit = -1 + + return self._dataSet.showHTMLPreview(topLimit, bottomLimit) + def resolve(self, eclass, ftor): """ Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. From 1f5ff5934a6faa5c3ac963fee4f8c22c3d16acc2 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 8 Apr 2022 01:24:03 -0400 Subject: [PATCH 17/56] Address PR comments --- tuplex/core/include/ee/local/LocalBackend.h | 24 +++++- tuplex/core/include/physical/TransformStage.h | 6 +- tuplex/core/include/physical/TransformTask.h | 20 ++++- tuplex/core/src/ee/local/LocalBackend.cc | 69 ++++++++++------ tuplex/core/src/physical/TransformTask.cc | 81 +++++++++++-------- tuplex/python/tuplex/dataset.py | 23 +++++- tuplex/test/core/TakeTest.cc | 4 +- 7 files changed, 156 insertions(+), 71 deletions(-) diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index d7a5ec25b..712f0ae43 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -88,9 +88,6 @@ namespace tuplex { MessageHandler& logger() const { return Logger::instance().logger("local ee"); } - void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, TransformStage* tstage); - Partition* newPartitionWithSkipRows(Partition* p_in, size_t numToSkip, TransformStage* tstage); - // write output (may be already in correct format!) void writeOutput(TransformStage* tstage, std::vector& sortedTasks); @@ -187,6 +184,27 @@ namespace tuplex { * @return */ extern URI outputURI(const UDF& udf, const URI& baseURI, int64_t partNo, FileFormat fmt); + + /*! 
+ * Trim list of partitions so that it includes up to the first n rows and the last m rows + * @param partitions [in,out] the list of partitions to trim + * @param topLimit n, the number of top rows to include + * @param bottomLimit m, the number of bottom rows to include + * @param tstage pointer to transform stage, might be used to generate new partition + * @param exec pointer to executor, might be used to allocate new partition + */ + extern void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, + TransformStage *tstage, Executor *exec); + + /*! + * Create a new partition with the same data as the specified partition, but with the first n rows removed + * @param p_in the input partition + * @param numToSkip number of rows to remove from the new partition + * @param tstage pointer to transform stage, used to generate new partition + * @param exec pointer to executor, used to allocate new partition + * @return the new partition + */ + extern Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec); } #endif //TUPLEX_LOCALBACKEND_H \ No newline at end of file diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index e1e45c97b..ff4ece1dd 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -107,9 +107,9 @@ namespace tuplex { std::unordered_map partitionToExceptionsMap() { return _partitionToExceptionsMap; } /*! - * sets maximum number of top rows this pipeline will produce - * @param topLimit - * @param bottomLimit + * sets maximum number of rows this pipeline will produce + * @param topLimit number of top rows to produce, 0 means none, and size_t::max means everything + * @param bottomLimit number of bottom rows to produce, 0 means none, and size_t::max means everything */ inline void setOutputLimit(size_t topLimit, size_t bottomLimit) { _outputTopLimit = topLimit; diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index e2b8bc5b6..d966b69ee 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -182,8 +182,19 @@ namespace tuplex { void sinkOutputToHashTable(HashTableFormat fmt, int64_t outputDataSetID); HashTableSink hashTableSink() const { return _htable; } // needs to be freed manually! 
- void setOutputTopLimit(size_t limit) { _outTopLimit = limit; resetOutputLimitCounter(); } - void setOutputBottomLimit(size_t limit) { _outBottomLimit = limit; resetOutputLimitCounter(); } + void setOutputTopLimit(size_t limit) { + _outTopLimit = limit; + } + + void setOutputBottomLimit(size_t limit) { + _outBottomLimit = limit; + } + + // maxOrder of infinity means disregarding the bottomLimit short circuit + static void setMaxOrderAndResetLimits(size_t maxOrder = std::numeric_limits::max()) { + resetLimits(maxOrder); + } + void execute() override; bool hasFileSink() const { return _outputFilePath != URI::INVALID; } @@ -207,7 +218,7 @@ namespace tuplex { static codegen::i64_hash_row_f writeInt64HashTableAggregateCallback(); static codegen::write_row_f aggCombineCallback(); - static void resetOutputLimitCounter(); + static void resetLimits(size_t maxOrder); // most be public because of C++ issues -.- int64_t writeRowToMemory(uint8_t* buf, int64_t bufSize); @@ -314,6 +325,9 @@ namespace tuplex { _exceptions.unlock(); } + bool limitReached() const; + void updateLimits(); + void processMemorySourceWithExp(); void processMemorySource(); void processFileSource(); diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 0b8157ecc..9530e9d04 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -651,7 +651,6 @@ namespace tuplex { // --> issue for each memory partition a transform task and put it into local workqueue assert(tstage->inputMode() == EndPointMode::MEMORY); - // restrict after input limit size_t numInputRows = 0; auto inputPartitions = tstage->inputPartitions(); @@ -697,14 +696,24 @@ namespace tuplex { } } + // TODO(march): we can avoid setting order here by pre init g_rowsDone + // assign the order for all tasks for(size_t i = 0; i < tasks.size(); ++i) { tasks[i]->setOrder(i); } + TransformTask::setMaxOrderAndResetLimits(tasks.size() - 1); + if (tstage->hasOutputLimit()) { + // There are 3 possible cases here: + // 1. both top and bottom limit + // 2. only top limit + // 3. only bottom limit if (tstage->outputTopLimit() > 0 && tstage->outputBottomLimit() > 0) { - // do task striping for output limit on both ends + // case 1: do task striping for output limit on both ends + // We are executing in the striping order instead of ascending or descending order + // This is an optimization in the case where we have small limits to avoid executing all partitions vector newTasks; for(size_t i = 0; i < tasks.size() - i; i++) { const size_t rev_i = tasks.size() - 1 - i; @@ -716,10 +725,13 @@ namespace tuplex { assert(tasks.size() == newTasks.size()); tasks.swap(newTasks); } else if (tstage->outputBottomLimit() > 0) { - // bottom limit only, just reverse the task order + // case 3: bottom limit only, just reverse the task order + // We are executing the last partitions first, since we don't need the top rows. 
+ // Thus speeding up the execution time std::reverse(tasks.begin(), tasks.end()); } - // if top limit only, do nothing since the order is already good + // case 3: if top limit only, do nothing since the order is already good + // (the tasks is generated in ascending order) } return tasks; @@ -887,7 +899,7 @@ namespace tuplex { auto output_par = tstage->inputPartitions(); if (tstage->hasOutputLimit()) { - trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage); + trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage, _driver); } tstage->setMemoryResult(output_par, std::vector{}, std::unordered_map(), pyObjects); pyObjects.clear(); @@ -972,6 +984,13 @@ namespace tuplex { auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms); + if (tstage->hasOutputLimit()) { + for (int i = 0; i < tasks.size(); i++) { + // take limit only work with uniform order + assert(tasks[i]->getOrder(0) == i); + } + } + auto completedTasks = performTasks(tasks); // Note: this doesn't work yet because of the globals. @@ -1205,7 +1224,8 @@ namespace tuplex { } if (tstage->hasOutputLimit()) { - trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage); + // the function expect the output to be sorted in ascending order (guaranteed by sortTasks()) + trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage, _driver); } tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts); @@ -1565,12 +1585,6 @@ namespace tuplex { logger().debug("task without order found, please fix in code."); } #endif - - for (int i = 0; i < tasks.size(); i++) { - // take limit only work with uniform order - assert(tasks[i]->getOrder(0) == i); - } - // add all tasks to queue for(auto& task : tasks) wq.addTask(task); @@ -2125,17 +2139,18 @@ namespace tuplex { tstage->setFileResult(ecounts); } - void LocalBackend::trimPartitionsToLimit(std::vector &partitions, + void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, - TransformStage* tstage) { + TransformStage* tstage, + Executor *exec) { std::vector limitedPartitions, limitedTailPartitions; // check top output limit, adjust partitions if necessary size_t numTopOutputRows = 0; - Partition* lastTopPart = nullptr; + Partition *lastTopPart = nullptr; size_t clippedTop = 0; - for (auto partition : partitions) { + for (auto partition: partitions) { numTopOutputRows += partition->getNumRows(); lastTopPart = partition; if (numTopOutputRows >= topLimit) { @@ -2174,7 +2189,8 @@ namespace tuplex { auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); assert(clipped <= partition->getNumRows()); if (clipped > 0) { - Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); + Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage, + exec); assert(newPart->getNumRows() == clipped); limitedTailPartitions.push_back(newPart); } @@ -2191,10 +2207,11 @@ namespace tuplex { assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); // split into two partitions with both top and bottom are in the same partition - Partition* lastBottomPart = nullptr; + Partition *lastBottomPart = nullptr; if (clippedBottom != 0) { - lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, tstage); + lastBottomPart = 
newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, + tstage, exec); } if (clippedTop != 0) { @@ -2215,27 +2232,27 @@ namespace tuplex { partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend()); } - Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage* tstage) { + Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec) { auto ptr = p_in->lockRaw(); - auto num_rows = *((int64_t*) ptr); + auto num_rows = *((int64_t *) ptr); assert(numToSkip < num_rows); ptr += sizeof(int64_t); size_t numBytesToSkip = 0; - for(unsigned i = 0; i < numToSkip; ++i) { + for (unsigned i = 0; i < numToSkip; ++i) { Row r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip); ptr += r.serializedLength(); numBytesToSkip += r.serializedLength(); } - Partition *p_out = _driver->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), - tstage->outputSchema(), tstage->outputDataSetID(), - tstage->context().id()); + Partition *p_out = exec->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), + tstage->outputSchema(), tstage->outputDataSetID(), + tstage->context().id()); assert(p_out->capacity() >= p_in->size() - numBytesToSkip); auto ptr_out = p_out->lockRaw(); - *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip; + *((int64_t *) ptr_out) = p_in->getNumRows() - numToSkip; ptr_out += sizeof(int64_t); memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); p_out->unlock(); diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index 377385deb..8ebe18a7b 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -18,18 +18,14 @@ #include namespace tuplex { - // atomic var to count output rows! - static std::atomic_int64_t g_totalTopOutputRows; - static std::atomic_int64_t g_totalBottomOutputRows; - // mapping from order number -> row count if the task is finished static std::mutex g_rowsDoneMutex; - static std::map g_rowsDone; + static std::unordered_map g_rowsDone; + static std::atomic_size_t g_maxOrder; - void TransformTask::resetOutputLimitCounter() { - g_totalTopOutputRows = 0; - g_totalBottomOutputRows = 0; + void TransformTask::resetLimits(size_t maxOrder) { g_rowsDone.clear(); + g_maxOrder = maxOrder; } } @@ -602,25 +598,16 @@ namespace tuplex { #endif } - void TransformTask::processMemorySource() { - assert(!_inputPartitions.empty()); - assert(_functor); - - _numInputRowsRead = 0; - _numOutputRowsWritten = 0; - - int64_t num_normal_rows = 0, num_bad_rows = 0; - - auto functor = reinterpret_cast(_functor); - - // go over all input partitions. 
- for(const auto &inputPartition : _inputPartitions) { - size_t numTopCompleted = 0; - size_t numBottomCompleted = 0; - bool isTopLimitReached = false; - bool isBottomLimitReached = false; + bool TransformTask::limitReached() const { + size_t numTopCompleted = 0; + size_t numBottomCompleted = 0; + bool isTopLimitReached = false; + bool isBottomLimitReached = false; - tuplex::g_rowsDoneMutex.lock(); + tuplex::g_rowsDoneMutex.lock(); + if (_outTopLimit == 0) { + isTopLimitReached = true; + } else { for (size_t i = 0; tuplex::g_rowsDone.count(i) != 0; i++) { numTopCompleted += tuplex::g_rowsDone[i]; if (numTopCompleted >= _outTopLimit) { @@ -628,17 +615,45 @@ namespace tuplex { break; } } - // TODO: what is the max task number here - for (size_t i = 100; tuplex::g_rowsDone.count(i) != 0; i--) { + } + + // TODO: what is the max task number here + if (_outBottomLimit == 0) { + isBottomLimitReached = true; + } else { + for (size_t i = tuplex::g_maxOrder; tuplex::g_rowsDone.count(i) != 0; i--) { numBottomCompleted += tuplex::g_rowsDone[i]; - if (numBottomCompleted >= _outTopLimit) { + if (numBottomCompleted >= _outBottomLimit) { isBottomLimitReached = true; break; } } - tuplex::g_rowsDoneMutex.unlock(); + } + tuplex::g_rowsDoneMutex.unlock(); + + return isTopLimitReached && isBottomLimitReached; + } + + void TransformTask::updateLimits() { + tuplex::g_rowsDoneMutex.lock(); + tuplex::g_rowsDone[getOrder(0)] += getNumOutputRows(); + tuplex::g_rowsDoneMutex.unlock(); + } + + void TransformTask::processMemorySource() { + assert(!_inputPartitions.empty()); + assert(_functor); - if (isTopLimitReached && isBottomLimitReached) { + _numInputRowsRead = 0; + _numOutputRowsWritten = 0; + + int64_t num_normal_rows = 0, num_bad_rows = 0; + + auto functor = reinterpret_cast(_functor); + + // go over all input partitions. + for(const auto &inputPartition : _inputPartitions) { + if (limitReached()) { // skip the execution, enough is done break; } @@ -665,9 +680,7 @@ namespace tuplex { if(_invalidateSourceAfterUse) inputPartition->invalidate(); - tuplex::g_rowsDoneMutex.lock(); - tuplex::g_rowsDone[getOrder(0)] += getNumOutputRows(); - tuplex::g_rowsDoneMutex.unlock(); + updateLimits(); } #ifndef NDEBUG diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 4d02cf4d4..376134934 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -113,7 +113,8 @@ def collect(self): return self._dataSet.collect() def take(self, limitTop=5, limitBottom=0): - """ action that generates a physical plan, processes data and collects the top results then as list of tuples. + """ action that generates a physical plan, processes data and collects the top and bottom results + then as list of tuples. Args: limitTop (int): number of top rows to collect. Per default ``5``. @@ -135,6 +136,26 @@ def take(self, limitTop=5, limitBottom=0): return self._dataSet.take(limitTop, limitBottom) + def head(self, nrows): + """ action that generates a physical plan, processes data and collects the top results then as list of tuples. + + Args: + nrows (int): number of rows to collect. + Returns: + (list): A list of tuples + """ + return self.take(nrows, 0) + + def tail(self, nrows): + """ action that generates a physical plan, processes data and collects the bottom results then as list of tuples. + + Args: + nrows (int): number of rows to collect. 
+ Returns: + (list): A list of tuples + """ + return self.take(0, nrows) + def show(self, nrows=None): """ action that generates a physical plan, processes data and prints results as nicely formatted ASCII table to stdout. diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 3990fcd07..8c4ed5fe5 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -203,4 +203,6 @@ TEST_F(TakeTest, takeBigTest) { // TODO(march): with file input // context.csv("../resources/"); -// TODO(march): collect operator \ No newline at end of file +// TODO(march): collect operator + +// TODO(march): write test for trimPartitionsToLimit \ No newline at end of file From ac4c600cc43628d89ea08a1558f8fcee93bc89c1 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 8 Apr 2022 02:01:12 -0400 Subject: [PATCH 18/56] Add two more testcases --- tuplex/core/src/ee/local/LocalBackend.cc | 8 -- tuplex/test/core/TakeTest.cc | 119 +++++++++++++++++++---- 2 files changed, 99 insertions(+), 28 deletions(-) diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 9530e9d04..5616488e7 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -983,14 +983,6 @@ namespace tuplex { } auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms); - - if (tstage->hasOutputLimit()) { - for (int i = 0; i < tasks.size(); i++) { - // take limit only work with uniform order - assert(tasks[i]->getOrder(0) == i); - } - } - auto completedTasks = performTasks(tasks); // Note: this doesn't work yet because of the globals. diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 8c4ed5fe5..98edecb41 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -16,7 +16,8 @@ using namespace tuplex; using namespace std; -class TakeTest : public PyTest {}; +class TakeTest : public PyTest { +}; /** * Randomly generate a vector of rows for testing @@ -37,15 +38,15 @@ vector generateTestData(size_t N, uint64_t seed) { return data; } -vector generateReferenceData(const vector& input, size_t topLimit, size_t bottomLimit) { +vector generateReferenceData(const vector &input, size_t topLimit, size_t bottomLimit) { vector output; - for(size_t i = 0; i < topLimit && i < input.size(); i++) { + for (size_t i = 0; i < topLimit && i < input.size(); i++) { output.push_back(input[i]); } size_t start_bottom = input.size() >= bottomLimit ? input.size() - bottomLimit : 0; start_bottom = max(topLimit, start_bottom); - for(size_t i = start_bottom; i < input.size(); i++) { + for (size_t i = start_bottom; i < input.size(); i++) { output.push_back(input[i]); } @@ -57,7 +58,7 @@ TEST_F(TakeTest, takeTopTest) { Context context(opt); auto rs = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 0); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 0); ASSERT_EQ(rs->rowCount(), 1); auto v = rs->getRows(1); @@ -65,7 +66,7 @@ TEST_F(TakeTest, takeTopTest) { EXPECT_EQ(v[0].getInt(0), 1); auto rs2 = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(3, 0); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(3, 0); ASSERT_EQ(rs2->rowCount(), 3); auto v2 = rs2->getRows(3); @@ -75,7 +76,8 @@ TEST_F(TakeTest, takeTopTest) { EXPECT_EQ(v2[2].getInt(0), 3); auto rs3 = context.parallelize( - {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! 
:)"), Row("!")}).take(5, 0); + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), + Row("!")}).take(5, 0); ASSERT_EQ(rs3->rowCount(), 5); auto v3 = rs3->getRows(5); @@ -93,7 +95,7 @@ TEST_F(TakeTest, takeBottomTest) { Context context(opt); auto rs = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 1); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 1); ASSERT_EQ(rs->rowCount(), 1); auto v = rs->getRows(1); @@ -101,7 +103,7 @@ TEST_F(TakeTest, takeBottomTest) { EXPECT_EQ(v[0].getInt(0), 6); auto rs2 = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 3); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 3); ASSERT_EQ(rs2->rowCount(), 3); auto v2 = rs2->getRows(3); @@ -111,7 +113,8 @@ TEST_F(TakeTest, takeBottomTest) { EXPECT_EQ(v2[2].getInt(0), 6); auto rs3 = context.parallelize( - {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(0, 5); + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), + Row("!")}).take(0, 5); ASSERT_EQ(rs3->rowCount(), 5); auto v3 = rs3->getRows(5); @@ -129,7 +132,7 @@ TEST_F(TakeTest, takeBothTest) { Context context(opt); auto rs = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 1); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 1); ASSERT_EQ(rs->rowCount(), 2); auto v = rs->getRows(2); @@ -138,7 +141,7 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v[1].getInt(0), 6); auto rs2 = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(2, 1); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(2, 1); ASSERT_EQ(rs2->rowCount(), 3); auto v2 = rs2->getRows(3); @@ -148,7 +151,8 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v2[2].getInt(0), 6); auto rs3 = context.parallelize( - {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(2, 3); + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! 
:)"), + Row("!")}).take(2, 3); ASSERT_EQ(rs3->rowCount(), 5); auto v3 = rs3->getRows(5); @@ -167,12 +171,12 @@ TEST_F(TakeTest, takeBigTest) { const std::vector limit_values{0, 1, 5, 11, 600, 10000}; const std::vector partition_sizes{"256B", "512KB", "1MB"}; - for(auto& part_size : partition_sizes) { + for (auto &part_size: partition_sizes) { auto opt = testOptions(); opt.set("tuplex.partitionSize", part_size); Context context(opt); - for(auto data_size : test_size) { + for (auto data_size: test_size) { for (auto top_limit: limit_values) { for (auto bottom_limit: limit_values) { std::cout << "testing with partition size:" << part_size << " data size:" @@ -195,14 +199,89 @@ TEST_F(TakeTest, takeBigTest) { } } -// TODO(march): with map, filter function -//TEST_F(TakeTest, takeMapFilterTest) { -// srand(4242); -//} +vector generateMapFilterReferenceData(const vector &input, size_t topLimit, size_t bottomLimit) { + if (input.empty()) { + return {}; + } + + assert(input[0].getNumColumns() == 3); + vector intermedate; + for (const Row &r: input) { + int64_t new_a = r.getInt(0) + r.getInt(1); + + if (new_a % 2 == 0) { + intermedate.emplace_back(new_a, r.getInt(2)); + } + } + + return generateReferenceData(intermedate, topLimit, bottomLimit); +} + +TEST_F(TakeTest, takeMapFilterTest) { + mt19937 data_seed_gen(56120); + + const std::vector test_size{1, 10, 100, 1001, 10001}; + const std::vector limit_values{0, 1, 5, 11, 600, 10000}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + + UDF map_udf("lambda a, b, c: ((a + b), c)"); + UDF filter_udf("lambda a, b: a % 2 == 0"); + + for (auto &part_size: partition_sizes) { + auto opt = testOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for (auto data_size: test_size) { + for (auto top_limit: limit_values) { + for (auto bottom_limit: limit_values) { + std::cout << "testing with partition size:" << part_size << " data size:" + << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + + auto data = generateTestData(data_size, data_seed_gen()); + auto ref_data = generateMapFilterReferenceData(data, top_limit, bottom_limit); + + auto ds = context.parallelize(data).map(map_udf).filter(filter_udf); + auto res = ds.take(top_limit, bottom_limit); + ASSERT_EQ(ref_data.size(), res->rowCount()); + for (Row &r: ref_data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } + } + } +} // TODO(march): with file input // context.csv("../resources/"); -// TODO(march): collect operator +TEST_F(TakeTest, collectIdentityTest) { + mt19937 data_seed_gen(123454); + + const std::vector test_size{1, 10, 100, 1001, 10001}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + + for (auto &part_size: partition_sizes) { + auto opt = testOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for (auto data_size: test_size) { + auto data = generateTestData(data_size, data_seed_gen()); + auto res = context.parallelize(data).collect(); + ASSERT_EQ(data.size(), res->rowCount()); + for (Row &r: data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } +} // TODO(march): write test for trimPartitionsToLimit \ No newline at end of file From 56131a7843274a35b37d13ed0a70d0be98d2a155 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 15 Apr 2022 01:06:39 -0400 Subject: [PATCH 19/56] Address PR feedbacks --- tuplex/core/include/DataSet.h | 35 ++++++- 
tuplex/core/include/EmptyDataset.h | 14 +-- tuplex/core/include/LocalEngine.h | 59 +++++++---- tuplex/core/include/ee/IBackend.h | 15 +-- tuplex/core/include/ee/local/LocalBackend.h | 14 +-- tuplex/core/include/physical/TransformTask.h | 31 ++++-- tuplex/core/src/DataSet.cc | 1 - tuplex/core/src/LocalEngine.cc | 38 +++++-- tuplex/core/src/ee/local/LocalBackend.cc | 17 +-- tuplex/core/src/physical/PhysicalPlan.cc | 2 +- tuplex/core/src/physical/ResultSet.cc | 12 ++- tuplex/core/src/physical/TransformStage.cc | 9 +- tuplex/core/src/physical/TransformTask.cc | 11 +- tuplex/python/tuplex/dataset.py | 105 ++++++++++++++++++- tuplex/test/core/ContextBasics.cc | 56 +++++++++- tuplex/test/core/ResultSetTest.cc | 5 +- tuplex/test/core/TakeTest.cc | 72 +++++++++++-- 17 files changed, 413 insertions(+), 83 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 9510427e2..3a5f450ac 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -125,7 +125,7 @@ namespace tuplex { /*! * action that displays tuples as nicely formatted table - * @param numRows how many rows to print, i.e. top numRows are printed.xs + * @param numRows how many rows to print, i.e. top numRows are printed.xs, -1 means print all rows * @param os ostream where to print table to */ virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); @@ -260,21 +260,48 @@ namespace tuplex { * @param memoryLayout * @return */ - virtual DataSet& cache(const Schema::MemoryLayout& memoryLayout, bool storeSpecialized); - DataSet& cache(bool storeSpecialized=true) { return cache(Schema::MemoryLayout::ROW, storeSpecialized); } + virtual DataSet &cache(const Schema::MemoryLayout &memoryLayout, bool storeSpecialized); + + DataSet &cache(bool storeSpecialized = true) { return cache(Schema::MemoryLayout::ROW, storeSpecialized); } /*! * helper setter without checks, to update internal column names. */ void setColumns(const std::vector &columnNames) { _columnNames = columnNames; } - // these are actions that cause execution + /*! + * Execute the pipeline and return all outputs + * @param os the logging output + * @return the output of the execution + */ virtual std::shared_ptr collect(std::ostream &os = std::cout); + /*! + * Execute the pipeline and take a subset of the output from the top and bottom rows. + * If both top and bottom rows limit exist, then the top and bottom rows will be concatenated. + * In the case where topLimit + bottomLimit exceeds the output size, all rows will be taken. + * To take all rows, pass in either topLimit=size_t::max(), bottomLimit=size_t::max(), or both. + * @param topLimit number of top rows to take. size_t::max() means taking all rows + * @param bottomLimit number of bottom rows to take. size_t::max() means taking all rows + * @param os the logging output + * @return result of the execution, trim to the size of top and bottom limit. + */ virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); + /*! + * Execute the pipeline and return all outputs as vector + * @param os the logging output + * @return the output of the execution in vector + */ virtual std::vector collectAsVector(std::ostream &os = std::cout); + /*! + * Execute the pipeline and take a subset of the output from the top rows, return as vector + * In the case where numElements exceeds the output size, all rows will be taken. + * @param numElements number of top rows to take. 
size_t::max() means taking all rows + * @param os the logging output + * @return result of the execution in vector, trim to the size of numElements + */ virtual std::vector takeAsVector(size_t numElements, std::ostream &os = std::cout); /*! diff --git a/tuplex/core/include/EmptyDataset.h b/tuplex/core/include/EmptyDataset.h index 6fc3219a4..585b70881 100644 --- a/tuplex/core/include/EmptyDataset.h +++ b/tuplex/core/include/EmptyDataset.h @@ -67,18 +67,20 @@ namespace tuplex { virtual DataSet& aggregateByKey(const UDF& aggCombine, const UDF& aggUDF, const Row& aggInitial, const std::vector &keyColumns) override { return *this; } //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; - virtual std::shared_ptr collect(std::ostream& os) override; + virtual std::shared_ptr collect(std::ostream &os) override; // take / collect will print out the error only - virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream& os) override; + virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream &os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; - virtual std::vector collectAsVector(std::ostream& os) override; + virtual std::vector collectAsVector(std::ostream &os) override; - // take / collect will print out the error only - virtual std::vector takeAsVector(size_t numElements, std::ostream& os) override; + /*! + * take / collect will print out the error only, return empty rows + */ + virtual std::vector takeAsVector(size_t numElements, std::ostream &os) override; - DataSet& cache(const Schema::MemoryLayout& memoryLayout, bool storeSpecialized) override { + DataSet &cache(const Schema::MemoryLayout &memoryLayout, bool storeSpecialized) override { return *this; } }; diff --git a/tuplex/core/include/LocalEngine.h b/tuplex/core/include/LocalEngine.h index 66ed3a1e8..740a40b4d 100644 --- a/tuplex/core/include/LocalEngine.h +++ b/tuplex/core/include/LocalEngine.h @@ -16,7 +16,28 @@ #include #include "RESTInterface.h" + namespace tuplex { + struct ExecutorConfig { + size_t _size; // size in bytes that each executor should have + size_t _blockSize; // size of individual blocks used (can be used for coarse or fine grained parallelism) + size_t _runTimeMemory; + size_t _runTimeMemoryDefaultBlockSize; + URI _cache_path; + + bool operator==(const ExecutorConfig &rhs) const { + return _size == rhs._size && + _blockSize == rhs._blockSize && + _runTimeMemory == rhs._runTimeMemory && + _runTimeMemoryDefaultBlockSize == rhs._runTimeMemoryDefaultBlockSize && + _cache_path == rhs._cache_path; + } + + bool operator!=(const ExecutorConfig &rhs) const { + return !(rhs == *this); + } + }; + /*! * local execution engine. Provides local executors for a context * THIS IS NOT THREADSAFE. Should be only accessed by driver thread. @@ -25,16 +46,18 @@ namespace tuplex { private: // non-detached executor that serves as the driver - std::unique_ptr _driver; + std::shared_ptr _driver; + ExecutorConfig _driver_cfg; std::vector> _executors; - std::map _refCounts; //! reference counts for each executor + std::map _refCounts; //! 
reference counts for each executor + + LocalEngine(const LocalEngine &); - LocalEngine(const LocalEngine&); - void operator = (const LocalEngine&); + void operator=(const LocalEngine &); // The local task queue - WorkQueue _queue; + WorkQueue _queue; protected: LocalEngine(); @@ -63,25 +86,25 @@ namespace tuplex { * @param cache_path directory where subfolders will be created for all executors to be started * @return array of executor references */ - std::vector getExecutors(const size_t num, - const size_t size, - const size_t blockSize, - const size_t runTimeMemory, - const size_t runTimeMemoryDefaultBlockSize, - const URI& cache_path); + std::vector getExecutors(const size_t num, + const size_t size, + const size_t blockSize, + const size_t runTimeMemory, + const size_t runTimeMemoryDefaultBlockSize, + const URI &cache_path); /*! * releases executors (invoked by context) * @param executors * @param ctx */ - void freeExecutors(const std::vector& executors, const Context* ctx=nullptr); + void freeExecutors(const std::vector &executors, const Context *ctx = nullptr); - Executor* getDriver(const size_t size, - const size_t blockSize, - const size_t runTimeMemory, - const size_t runTimeMemoryDefaultBlockSize, - const URI& cache_path); + std::shared_ptr getDriver(const size_t size, + const size_t blockSize, + const size_t runTimeMemory, + const size_t runTimeMemoryDefaultBlockSize, + const URI &cache_path); void release(); @@ -89,7 +112,7 @@ namespace tuplex { * retrieves the global work queue for local executors * @return */ - WorkQueue& getQueue() { return _queue; } + WorkQueue &getQueue() { return _queue; } }; } #endif //TUPLEX_LOCALENGINE_H \ No newline at end of file diff --git a/tuplex/core/include/ee/IBackend.h b/tuplex/core/include/ee/IBackend.h index e7a80e5bb..1a543df8f 100644 --- a/tuplex/core/include/ee/IBackend.h +++ b/tuplex/core/include/ee/IBackend.h @@ -29,19 +29,22 @@ namespace tuplex { class IBackend { public: IBackend() = delete; - IBackend(const IBackend& other) = delete; - IBackend(const Context& context) : _context(context) {} + + IBackend(const IBackend &other) = delete; + + IBackend(const Context &context) : _context(context) {} // driver, i.e. where to store local data. - virtual Executor* driver() = 0; - virtual void execute(PhysicalStage* stage) = 0; + virtual Executor *driver() = 0; + + virtual void execute(PhysicalStage *stage) = 0; virtual ~IBackend() {} // virtual destructor needed b.c. of smart pointers - virtual const Context& context() const { return _context; } + virtual const Context &context() const { return _context; } private: - const Context& _context; + const Context &_context; }; inline std::unordered_map, size_t> merge_ecounts(std::unordered_map, size_t> lhs, diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index 712f0ae43..3d73a5d9f 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -40,14 +40,15 @@ namespace tuplex { * constructor for convenience * @param context */ - explicit LocalBackend(const Context& context); + explicit LocalBackend(const Context &context); - Executor* driver() override; // for local execution + Executor *driver() override; // for local execution + + void execute(PhysicalStage *stage) override; - void execute(PhysicalStage* stage) override; private: - Executor *_driver; //! driver from local backend... - std::vector _executors; //! drivers to be used + std::shared_ptr _driver; //! driver from local backend... 
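+        // (held as a shared_ptr because LocalEngine may swap in a new driver when the
+        //  executor config changes; the backend keeps its own reference alive)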
+ std::vector _executors; //! drivers to be used std::unique_ptr _compiler; HistoryServerConnection _historyConn; @@ -187,6 +188,7 @@ namespace tuplex { /*! * Trim list of partitions so that it includes up to the first n rows and the last m rows + * if n + m > number of rows in input partitions, the partitions will remain unchanged * @param partitions [in,out] the list of partitions to trim * @param topLimit n, the number of top rows to include * @param bottomLimit m, the number of bottom rows to include @@ -197,7 +199,7 @@ namespace tuplex { TransformStage *tstage, Executor *exec); /*! - * Create a new partition with the same data as the specified partition, but with the first n rows removed + * Create a newly allocated partition with the same data as the specified partition, but with the first n rows removed * @param p_in the input partition * @param numToSkip number of rows to remove from the new partition * @param tstage pointer to transform stage, used to generate new partition diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index d966b69ee..8ac5ba6df 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -180,6 +180,7 @@ namespace tuplex { void setOutputPrefix(const char* buf, size_t bufSize); // extra prefix to write first to output. void sinkOutputToHashTable(HashTableFormat fmt, int64_t outputDataSetID); + HashTableSink hashTableSink() const { return _htable; } // needs to be freed manually! void setOutputTopLimit(size_t limit) { @@ -190,16 +191,21 @@ namespace tuplex { _outBottomLimit = limit; } - // maxOrder of infinity means disregarding the bottomLimit short circuit - static void setMaxOrderAndResetLimits(size_t maxOrder = std::numeric_limits::max()) { - resetLimits(maxOrder); - } + /*! + * Set the maximum task order number that the current stage execute and reset the row counter. + * This is used to detect and stop the execution when we have reached the rows limit + * @param maxOrder maximum task order number in the pipeline, infinity means disregarding the bottomLimit short circuit + */ + static void setMaxOrderAndResetLimits(size_t maxOrder = std::numeric_limits::max()); void execute() override; bool hasFileSink() const { return _outputFilePath != URI::INVALID; } + bool hasFileSource() const { return _inputFilePath != URI::INVALID; } + bool hasMemorySink() const { return _outputSchema != Schema::UNKNOWN; } + bool hasMemorySource() const { return !_inputPartitions.empty(); } bool hasHashTableSink() const { return _htableFormat != HashTableFormat::UNKNOWN; } HashTableFormat hashTableFormat() const { return _htableFormat; } @@ -218,8 +224,6 @@ namespace tuplex { static codegen::i64_hash_row_f writeInt64HashTableAggregateCallback(); static codegen::write_row_f aggCombineCallback(); - static void resetLimits(size_t maxOrder); - // most be public because of C++ issues -.- int64_t writeRowToMemory(uint8_t* buf, int64_t bufSize); int64_t writeRowToFile(uint8_t* buf, int64_t bufSize); @@ -321,15 +325,26 @@ namespace tuplex { inline int64_t contextID() const { return _contextID; } inline void unlockAllMemorySinks() { // output partition existing? if so unlock - _output.unlock(); - _exceptions.unlock(); + _output.unlock(); + _exceptions.unlock(); } + /*! 
+ * check whether the stage reached both top and bottom limit, to use this one must call + * setMaxOrderAndResetLimits before execution and set both top and bottom limit + * @return true if limit is reached + */ bool limitReached() const; + + /*! + * Update the global stage limit counter, should only be called once, at the end of task + */ void updateLimits(); void processMemorySourceWithExp(); + void processMemorySource(); + void processFileSource(); // exceptions diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index 210b3ec60..b62946ae4 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -849,7 +849,6 @@ namespace tuplex { assert(numColumns != 0); // construct headers - std::vector headers(numColumns); if (!_columnNames.empty()) { assert(numColumns == _columnNames.size()); for (auto &c_name: _columnNames) { diff --git a/tuplex/core/src/LocalEngine.cc b/tuplex/core/src/LocalEngine.cc index 02c060a90..c9c6d506b 100644 --- a/tuplex/core/src/LocalEngine.cc +++ b/tuplex/core/src/LocalEngine.cc @@ -98,7 +98,8 @@ namespace tuplex { exec->processQueue(true); std::stringstream ss; - ss<<"started local executor "<name()<<" ("<name() << " (" << sizeToMemString(size) << ", " + << sizeToMemString(blockSize) << " default partition size)"; logger.info(ss.str()); } @@ -107,23 +108,44 @@ namespace tuplex { return execs; } - Executor* LocalEngine::getDriver(const size_t size, const size_t blockSize, const size_t runTimeMemory, - const size_t runTimeMemoryDefaultBlockSize, const tuplex::URI &cache_path) { - // lazy start driver - if(!_driver) { + std::shared_ptr + LocalEngine::getDriver(const size_t size, const size_t blockSize, const size_t runTimeMemory, + const size_t runTimeMemoryDefaultBlockSize, const tuplex::URI &cache_path) { + ExecutorConfig new_cfg = ExecutorConfig{ + ._size = size, + ._blockSize = blockSize, + ._runTimeMemory = runTimeMemory, + ._runTimeMemoryDefaultBlockSize = runTimeMemoryDefaultBlockSize, + ._cache_path = cache_path + }; + + if (!_driver || _driver_cfg != new_cfg) { + if (_driver) { + Logger::instance().logger("local execution engine").info( + "driver already exist, starting new driver with updated config"); + _driver->release(); // TODO(march): test whether we need this + } + + // lazy start driver URI uri = URI(cache_path.toString() + "/" + "driver"); - _driver = std::make_unique(size, blockSize, runTimeMemory, runTimeMemoryDefaultBlockSize, uri, "driver"); + _driver = std::make_shared(size, blockSize, runTimeMemory, runTimeMemoryDefaultBlockSize, uri, + "driver"); + _driver_cfg = new_cfg; + // TODO(march): this could be a problem, if multiple driver with number = 0 + // TODO(march): write a test for two drivers existing together (thread number 0) + // TODO(march): make a comment about potential issue here // driver always has thread number 0! 
_driver->setThreadNumber(0); std::stringstream ss; - ss<<"started driver ("< & executors, const Context* ctx) { diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 5616488e7..351d55b88 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -116,14 +116,14 @@ namespace tuplex { } Executor *LocalBackend::driver() { - assert(_driver); - return _driver; + assert(_driver); + return _driver.get(); } void LocalBackend::execute(tuplex::PhysicalStage *stage) { assert(stage); - if(!stage) + if (!stage) return; // history server connection should be established @@ -696,8 +696,6 @@ namespace tuplex { } } - // TODO(march): we can avoid setting order here by pre init g_rowsDone - // assign the order for all tasks for(size_t i = 0; i < tasks.size(); ++i) { tasks[i]->setOrder(i); @@ -899,7 +897,8 @@ namespace tuplex { auto output_par = tstage->inputPartitions(); if (tstage->hasOutputLimit()) { - trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage, _driver); + trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage, + _driver.get()); } tstage->setMemoryResult(output_par, std::vector{}, std::unordered_map(), pyObjects); pyObjects.clear(); @@ -1217,7 +1216,8 @@ namespace tuplex { if (tstage->hasOutputLimit()) { // the function expect the output to be sorted in ascending order (guaranteed by sortTasks()) - trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage, _driver); + trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage, + _driver.get()); } tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts); @@ -2232,8 +2232,9 @@ namespace tuplex { ptr += sizeof(int64_t); size_t numBytesToSkip = 0; + Deserializer ds(tstage->outputSchema()); for (unsigned i = 0; i < numToSkip; ++i) { - Row r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip); + Row r = Row::fromMemory(ds, ptr, p_in->capacity() - numBytesToSkip); ptr += r.serializedLength(); numBytesToSkip += r.serializedLength(); } diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index f289064d5..e88189447 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -240,7 +240,7 @@ namespace tuplex { // user wants to merge exceptions in order. bool updateInputExceptions = hasFilter && hasInputExceptions && _context.getOptions().OPT_MERGE_EXCEPTIONS_INORDER(); - // create transfrom stage via builder pattern + // create transform stage via builder pattern auto builder = codegen::StageBuilder(_num_stages++, isRootStage, _context.getOptions().UNDEFINED_BEHAVIOR_FOR_OPERATORS(), diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index bfd656dc8..0eb6d95ad 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -127,15 +127,19 @@ namespace tuplex { return vector{}; Deserializer ds(_schema); - for(int i = 0; i < limit;) { + for (size_t i = 0; i < limit;) { // all exhausted - if(_partitions.empty()) + if (_partitions.empty()) break; // get number of rows in first partition Partition *first = _partitions.front(); auto num_rows = first->getNumRows(); + + assert(num_rows >= _curRowCounter); + assert(limit >= i); + // how many left to retrieve? 
auto num_to_retrieve_from_partition = std::min(limit - i, num_rows - _curRowCounter); assert(num_to_retrieve_from_partition >= 0); @@ -145,8 +149,8 @@ namespace tuplex { // thread safe version (slow) // get next element of partition - const uint8_t* ptr = first->lock(); - for(int j = 0; j < num_to_retrieve_from_partition; ++j) { + const uint8_t *ptr = first->lock(); + for (size_t j = 0; j < num_to_retrieve_from_partition; ++j) { auto row = Row::fromMemory(ds, ptr + _byteCounter, first->capacity() - _byteCounter); _byteCounter += row.serializedLength(); _curRowCounter++; diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index 060365697..b0e2e70ab 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -139,10 +139,17 @@ namespace tuplex { } } + size_t maxRows; + if (hasOutputLimit()) { + maxRows = outputTopLimit() + outputBottomLimit(); + } else { + maxRows = std::numeric_limits::max(); + } + // put ALL partitions to result set _rs = std::make_shared(schema, partitions, generalCase, partitionToExceptionsMap, interpreterRows, - outputTopLimit() + outputBottomLimit()); + maxRows); } } diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index 8ebe18a7b..2de71e4fe 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -18,12 +18,20 @@ #include namespace tuplex { + // this is a logic to stop the execution once it has reached the topLimit and bottomLimit + // here, we assume that task order starts with zero and count up by 1, e.g. 0, 1, 2, ..., n + // To implement limit, we maintain a mapping from the task order to the number of rows done in that task + // (rows done are either 0 or #output rows after processing) + // we can then find out how many top rows are done by looking at g_rowsDone[0], g_rowsDone[1], ... + // until we reach some segment that's 0 + // likewise, we can find the bottom rows done by looking at g_rowsDone[g_maxOrder], g_rowsDone[g_maxOrder - 1], ... + // mapping from order number -> row count if the task is finished static std::mutex g_rowsDoneMutex; static std::unordered_map g_rowsDone; static std::atomic_size_t g_maxOrder; - void TransformTask::resetLimits(size_t maxOrder) { + void TransformTask::setMaxOrderAndResetLimits(size_t maxOrder) { g_rowsDone.clear(); g_maxOrder = maxOrder; } @@ -617,7 +625,6 @@ namespace tuplex { } } - // TODO: what is the max task number here if (_outBottomLimit == 0) { isBottomLimitReached = true; } else { diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 376134934..7eda223a1 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -172,6 +172,15 @@ def show(self, nrows=None): self._dataSet.show(nrows) + def _getHTMLRow(self, ind, row): + row_str = "" + row_str += " \n" + row_str += " {}\n".format(ind) + for col in row: + row_str += " {}\n".format(col) + row_str += " \n" + return row_str + def showHTMLPreview(self, topLimit=5, bottomLimit=5): """ action that generates a physical plan, processes data and return a subset of results as nicely formatted HTML table to stdout. @@ -183,14 +192,108 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): Returns: string: an HTML table showing a preview of the data """ + HTML_TEMPLATE = ( + "
\n" + "\n" + "\n" + " \n" + " \n" + "{}" + " \n" + " \n" + " \n" + "{}" + " \n" + "
\n" + "

{} columns

\n" + "
") + assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + # TODO(march): edit this top/bottom limit if topLimit is None or topLimit < 0: topLimit = -1 if bottomLimit is None or bottomLimit < 0: bottomLimit = -1 - return self._dataSet.showHTMLPreview(topLimit, bottomLimit) + rows = self.take(topLimit, bottomLimit) + + if len(rows) == 0: + return HTML_TEMPLATE.format("\n", "\n") + + assert topLimit == -1 or bottomLimit == -1 or len(rows) <= topLimit + bottomLimit + + headers_str = "" + body = "" + num_columns = None + + # construct tables + if len(rows) < topLimit + bottomLimit: + # the data is small so we get everything (no need to render ...) + i = 0 + for r in rows: + if i == 0: + # we set num columns based on the first row + num_columns = r.getNumColumns() + body += self._getHTMLRow(i, r) + i += 1 + else: + # some data is not processed because of limiting + i = 0 + for r in rows: + if i >= topLimit: + break + if i == 0: + # we set num columns based on the first row + num_columns = r.getNumColumns() + + body += self._getHTMLRow(i, r) + i += 1 + + # add the ... + body += " \n" + body += " ...\n" + for _ in range(num_columns): + body += " ...\n" + body += " \n" + + for j in range(i, len(rows)): + body += self._getHTMLRow(i, rows[j]) + + assert num_columns is not None + + # construct headers + column_names = self._dataSet.columns() + if column_names is not None: + assert (num_columns == column_names.size()) + for c_name in column_names: + headers_str += " {}\n".format(c_name) + else: + # default to generic name if column name doesn't exist + for i in range(num_columns): + headers_str += " Column {}\n".format(i) + + return HTML_TEMPLATE.format(headers_str, body, num_columns) + + def _getConsoleRow(self, ind, row): + # TODO(march): (work on this) + pass + + def showConsolePreview(self, topLimit=5, bottomLimit=5): + # TODO(march): (work on this) + pass def resolve(self, eclass, ftor): """ Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. diff --git a/tuplex/test/core/ContextBasics.cc b/tuplex/test/core/ContextBasics.cc index fdbdd8d50..0be3c6030 100644 --- a/tuplex/test/core/ContextBasics.cc +++ b/tuplex/test/core/ContextBasics.cc @@ -136,4 +136,58 @@ TEST_F(ContextBasicsTest, JSON) { auto str = ContextOptions::defaults().asJSON(); EXPECT_GT(str.length(), 2); -} \ No newline at end of file +} + +TEST_F(ContextBasicsTest, twoContextTest) { + using namespace tuplex; + + python::initInterpreter(); + python::unlockGIL(); + + ContextOptions co = testOptions(); + co.set("tuplex.partitionSize", "100B"); + co.set("tuplex.executorMemory", "1MB"); + co.set("tuplex.scratchDir", scratchDir + "/context1"); + + // second context with different executor config, should cause the driver to split up + ContextOptions co2 = testOptions(); + co.set("tuplex.partitionSize", "100B"); + co2.set("tuplex.executorMemory", "2MB"); + co2.set("tuplex.scratchDir", scratchDir + "/context2"); + + Context c1(co); + Context c2(co2); + Row row1(Tuple(0), Tuple("hello")); + Row row2(Tuple(1), Tuple("this")); + Row row3(Tuple(2), Tuple("is")); + Row row4(Tuple(3), Tuple("a")); + Row row5(Tuple(4), Tuple("test")); + + for (int t = 0; t < 10; t++) { + auto ds1 = c1.parallelize({row1, row2, row3, row4, row5}) + .map(UDF("lambda x: x[1][0]")); // new code: string index operator! first to raise an exception! 
+ + auto ds2 = c2.parallelize({row1, row2, row3, row4, row5}) + .map(UDF("lambda x: x[1][0]")); // new code: string index operator! first to raise an exception! + + auto v1 = ds1.collectAsVector(); + auto v2 = ds2.collectAsVector(); + + std::vector ref{"hello", "this", "is", "a", "test"}; + + EXPECT_EQ(v1.size(), 5); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(v1[i].getString(0), ref[i]); + } + + EXPECT_EQ(v2.size(), 5); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(v2[i].getString(0), ref[i]); + } + } + + python::lockGIL(); + python::closeInterpreter(); +} + +// TODO(march): multiple context test \ No newline at end of file diff --git a/tuplex/test/core/ResultSetTest.cc b/tuplex/test/core/ResultSetTest.cc index 4acd38921..2ea273062 100644 --- a/tuplex/test/core/ResultSetTest.cc +++ b/tuplex/test/core/ResultSetTest.cc @@ -14,7 +14,7 @@ class ResultSetTest : public PyTest { protected: - tuplex::Executor *driver; + std::shared_ptr driver; tuplex::ContextOptions options; public: // init function @@ -45,7 +45,8 @@ class ResultSetTest : public PyTest { EXPECT_EQ(r.getRowType(), first_type); // now write via partition writer - tuplex::PartitionWriter pw(driver, Schema(Schema::MemoryLayout::ROW, first_type), 0, 0, options.PARTITION_SIZE()); + tuplex::PartitionWriter pw(driver.get(), Schema(Schema::MemoryLayout::ROW, first_type), 0, 0, + options.PARTITION_SIZE()); for(const auto& r : rows) pw.writeRow(r); return pw.getOutputPartitions(); diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 98edecb41..86173e40b 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -205,16 +205,16 @@ vector generateMapFilterReferenceData(const vector &input, size_t topL } assert(input[0].getNumColumns() == 3); - vector intermedate; + vector intermediate; for (const Row &r: input) { int64_t new_a = r.getInt(0) + r.getInt(1); if (new_a % 2 == 0) { - intermedate.emplace_back(new_a, r.getInt(2)); + intermediate.emplace_back(new_a, r.getInt(2)); } } - return generateReferenceData(intermedate, topLimit, bottomLimit); + return generateReferenceData(intermediate, topLimit, bottomLimit); } TEST_F(TakeTest, takeMapFilterTest) { @@ -256,9 +256,6 @@ TEST_F(TakeTest, takeMapFilterTest) { } } -// TODO(march): with file input -// context.csv("../resources/"); - TEST_F(TakeTest, collectIdentityTest) { mt19937 data_seed_gen(123454); @@ -284,4 +281,67 @@ TEST_F(TakeTest, collectIdentityTest) { } } +TEST_F(TakeTest, fileInputTest) { + const std::vector test_size{1, 10, 100, 1001, 50001}; + const std::vector limit_values{0, 1, 5, 11, 600, 10000}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + std::vector> expected_outputs; + + if (!boost::filesystem::exists(scratchDir)) { + boost::filesystem::create_directory(scratchDir); + } + + std::vector fileInputNames; + for (unsigned long N: test_size) { + std::vector ref_output; + // write temp file + auto fName = fmt::format("{}/{}-{}.csv", scratchDir, testName, N); + + FILE *fp = fopen(fName.c_str(), "w"); + ASSERT_TRUE(fp); + fprintf(fp, "colA,colStr,colB\n"); + for (int i = 0; i < N; ++i) { + fprintf(fp, "%d,\"hello%d\",%d\n", i, (i * 3) % 7, i % 15); + ref_output.emplace_back(i, fmt::format("hello{}", (i * 3) % 7), (i % 15) * (i % 15)); + } + fclose(fp); + + expected_outputs.push_back(std::move(ref_output)); + fileInputNames.push_back(fName); + } + + ASSERT_TRUE(expected_outputs.size() == test_size.size()); + ASSERT_TRUE(fileInputNames.size() == test_size.size()); + + for (auto &part_size: partition_sizes) { + auto opt = 
microTestOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for (int t = 0; t < test_size.size(); t++) { + const size_t data_size = test_size[t]; + + for (auto top_limit: limit_values) { + for (auto bottom_limit: limit_values) { + std::cout << "file testing with partition size:" << part_size << " data size:" + << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + + auto ref_output = generateReferenceData(expected_outputs[t], top_limit, bottom_limit); + auto res = context.csv(testName + ".csv") + .mapColumn("colB", UDF("lambda x: x * x")) + .take(top_limit, bottom_limit); + + ASSERT_EQ(ref_output.size(), res->rowCount()); + for (Row &r: ref_output) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } + } + } +} + // TODO(march): write test for trimPartitionsToLimit \ No newline at end of file From 2005458822a3e8f03c9eb5cea95c04f13178d6b3 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 15 Apr 2022 02:50:20 -0400 Subject: [PATCH 20/56] Add file testcases --- tuplex/core/src/LocalEngine.cc | 6 ++---- tuplex/test/core/ContextBasics.cc | 4 +--- tuplex/test/core/TakeTest.cc | 22 ++++++++++++---------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/tuplex/core/src/LocalEngine.cc b/tuplex/core/src/LocalEngine.cc index c9c6d506b..91892d44d 100644 --- a/tuplex/core/src/LocalEngine.cc +++ b/tuplex/core/src/LocalEngine.cc @@ -123,7 +123,6 @@ namespace tuplex { if (_driver) { Logger::instance().logger("local execution engine").info( "driver already exist, starting new driver with updated config"); - _driver->release(); // TODO(march): test whether we need this } // lazy start driver @@ -132,10 +131,9 @@ namespace tuplex { "driver"); _driver_cfg = new_cfg; - // TODO(march): this could be a problem, if multiple driver with number = 0 - // TODO(march): write a test for two drivers existing together (thread number 0) - // TODO(march): make a comment about potential issue here // driver always has thread number 0! + // Note: this could be a potential issue if the config change and the old driver is still running + // due to external reference. 
Then there could be two executors with the same number _driver->setThreadNumber(0); std::stringstream ss; diff --git a/tuplex/test/core/ContextBasics.cc b/tuplex/test/core/ContextBasics.cc index 0be3c6030..e85107b40 100644 --- a/tuplex/test/core/ContextBasics.cc +++ b/tuplex/test/core/ContextBasics.cc @@ -188,6 +188,4 @@ TEST_F(ContextBasicsTest, twoContextTest) { python::lockGIL(); python::closeInterpreter(); -} - -// TODO(march): multiple context test \ No newline at end of file +} \ No newline at end of file diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 86173e40b..eda609518 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -282,9 +282,9 @@ TEST_F(TakeTest, collectIdentityTest) { } TEST_F(TakeTest, fileInputTest) { - const std::vector test_size{1, 10, 100, 1001, 50001}; - const std::vector limit_values{0, 1, 5, 11, 600, 10000}; - const std::vector partition_sizes{"256B", "512KB", "1MB"}; + const std::vector test_size{1, 10, 1001, 50001}; + const std::vector limit_values{0, 1, 6, 600, 10000}; + const std::vector partition_sizes{"256B", "1MB"}; std::vector> expected_outputs; if (!boost::filesystem::exists(scratchDir)) { @@ -327,21 +327,23 @@ TEST_F(TakeTest, fileInputTest) { << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; auto ref_output = generateReferenceData(expected_outputs[t], top_limit, bottom_limit); - auto res = context.csv(testName + ".csv") + auto res = context.csv(fileInputNames[t]) .mapColumn("colB", UDF("lambda x: x * x")) .take(top_limit, bottom_limit); ASSERT_EQ(ref_output.size(), res->rowCount()); for (Row &r: ref_output) { Row res_row = res->getNextRow(); - if (!(res_row == r)) { - ASSERT_EQ(res_row, r); - } + ASSERT_EQ(res_row.getInt(0), r.getInt(0)); + ASSERT_EQ(res_row.getString(1), r.getString(1)); + ASSERT_EQ(res_row.getInt(2), r.getInt(2)); + // TODO(march): this doesn't work because schema are different (for some reason infer as opt[int]?) 
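+                        // (whole-row equality fails here since the CSV reader infers the int
+                        //  columns as opt[int], while the reference rows carry plain ints)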
+ // if (!(res_row == r)) { + // ASSERT_EQ(res_row, r); + // } } } } } } -} - -// TODO(march): write test for trimPartitionsToLimit \ No newline at end of file +} \ No newline at end of file From 41b04a75e945865bbaca5f230ccfd65fc14b5629 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Tue, 19 Apr 2022 23:45:01 -0400 Subject: [PATCH 21/56] Python Dataset Debug --- tuplex/python/src/PythonDataSet.cc | 4 + tuplex/python/tuplex/dataset.py | 128 +++++++++++++++------ tuplex/python/tuplex/utils/table_format.py | 80 +++++++++++++ tuplex/test/core/TakeTest.cc | 8 +- 4 files changed, 178 insertions(+), 42 deletions(-) create mode 100644 tuplex/python/tuplex/utils/table_format.py diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 1f543e5d2..5382ad24d 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -133,11 +133,15 @@ namespace tuplex { size_t castedTopLimit = 0; if (topLimit < 0) { castedTopLimit = std::numeric_limits::max(); + } else { + castedTopLimit = topLimit; } size_t castedBottomLimit = 0; if (bottomLimit < 0) { castedBottomLimit = std::numeric_limits::max(); + } else { + castedBottomLimit = bottomLimit; } try { diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 7eda223a1..c0b9ef4d0 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -19,6 +19,7 @@ from tuplex.utils.framework import UDFCodeExtractionError from tuplex.utils.source_vault import SourceVault from .exceptions import classToExceptionCode +import tuplex.utils.table_format as table_format # signed 64bit limit max_rows = 9223372036854775807 @@ -29,7 +30,10 @@ def __init__(self): self._dataSet = None def _repr_html_(self): - return self._dataSet.showHTMLPreview() + return self.showHTMLPreview() + + def __repr__(self): + return self.showStrPreview() def unique(self): """ removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement. @@ -172,15 +176,6 @@ def show(self, nrows=None): self._dataSet.show(nrows) - def _getHTMLRow(self, ind, row): - row_str = "" - row_str += " \n" - row_str += " {}\n".format(ind) - for col in row: - row_str += " {}\n".format(col) - row_str += " \n" - return row_str - def showHTMLPreview(self, topLimit=5, bottomLimit=5): """ action that generates a physical plan, processes data and return a subset of results as nicely formatted HTML table to stdout. @@ -195,17 +190,17 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): HTML_TEMPLATE = ( "
\n" "\n" "\n" " \n" @@ -222,16 +217,10 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - # TODO(march): edit this top/bottom limit - if topLimit is None or topLimit < 0: - topLimit = -1 - if bottomLimit is None or bottomLimit < 0: - bottomLimit = -1 - rows = self.take(topLimit, bottomLimit) if len(rows) == 0: - return HTML_TEMPLATE.format("\n", "\n") + return HTML_TEMPLATE.format("\n", "\n", 0) assert topLimit == -1 or bottomLimit == -1 or len(rows) <= topLimit + bottomLimit @@ -246,8 +235,8 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): for r in rows: if i == 0: # we set num columns based on the first row - num_columns = r.getNumColumns() - body += self._getHTMLRow(i, r) + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 + body += table_format.getHTMLRow(i, r) i += 1 else: # some data is not processed because of limiting @@ -257,9 +246,9 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): break if i == 0: # we set num columns based on the first row - num_columns = r.getNumColumns() + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 - body += self._getHTMLRow(i, r) + body += table_format.getHTMLRow(i, r) i += 1 # add the ... @@ -270,14 +259,15 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): body += " \n" for j in range(i, len(rows)): - body += self._getHTMLRow(i, rows[j]) + body += table_format.getHTMLRow(len(rows) - j, rows[j]) assert num_columns is not None # construct headers column_names = self._dataSet.columns() - if column_names is not None: - assert (num_columns == column_names.size()) + headers_str += " \n" + if len(column_names) > 0: + assert (num_columns == len(column_names)) for c_name in column_names: headers_str += " \n".format(c_name) else: @@ -287,13 +277,79 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): return HTML_TEMPLATE.format(headers_str, body, num_columns) - def _getConsoleRow(self, ind, row): - # TODO(march): (work on this) - pass + def showStrPreview(self, topLimit=5, bottomLimit=5): + """ action that generates a physical plan, processes data and return a subset of results as nicely formatted + ASCII table to stdout. + + Args: + topLimit (int): number of top rows to collect. If ``None`` all rows will be collected + bottomLimit (int): number of bottom rows to collect. If ``None`` all rows will be collected + + Returns: + string: an HTML table showing a preview of the data + """ + assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + + rows = self.take(topLimit, bottomLimit) + + if len(rows) == 0: + return ( + "---\n" + "| |\n" + "---\n" + "0 columns\n") + + assert topLimit == -1 or bottomLimit == -1 or len(rows) <= topLimit + bottomLimit + + str_table = [] + num_columns = None + + # construct tables + if len(rows) < topLimit + bottomLimit: + # the data is small so we get everything (no need to render ...) 
+ i = 0 + for r in rows: + if i == 0: + # we set num columns based on the first row + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 + str_table.append(table_format.getStrTableRow(i, r)) + i += 1 + else: + # some data is not processed because of limiting + i = 0 + for r in rows: + if i >= topLimit: + break + if i == 0: + # we set num columns based on the first row + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 + + str_table.append(table_format.getStrTableRow(i, r)) + i += 1 + + # add the ... + str_table.append(["..."] * (num_columns + 1)) + + for j in range(i, len(rows)): + str_table.append(table_format.getStrTableRow(len(rows) - j, rows[j])) + + assert num_columns is not None + + # construct headers + column_names = self._dataSet.columns() + headers_list = [""] + if len(column_names) > 0: + assert (num_columns == len(column_names)) + for c_name in column_names: + headers_list.append("{}".format(c_name)) + else: + # default to generic name if column name doesn't exist + for i in range(num_columns): + headers_list.append("Column {}".format(i)) + + str_table = [headers_list] + str_table - def showConsolePreview(self, topLimit=5, bottomLimit=5): - # TODO(march): (work on this) - pass + return table_format.generateStrTable(num_columns + 1, str_table) def resolve(self, eclass, ftor): """ Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. diff --git a/tuplex/python/tuplex/utils/table_format.py b/tuplex/python/tuplex/utils/table_format.py new file mode 100644 index 000000000..bb83118b4 --- /dev/null +++ b/tuplex/python/tuplex/utils/table_format.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +# ----------------------------------------------------------------------------------------------------------------------# +# # +# Tuplex: Blazing Fast Python Data Science # +# # +# # +# (c) 2017 - 2021, Tuplex team # +# Created by Leonhard Spiegelberg first on 4/19/2022 # +# License: Apache 2.0 # +# ----------------------------------------------------------------------------------------------------------------------# + +def getHTMLRow(ind, row): + """ + Given a row, converts all the contents to an HTML row and return + :param ind: the index of that row + :param row: a row output from dataset + :return: an HTML row, representative of the row + """ + row_str = "" + row_str += " \n" + row_str += " \n".format(ind) + if isinstance(row, list) or isinstance(row, tuple): + for col in row: + row_str += " \n".format(col) + else: + row_str += " \n".format(row) + row_str += " \n" + return row_str + + +def getStrTableRow(ind, row): + """ + Given a row, converts all the contents to string and return + :param ind: the index of that row + :param row: a row output from dataset + :return: a list of string, representative of the row + """ + row_str_list = ["{}".format(ind)] + if isinstance(row, list) or isinstance(row, tuple): + for col in row: + row_str_list.append("{}".format(col)) + else: + row_str_list.append("{}".format(row)) + return row_str_list + + +def _getLineDivider(col_width): + out = "" + for w in col_width: + out += "+" + ("-" * (w + 2)) + out += "+\n" + + return out + +def generateStrTable(numCols, strTable): + """ + Given a 2-dimensional list of strings, print a nicely formatted table of the contents in the list + :param numCols: number of columns in the table + :param strTable: 2-dimensional list of strings, as list of list + :return: a nicely formatted table in string + """ + 
max_col_width = [0] * numCols + + for r in strTable: + for i in range(0, len(r)): + assert (isinstance(r[i], str)) + if len(r[i]) > max_col_width[i]: + max_col_width[i] = len(r[i]) + + output_str = "" + + for r in strTable: + output_str += _getLineDivider(max_col_width) + for i in range(0, len(r)): + output_str += "| {:<{width}} ".format(r[i], width=max_col_width[i]) + output_str += "|\n" + + output_str += _getLineDivider(max_col_width) + "{} columns\n".format(numCols) + + return output_str diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index eda609518..4e4a70f53 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -282,8 +282,8 @@ TEST_F(TakeTest, collectIdentityTest) { } TEST_F(TakeTest, fileInputTest) { - const std::vector test_size{1, 10, 1001, 50001}; - const std::vector limit_values{0, 1, 6, 600, 10000}; + const std::vector test_size{1, 1001, 50001}; + const std::vector limit_values{0, 1, 600, 10000}; const std::vector partition_sizes{"256B", "1MB"}; std::vector> expected_outputs; @@ -337,10 +337,6 @@ TEST_F(TakeTest, fileInputTest) { ASSERT_EQ(res_row.getInt(0), r.getInt(0)); ASSERT_EQ(res_row.getString(1), r.getString(1)); ASSERT_EQ(res_row.getInt(2), r.getInt(2)); - // TODO(march): this doesn't work because schema are different (for some reason infer as opt[int]?) - // if (!(res_row == r)) { - // ASSERT_EQ(res_row, r); - // } } } } From fc751f190b4c6f97af2b60f46b3fa2c25675ae7f Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Tue, 19 Apr 2022 23:47:17 -0400 Subject: [PATCH 22/56] Remove showHTMLPreview from Dataset in C++ --- tuplex/core/include/DataSet.h | 8 -- tuplex/core/src/DataSet.cc | 108 -------------------------- tuplex/python/include/PythonDataSet.h | 1 - tuplex/python/src/PythonBindings.cc | 1 - tuplex/python/src/PythonDataSet.cc | 49 ------------ 5 files changed, 167 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 3a5f450ac..1b11c1f75 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -130,14 +130,6 @@ namespace tuplex { */ virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); - /*! - * Displays a formatted HTML table of a small portion of the data - * @param topLimit how many top rows to print - * @param bottomLimit how many bottom rows to print - * @param os ostream where to print table to - */ - virtual void showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); - // named dataset management functions /*! * map Column using a UDF diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index b62946ae4..d54edb567 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -756,114 +756,6 @@ namespace tuplex { printTable(os, headers, rows); } - void printHTMLRow(std::ostream &os, size_t ind, const Row& r) { - os << " \n"; - os << fmt::format(" \n", ind); - for (auto& s : r.getAsStrings()) { - os << fmt::format(" \n", s); - } - os << " \n"; - } - - void DataSet::showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os) { - std::string HTML_TEMPLATE = - "
\n" - "\n" - "
{}
{}{}{}
{}{}
\n" - " \n" - " \n" - "{}" - " \n" - " \n" - " \n" - "{}" - " \n" - "
\n" - "

{} columns

\n" - "
"; - - assert(_context); - - auto rows = take(topLimit, bottomLimit); - - if (rows->rowCount() == 0) { - os << fmt::format(HTML_TEMPLATE, "\n", "\n", 0); - return; - } - - std::stringstream headers_stream, body_stream; - size_t numColumns = 0; - assert(rows->rowCount() <= topLimit + bottomLimit); - - // construct tables - if (rows->rowCount() < topLimit + bottomLimit) { - // the data is small so we get everything (no need to render ...) - for (size_t i = 0; rows->hasNextRow(); i++) { - Row r = rows->getNextRow(); - if (i == 0) { - // we set num columns based on the first row - numColumns = r.getNumColumns(); - } - - printHTMLRow(body_stream, i, r); - } - } else { - // some data is not processed because of limiting - size_t i; - for (i = 0; rows->hasNextRow() && i < topLimit; i++) { - Row r = rows->getNextRow(); - if (i == 0) { - // we set num columns based on the first row - numColumns = r.getNumColumns(); - } - - printHTMLRow(body_stream, i, r); - } - - // add the ... - body_stream << " \n"; - body_stream << " ...\n"; - for(int j = 0; j < numColumns; j++) { - body_stream << " ...\n"; - body_stream << " \n"; - } - - while (rows->hasNextRow()) { - Row r = rows->getNextRow(); - printHTMLRow(body_stream, i, r); - } - } - - assert(numColumns != 0); - - // construct headers - if (!_columnNames.empty()) { - assert(numColumns == _columnNames.size()); - for (auto &c_name: _columnNames) { - headers_stream << fmt::format(" {}\n", c_name); - } - } else { - // default to generic name if column name doesn't exist - for (int i = 0; i < numColumns; ++i) { - headers_stream << fmt::format(" Column {}\n", i); - } - } - - os << fmt::format(HTML_TEMPLATE, headers_stream.str(), body_stream.str(), numColumns); - } - Schema DataSet::schema() const { if(!_operator) return Schema::UNKNOWN; diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 4761ac7f0..ede482d9c 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -79,7 +79,6 @@ namespace tuplex { py::object collect(); py::object take(const int64_t topLimit, const int64_t bottomLimit); void show(const int64_t numRows=-1); - std::string showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit); // DataFrame like operations PythonDataSet mapColumn(const std::string& column, const std::string& lambda_code, const std::string& pickled_code, const py::object& closure=py::object()); diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index ab239a1a2..6b3683853 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -41,7 +41,6 @@ PYMODULE { py::class_(m, "_DataSet") .def("show", &tuplex::PythonDataSet::show) - .def("showHTMLPreview", &tuplex::PythonDataSet::showHTMLPreview) .def("collect", &tuplex::PythonDataSet::collect) .def("take", &tuplex::PythonDataSet::take) .def("map", &tuplex::PythonDataSet::map) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 5382ad24d..ec972a899 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -884,55 +884,6 @@ namespace tuplex { } } - std::string PythonDataSet::showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit) { - // make sure a dataset is wrapped - assert(this->_dataset); - - // is callee error dataset? 
if so return list with error string - if (this->_dataset->isError()) { - auto errset = dynamic_cast(this->_dataset); - assert(errset); - return "Error: " + errset->getError(); - } else { - // release GIL & hand over everything to Tuplex - assert(PyGILState_Check()); // make sure this thread holds the GIL! - python::unlockGIL(); - - std::stringstream ss; - std::string err_message; - - size_t castedTopLimit = 0; - if (topLimit < 0) { - castedTopLimit = std::numeric_limits::max(); - } - - size_t castedBottomLimit = 0; - if (bottomLimit < 0) { - castedBottomLimit = std::numeric_limits::max(); - } - - try { - this->_dataset->showHTMLPreview(castedTopLimit, castedBottomLimit, ss); - } catch (const std::exception &e) { - err_message = e.what(); - Logger::instance().defaultLogger().error(err_message); - } catch (...) { - err_message = "unknown C++ exception occurred, please change type."; - Logger::instance().defaultLogger().error(err_message); - } - - // reacquire GIL - python::lockGIL(); - Logger::instance().flushToPython(); - - if (!ss.str().empty() && err_message.empty()) { - return ss.str(); - } else { - return "Error occurred: " + err_message; - } - } - } - PyObject* PythonDataSet::anyToCPythonWithPyObjects(ResultSet* rs, size_t maxRowCount) { assert(rs); From 6b5c692e353582b9a012d26d967ea939f8236c84 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 20 Apr 2022 00:15:03 -0400 Subject: [PATCH 23/56] Separate out partition utils --- tuplex/core/include/PartitionUtils.h | 46 +++++++ tuplex/core/include/ee/local/LocalBackend.h | 22 ---- tuplex/core/src/PartitionUtils.cc | 138 ++++++++++++++++++++ tuplex/core/src/ee/local/LocalBackend.cc | 125 +----------------- tuplex/python/tuplex/utils/table_format.py | 2 +- tuplex/test/core/TakeTest.cc | 2 +- 6 files changed, 187 insertions(+), 148 deletions(-) create mode 100644 tuplex/core/include/PartitionUtils.h create mode 100644 tuplex/core/src/PartitionUtils.cc diff --git a/tuplex/core/include/PartitionUtils.h b/tuplex/core/include/PartitionUtils.h new file mode 100644 index 000000000..d247edcfc --- /dev/null +++ b/tuplex/core/include/PartitionUtils.h @@ -0,0 +1,46 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by March Boonyapaluk first on 4/19/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#ifndef TUPLEX_PARTITIONUTILS_H +#define TUPLEX_PARTITIONUTILS_H + +#include +#include +#include + +namespace tuplex { + /*! + * Trim list of partitions so that it includes up to the first n rows and the last m rows + * if n + m > number of rows in input partitions, the partitions will remain unchanged + * @param partitions [in,out] the list of partitions to trim + * @param topLimit n, the number of top rows to include + * @param bottomLimit m, the number of bottom rows to include + * @param tstage pointer to transform stage, might be used to generate new partition + * @param exec pointer to executor, might be used to allocate new partition + */ + void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, + TransformStage *tstage, Executor *exec); + + /*! 
+ * Create a newly allocated partition with the same data as the specified partition, but with the first n rows removed + * @param p_in the input partition + * @param numToSkip number of rows to remove from the new partition + * @param tstage pointer to transform stage, used to generate new partition + * @param exec pointer to executor, used to allocate new partition + * @return the new partition + */ + Partition *newPartitionWithSkipRows(Partition *p_in, + size_t numToSkip, + TransformStage *tstage, + Executor *exec); + +} + +#endif //TUPLEX_PARTITIONUTILS_H diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index 3d73a5d9f..7f42ff1cb 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -185,28 +185,6 @@ namespace tuplex { * @return */ extern URI outputURI(const UDF& udf, const URI& baseURI, int64_t partNo, FileFormat fmt); - - /*! - * Trim list of partitions so that it includes up to the first n rows and the last m rows - * if n + m > number of rows in input partitions, the partitions will remain unchanged - * @param partitions [in,out] the list of partitions to trim - * @param topLimit n, the number of top rows to include - * @param bottomLimit m, the number of bottom rows to include - * @param tstage pointer to transform stage, might be used to generate new partition - * @param exec pointer to executor, might be used to allocate new partition - */ - extern void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, - TransformStage *tstage, Executor *exec); - - /*! - * Create a newly allocated partition with the same data as the specified partition, but with the first n rows removed - * @param p_in the input partition - * @param numToSkip number of rows to remove from the new partition - * @param tstage pointer to transform stage, used to generate new partition - * @param exec pointer to executor, used to allocate new partition - * @return the new partition - */ - extern Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec); } #endif //TUPLEX_LOCALBACKEND_H \ No newline at end of file diff --git a/tuplex/core/src/PartitionUtils.cc b/tuplex/core/src/PartitionUtils.cc new file mode 100644 index 000000000..745332c93 --- /dev/null +++ b/tuplex/core/src/PartitionUtils.cc @@ -0,0 +1,138 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by March Boonyapaluk first on 4/19/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#include "PartitionUtils.h" + +namespace tuplex { + + void trimPartitionsToLimit(std::vector &partitions, + size_t topLimit, + size_t bottomLimit, + TransformStage* tstage, + Executor *exec) { + std::vector limitedPartitions, limitedTailPartitions; + + // check top output limit, adjust partitions if necessary + size_t numTopOutputRows = 0; + Partition *lastTopPart = nullptr; + size_t clippedTop = 0; + for (auto partition: partitions) { + numTopOutputRows += partition->getNumRows(); + lastTopPart = partition; + if (numTopOutputRows >= topLimit) { + // clip last partition & leave loop + clippedTop = topLimit - (numTopOutputRows - partition->getNumRows()); + assert(clippedTop <= 
partition->getNumRows()); + break; + } else if (partition == partitions.back()) { + // last partition, mark full row, but don't put to output set yet to avoid double put + clippedTop = partition->getNumRows(); + break; + } else { + // put full partition to output set + limitedPartitions.push_back(partition); + } + } + + // check the bottom output limit, adjust partitions if necessary + size_t numBottomOutputRows = 0; + size_t clippedBottom = 0; + for (auto it = partitions.rbegin(); it != partitions.rend(); it++) { + auto partition = *it; + numBottomOutputRows += partition->getNumRows(); + + if (partition == lastTopPart) { + // the bottom and the top partitions are overlapping + clippedBottom = bottomLimit - (numBottomOutputRows - partition->getNumRows()); + if (clippedTop + clippedBottom >= partition->getNumRows()) { + // if top and bottom range intersect, use full partitions + clippedTop = partition->getNumRows(); + clippedBottom = 0; + } + break; + } else if (numBottomOutputRows >= bottomLimit) { + // clip last partition & leave loop + auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); + assert(clipped <= partition->getNumRows()); + if (clipped > 0) { + Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage, + exec); + assert(newPart->getNumRows() == clipped); + limitedTailPartitions.push_back(newPart); + } + partition->invalidate(); + break; + } else { + // put full partition to output set + limitedTailPartitions.push_back(partition); + } + } + + // push the middle partition + if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) { + assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); + + // split into two partitions with both top and bottom are in the same partition + Partition *lastBottomPart = nullptr; + + if (clippedBottom != 0) { + lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, + tstage, exec); + } + + if (clippedTop != 0) { + lastTopPart->setNumRows(clippedTop); + limitedPartitions.push_back(lastTopPart); + } else { + lastTopPart->invalidate(); + } + + if (lastBottomPart != nullptr) { + limitedPartitions.push_back(lastBottomPart); + } + } + + // merge the head and tail partitions + partitions.clear(); + partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end()); + partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend()); + } + + Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec) { + auto ptr = p_in->lockRaw(); + auto num_rows = *((int64_t *) ptr); + assert(numToSkip < num_rows); + + ptr += sizeof(int64_t); + size_t numBytesToSkip = 0; + + Deserializer ds(tstage->outputSchema()); + for (unsigned i = 0; i < numToSkip; ++i) { + Row r = Row::fromMemory(ds, ptr, p_in->capacity() - numBytesToSkip); + ptr += r.serializedLength(); + numBytesToSkip += r.serializedLength(); + } + + Partition *p_out = exec->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), + tstage->outputSchema(), tstage->outputDataSetID(), + tstage->context().id()); + assert(p_out->capacity() >= p_in->size() - numBytesToSkip); + + auto ptr_out = p_out->lockRaw(); + *((int64_t *) ptr_out) = p_in->getNumRows() - numToSkip; + ptr_out += sizeof(int64_t); + memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); + p_out->unlock(); + + p_in->unlock(); + + return p_out; + } +} // namespace tuplex \ No newline at end of file 
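The row accounting above is easiest to sanity-check against a plain list model. A minimal Python sketch of the intended semantics of trimPartitionsToLimit (hypothetical helper name; partitions modeled as lists of rows, not Tuplex Partition objects):

    def trim_to_limit(parts, top, bottom):
        # Keep the first `top` and last `bottom` rows overall; when
        # top + bottom covers every row, return the input unchanged,
        # mirroring the contract in the header's doc comment.
        total = sum(len(p) for p in parts)
        if top + bottom >= total:
            return parts
        rows = [r for p in parts for r in p]
        out = [rows[:top]] if top else []
        if bottom:
            out.append(rows[total - bottom:])
        return out

    assert trim_to_limit([[1, 2, 3], [4, 5, 6]], 2, 1) == [[1, 2], [6]]
    assert trim_to_limit([[1, 2], [3]], 2, 2) == [[1, 2], [3]]

Unlike the C++ version, this model ignores partition boundaries and allocation; it only pins down which rows survive.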
diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 351d55b88..676a4e3b3 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -27,7 +27,7 @@ #include #include #include -#include +#include "PartitionUtils.h" namespace tuplex { @@ -2131,127 +2131,4 @@ namespace tuplex { tstage->setFileResult(ecounts); } - void trimPartitionsToLimit(std::vector &partitions, - size_t topLimit, - size_t bottomLimit, - TransformStage* tstage, - Executor *exec) { - std::vector limitedPartitions, limitedTailPartitions; - - // check top output limit, adjust partitions if necessary - size_t numTopOutputRows = 0; - Partition *lastTopPart = nullptr; - size_t clippedTop = 0; - for (auto partition: partitions) { - numTopOutputRows += partition->getNumRows(); - lastTopPart = partition; - if (numTopOutputRows >= topLimit) { - // clip last partition & leave loop - clippedTop = topLimit - (numTopOutputRows - partition->getNumRows()); - assert(clippedTop <= partition->getNumRows()); - break; - } else if (partition == partitions.back()) { - // last partition, mark full row, but don't put to output set yet to avoid double put - clippedTop = partition->getNumRows(); - break; - } else { - // put full partition to output set - limitedPartitions.push_back(partition); - } - } - - // check the bottom output limit, adjust partitions if necessary - size_t numBottomOutputRows = 0; - size_t clippedBottom = 0; - for (auto it = partitions.rbegin(); it != partitions.rend(); it++) { - auto partition = *it; - numBottomOutputRows += partition->getNumRows(); - - if (partition == lastTopPart) { - // the bottom and the top partitions are overlapping - clippedBottom = bottomLimit - (numBottomOutputRows - partition->getNumRows()); - if (clippedTop + clippedBottom >= partition->getNumRows()) { - // if top and bottom range intersect, use full partitions - clippedTop = partition->getNumRows(); - clippedBottom = 0; - } - break; - } else if (numBottomOutputRows >= bottomLimit) { - // clip last partition & leave loop - auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); - assert(clipped <= partition->getNumRows()); - if (clipped > 0) { - Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage, - exec); - assert(newPart->getNumRows() == clipped); - limitedTailPartitions.push_back(newPart); - } - partition->invalidate(); - break; - } else { - // put full partition to output set - limitedTailPartitions.push_back(partition); - } - } - - // push the middle partition - if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) { - assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); - - // split into two partitions with both top and bottom are in the same partition - Partition *lastBottomPart = nullptr; - - if (clippedBottom != 0) { - lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, - tstage, exec); - } - - if (clippedTop != 0) { - lastTopPart->setNumRows(clippedTop); - limitedPartitions.push_back(lastTopPart); - } else { - lastTopPart->invalidate(); - } - - if (lastBottomPart != nullptr) { - limitedPartitions.push_back(lastBottomPart); - } - } - - // merge the head and tail partitions - partitions.clear(); - partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end()); - partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend()); - } - - Partition 
*newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec) { - auto ptr = p_in->lockRaw(); - auto num_rows = *((int64_t *) ptr); - assert(numToSkip < num_rows); - - ptr += sizeof(int64_t); - size_t numBytesToSkip = 0; - - Deserializer ds(tstage->outputSchema()); - for (unsigned i = 0; i < numToSkip; ++i) { - Row r = Row::fromMemory(ds, ptr, p_in->capacity() - numBytesToSkip); - ptr += r.serializedLength(); - numBytesToSkip += r.serializedLength(); - } - - Partition *p_out = exec->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), - tstage->outputSchema(), tstage->outputDataSetID(), - tstage->context().id()); - assert(p_out->capacity() >= p_in->size() - numBytesToSkip); - - auto ptr_out = p_out->lockRaw(); - *((int64_t *) ptr_out) = p_in->getNumRows() - numToSkip; - ptr_out += sizeof(int64_t); - memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); - p_out->unlock(); - - p_in->unlock(); - - return p_out; - } } // namespace tuplex \ No newline at end of file diff --git a/tuplex/python/tuplex/utils/table_format.py b/tuplex/python/tuplex/utils/table_format.py index bb83118b4..ecd333f5a 100644 --- a/tuplex/python/tuplex/utils/table_format.py +++ b/tuplex/python/tuplex/utils/table_format.py @@ -5,7 +5,7 @@ # # # # # (c) 2017 - 2021, Tuplex team # -# Created by Leonhard Spiegelberg first on 4/19/2022 # +# Created by March Boonyapaluk first on 4/19/2022 # # License: Apache 2.0 # # ----------------------------------------------------------------------------------------------------------------------# diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 4e4a70f53..40b624ca8 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -4,7 +4,7 @@ // // // // // (c) 2017 - 2021, Tuplex team // -// Created by Leonhard Spiegelberg first on 1/1/2021 // +// Created by March Boonyapaluk first on 4/19/2021 // // License: Apache 2.0 // //--------------------------------------------------------------------------------------------------------------------// From a072e405956b2b09e0fa3e89e60830ca6fb5612a Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 20 Apr 2022 11:45:34 -0400 Subject: [PATCH 24/56] Fix Azure pipeline failing --- tuplex/python/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tuplex/python/CMakeLists.txt b/tuplex/python/CMakeLists.txt index b0b0e54c5..7ccb7057c 100644 --- a/tuplex/python/CMakeLists.txt +++ b/tuplex/python/CMakeLists.txt @@ -104,6 +104,7 @@ FILE(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/__init__.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/tracebacks.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/version.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/globs.py + ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/table_format.py DESTINATION ${PYTHON_DIST_DIR}/tuplex/utils) FILE(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tests/test_tuples.py From 5a1a3429ecdecdd88ea14ac2b246ce53ad3224a3 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Sat, 22 Jan 2022 17:37:43 -0500 Subject: [PATCH 25/56] Modify dataset --- tuplex/python/tuplex/dataset.py | 82 +++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index a2b8c0b33..aa5b1ca12 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -28,6 +28,88 @@ class DataSet: def __init__(self): self._dataSet = None + def getDataLen(self): + data = self.collect() + if len(data) == 0: + return 0, 0 + else: + return 
len(data), len(data[0]) + + def revTake(self, nRows = 5): + return self.collect()[-nRows:] + + def _repr_html_(self): + rows_list = self.take() + total_row_cnt, total_col_cnt = self.getDataLen() + print('rowlist') + print(rows_list) + if len(rows_list) == 0: + header = '<th></th>\n' + rows = '<tr></tr>\n' + else: + header = '<th></th>\n' + + if self.columns is not None: + for x in self.columns: + header += f'<th>{x}</th>\n' + else: + for i in range(len(rows_list[0])): + header += f'<th>column {i + 1}</th>\n' + + rows = '' + for i, r in enumerate(rows_list): + rows += '<tr>\n' + rows += f'<th>{i}</th>\n' + for data in r: + rows += f'<td>{data}</td>\n' + rows += '</tr>\n' + + # add the ... + rows += '<tr>\n' + rows += '<th>...</th>\n' + for i in range(total_col_cnt): + rows += '<td>...</td>\n' + rows += '</tr>\n' + + lastData = self.revTake() + for i, r in enumerate(lastData): + rows += '<tr>\n' + rows += f'<th>{total_row_cnt - len(lastData) + i}</th>\n' + for data in r: + rows += f'<td>{data}</td>\n' + rows += '</tr>\n' + + html_template = ( + '<div>\n' + '<table>\n' + '<thead>\n' + '<tr>\n' + f'{header}' + '</tr>\n' + '</thead>\n' + '<tbody>\n' + f'{rows}' + '</tbody>\n' + '</table>\n' + f'<p>{total_row_cnt} rows × {total_col_cnt} columns</p>\n' + '</div>
' + ) + + return html_template + def unique(self): """ removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement. Returns: From b68b4a10ed68598a3f0f318f3e8008c4d99e8a60 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Thu, 27 Jan 2022 22:12:09 -0500 Subject: [PATCH 26/56] Add in takeLast operator --- tuplex/core/include/DataSet.h | 2 + .../include/logical/LogicalOperatorType.h | 1 + .../core/include/logical/TakeLastOperator.h | 51 ++++++++++++++ tuplex/core/src/DataSet.cc | 24 +++++++ tuplex/core/src/logical/TaskLastOperator.cc | 43 ++++++++++++ tuplex/core/src/physical/PhysicalPlan.cc | 7 +- tuplex/python/include/PythonDataSet.h | 1 + tuplex/python/src/PythonBindings.cc | 1 + tuplex/python/src/PythonDataSet.cc | 69 +++++++++++++++++++ tuplex/python/tuplex/dataset.py | 17 +++++ 10 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 tuplex/core/include/logical/TakeLastOperator.h create mode 100644 tuplex/core/src/logical/TaskLastOperator.cc diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 899032723..429d8c6a7 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -269,6 +269,8 @@ namespace tuplex { virtual std::vector takeAsVector(int64_t numElements, std::ostream &os = std::cout); + virtual std::shared_ptr takeLast(int64_t numElements, std::ostream &os = std::cout); + /*! * saves dataset to file. There are multiple options to control the behavior * ==> 1.) files can be split across multiple ones. Specify number of files to split rows to diff --git a/tuplex/core/include/logical/LogicalOperatorType.h b/tuplex/core/include/logical/LogicalOperatorType.h index 594252820..b6a1c788b 100644 --- a/tuplex/core/include/logical/LogicalOperatorType.h +++ b/tuplex/core/include/logical/LogicalOperatorType.h @@ -17,6 +17,7 @@ namespace tuplex { MAP, FILTER, TAKE, // i.e. output to python / in memory + TAKELAST, PARALLELIZE, // i.e. 
input from python FILEINPUT, RESOLVE, diff --git a/tuplex/core/include/logical/TakeLastOperator.h b/tuplex/core/include/logical/TakeLastOperator.h new file mode 100644 index 000000000..28896e513 --- /dev/null +++ b/tuplex/core/include/logical/TakeLastOperator.h @@ -0,0 +1,51 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by Leonhard Spiegelberg first on 1/1/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#ifndef TUPLEX_TAKELASTOPERATOR_H +#define TUPLEX_TAKELASTOPERATOR_H + + +#include "LogicalOperator.h" + +namespace tuplex { + class TakeLastOperator : public LogicalOperator { + private: + int64_t _limit; + public: + LogicalOperator *clone() override; + + public: + TakeLastOperator(LogicalOperator *parent, const int64_t numElements); + + std::string name() override { + if(_limit < 0 || std::numeric_limits::max() == _limit) + return "collect"; + return "take"; + } + LogicalOperatorType type() const override { return LogicalOperatorType::TAKELAST; } + + bool isActionable() override { return true; } + + bool isDataSource() override { return false; } + + bool good() const override; + + int64_t limit() { return _limit; } + + + std::vector getSample(const size_t num) const override; + + Schema getInputSchema() const override { return getOutputSchema(); } + + std::vector columns() const override; + }; +} + +#endif //TUPLEX_TAKELASTOPERATOR_H \ No newline at end of file diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index a53a14094..66a6a548c 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +103,29 @@ namespace tuplex { return v; } + std::shared_ptr DataSet::takeLast(int64_t numElements, std::ostream &os) { + // error dataset? + if (isError()) + throw std::runtime_error("is error dataset!"); + + // negative numbers mean get all elements! + if (numElements < 0) + numElements = std::numeric_limits::max(); + + // create a take node + assert(_context); + LogicalOperator *op = _context->addOperator(new TakeLastOperator(this->_operator, numElements)); + DataSet *dsptr = _context->createDataSet(op->getOutputSchema()); + dsptr->_operator = op; + op->setDataSet(dsptr); + + // perform action. 
+ assert(this->_context); + auto rs = op->compute(*this->_context); + + return rs; + } + void DataSet::tofile(tuplex::FileFormat fmt, const tuplex::URI &uri, const tuplex::UDF &udf, size_t fileCount, size_t shardSize, const std::unordered_map &outputOptions, size_t limit, diff --git a/tuplex/core/src/logical/TaskLastOperator.cc b/tuplex/core/src/logical/TaskLastOperator.cc new file mode 100644 index 000000000..92295efb3 --- /dev/null +++ b/tuplex/core/src/logical/TaskLastOperator.cc @@ -0,0 +1,43 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by Leonhard Spiegelberg first on 1/1/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#include +#include + +namespace tuplex { + TakeLastOperator::TakeLastOperator(LogicalOperator *parent, const int64_t numElements) : LogicalOperator::LogicalOperator(parent), _limit(numElements) { + // take schema from parent node + setSchema(this->parent()->getOutputSchema()); + } + + bool TakeLastOperator::good() const { + return _limit >= -1; + } + + std::vector TakeLastOperator::getSample(const size_t num) const { + // take sample from parent + return parent()->getSample(num); + } + + std::vector TakeLastOperator::columns() const { + assert(parent()); + return parent()->columns(); + } + + LogicalOperator *TakeLastOperator::clone() { + // create clone of this operator + auto copy = new TakeLastOperator(parent()->clone(), _limit); + + copy->setDataSet(getDataSet()); // weak ptr to old dataset... + copy->copyMembers(this); + assert(getID() == copy->getID()); + return copy; + } +} \ No newline at end of file diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index 2399edf6f..87a73a712 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -208,7 +208,9 @@ namespace tuplex { if(ops.back()->isActionable()) { if(ops.back()->type() == LogicalOperatorType::FILEOUTPUT) outputMode = EndPointMode::FILE; - else if(ops.back()->type() == LogicalOperatorType::TAKE || ops.back()->type() == LogicalOperatorType::CACHE) { + else if(ops.back()->type() == LogicalOperatorType::TAKE || + ops.back()->type() == LogicalOperatorType::TAKELAST || + ops.back()->type() == LogicalOperatorType::CACHE) { // memory? outputMode = EndPointMode::MEMORY; } else @@ -382,6 +384,9 @@ namespace tuplex { if(outputNode->type() == LogicalOperatorType::TAKE) { auto top = static_cast(outputNode); builder.setOutputLimit(top->limit()); + } else if (outputNode->type() == LogicalOperatorType::TAKELAST) { + auto top = static_cast(outputNode); + builder.setOutputLimit(top->limit()); } // @TODO: add slowPip builder to this process... 
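The plumbing above follows one pattern: the sink operator carries the row limit, and the physical plan copies it into the stage builder for both TAKE and TAKELAST. A compressed, runnable Python analogue of that hand-off (all names hypothetical, not the Tuplex API):

    class SinkNode:
        def __init__(self, node_type, limit):
            self.type, self.limit = node_type, limit

    class StageBuilder:
        def __init__(self):
            self.output_limit = None

        def with_sink(self, node):
            # both take-style sinks cap the rows a stage may emit
            if node.type in ("TAKE", "TAKELAST"):
                self.output_limit = node.limit
            return self

    assert StageBuilder().with_sink(SinkNode("TAKELAST", 5)).output_limit == 5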
diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 665d68856..58827ea33 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -78,6 +78,7 @@ namespace tuplex { py::object collect(); py::object take(const int64_t numRows); + boost::python::object takeLast(const int64_t numRows); void show(const int64_t numRows=-1); // DataFrame like operations diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index 6b3683853..4d0b1f4e9 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -43,6 +43,7 @@ PYMODULE { .def("show", &tuplex::PythonDataSet::show) .def("collect", &tuplex::PythonDataSet::collect) .def("take", &tuplex::PythonDataSet::take) + .def("takeLast", &tuplex::PythonDataSet::takeLast) .def("map", &tuplex::PythonDataSet::map) .def("resolve", &tuplex::PythonDataSet::resolve) .def("ignore", &tuplex::PythonDataSet::ignore) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 36f9a392b..2e54deec5 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -176,6 +176,75 @@ namespace tuplex { } } + boost::python::object PythonDataSet::takeLast(const int64_t numRows) { + // make sure a dataset is wrapped + assert(this->_dataset); + + // is callee error dataset? if so return list with error string + if (this->_dataset->isError()) { + ErrorDataSet *eds = static_cast<ErrorDataSet*>(this->_dataset); + boost::python::list L; + L.append(eds->getError()); + // Logger::instance().flushAll(); + Logger::instance().flushToPython(); + return L; + } else { + std::stringstream ss; + + // release GIL & hand over everything to Tuplex + assert(PyGILState_Check()); // make sure this thread holds the GIL! + python::unlockGIL(); + + std::shared_ptr<ResultSet> rs; + std::string err_message = ""; + try { + rs = _dataset->takeLast(numRows, ss); + if(!rs) + throw std::runtime_error("invalid result set"); + // if there are more than 1 million (100k in debug mode) elements print message... + if (rs->rowCount() > LARGE_RESULT_SIZE) + Logger::instance().logger("python").info("transferring " + + std::to_string(rs->rowCount()) + + " elements back to Python. This might take a while..."); + } catch(const std::exception& e) { + err_message = e.what(); + Logger::instance().defaultLogger().error(err_message); + } catch(...) { + err_message = "unknown C++ exception occurred, please change type."; + Logger::instance().defaultLogger().error(err_message); + } + + // reacquire GIL + python::lockGIL(); + + // error?
then return list of error string + if(!rs || !err_message.empty()) { + // Logger::instance().flushAll(); + Logger::instance().flushToPython(); + auto listObj = PyList_New(1); + PyList_SetItem(listObj, 0, python::PyString_FromString(err_message.c_str())); + auto list = boost::python::object(boost::python::borrowed<>(listObj)); + return list; + } + + // collect results & transfer them back to python + // new version, directly interact with the interpreter + Timer timer; + // build python list object from resultset + auto listObj = resultSetToCPython(rs.get(), numRows); + Logger::instance().logger("python").info("Data transfer back to python took " + + std::to_string(timer.time()) + " seconds"); + // Logger::instance().flushAll(); + Logger::instance().flushToPython(); + + // print errors + if (ss.str().length() > 0) + PySys_FormatStdout("%s", ss.str().c_str()); + + return boost::python::object(boost::python::handle<>(listObj)); + } + } + PythonDataSet PythonDataSet::map(const std::string &lambda_code, const std::string &pickled_code, const py::object& closure) { auto& logger = Logger::instance().logger("python"); diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index aa5b1ca12..a1d838526 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -208,6 +208,23 @@ def take(self, nrows=5): return self._dataSet.take(nrows) + def takeLast(self, nrows=5): + """ action that generates a physical plan, processes data and collects the last results then as list of tuples. + + Args: + nrows (int): number of rows to collect. Per default ``5``. + Returns: + (list): A list of tuples + + """ + + assert isinstance(nrows, int), 'num rows must be an integer' + assert nrows > 0, 'please specify a number greater than zero' + + assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + + return self._dataSet.takeLast(nrows) + def show(self, nrows=None): """ action that generates a physical plan, processes data and prints results as nicely formatted ASCII table to stdout. 
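From the notebook side the new action mirrors take, just anchored at the tail of the dataset. A usage sketch (assumes a local build of this branch; single-column rows come back as plain values):

    from tuplex import Context

    c = Context()
    ds = c.parallelize([1, 2, 3, 4, 5, 6])
    print(ds.take(3))      # first rows: [1, 2, 3]
    print(ds.takeLast(2))  # last rows:  [5, 6]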
From 02b51aabb30541dcb14e9cc5d2c0aad3d421f1f8 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Thu, 27 Jan 2022 22:25:39 -0500 Subject: [PATCH 27/56] (wip) add reverse limit in partition --- tuplex/core/include/Partition.h | 15 +++++ tuplex/core/src/physical/TransformStage.cc | 51 ++++++++++++++++------ 2 files changed, 53 insertions(+), 13 deletions(-) diff --git a/tuplex/core/include/Partition.h b/tuplex/core/include/Partition.h index 9bc7fc54c..5a66023fd 100644 --- a/tuplex/core/include/Partition.h +++ b/tuplex/core/include/Partition.h @@ -248,6 +248,21 @@ namespace tuplex { _mutex.unlock(); } + void setNumLastRows(const size_t numRows) { + // TODO: set another value instead + _mutex.lock(); + + _numRows = numRows; + + // save to memptr + if(_arena) { + *((int64_t*)_arena) = numRows; + } + + _mutex.unlock(); + } + + int64_t getDataSetID() const { return _dataSetID; } diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index b61f9cbe2..9cd15694a 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -139,21 +139,46 @@ namespace tuplex { } // check output limit, adjust partitions if necessary - size_t numOutputRows = 0; - for (auto partition : partitions) { - numOutputRows += partition->getNumRows(); - if (numOutputRows >= outputLimit()) { - // clip last partition & leave loop - auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); - assert(clipped <= partition->getNumRows()); - partition->setNumRows(clipped); - if (clipped > 0) + // TODO: add reverse outputLimit condition here + if (true) { + size_t numOutputRows = 0; + for (auto partition : partitions) { + numOutputRows += partition->getNumRows(); + if (numOutputRows >= outputLimit()) { + // clip last partition & leave loop + auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); + assert(clipped <= partition->getNumRows()); + partition->setNumRows(clipped); + if (clipped > 0) limitedPartitions.push_back(partition); - break; - } else { - // put full partition to output set - limitedPartitions.push_back(partition); + break; + } else { + // put full partition to output set + limitedPartitions.push_back(partition); + } + } + } else { + size_t numOutputRows = 0; + for (auto partitionIt = partitions.rbegin(); + partitionIt != partitions.rend(); partitionIt++) { + auto partition = *partitionIt; + numOutputRows += partition->getNumRows(); + if (numOutputRows >= outputLimit()) { + // clip last partition & leave loop + auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); + assert(clipped <= partition->getNumRows()); + + + // TODO: do backward clip here instead + partition->setNumRows(clipped); + if (clipped > 0) + limitedPartitions.push_back(partition); + break; + } else { + // put full partition to output set + limitedPartitions.push_back(partition); + } } + + std::reverse(limitedPartitions.begin(), limitedPartitions.end()); } } From a072e405956b2b09e0fa3e89e60830ca6fb5612a Mon Sep 17 00:00:00 2001 From: korlamarch Date: Fri, 11 Feb 2022 09:20:06 -0500 Subject: [PATCH 28/56] Remove row count --- tuplex/python/tuplex/dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index a1d838526..976a751f4 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -28,19 +28,19 @@ class DataSet: def __init__(self): self._dataSet = None - def getDataLen(self): + def getColumnSize(self): data =
self.collect() + if len(data) == 0: + return 0 + else: + return len(data[0]) + + def revTake(self, nRows = 5): + return self.collect()[-nRows:] + + def _repr_html_(self): + rows_list = self.take() - total_row_cnt, total_col_cnt = self.getDataLen() + total_col_cnt = self.getColumnSize() print('rowlist') print(rows_list) if len(rows_list) == 0: @@ -74,7 +74,7 @@ def _repr_html_(self): lastData = self.revTake() for i, r in enumerate(lastData): rows += '<tr>\n' - rows += f'<th>{total_row_cnt - len(lastData) + i}</th>\n' + rows += f'<th>{0 - len(lastData) + i}</th>\n' for data in r: rows += f'<td>{data}</td>\n' rows += '</tr>\n' @@ -104,7 +104,7 @@ def _repr_html_(self): f'{rows}' '</tbody>\n' '</table>\n' - f'<p>{total_row_cnt} rows × {total_col_cnt} columns</p>\n' + f'<p>{total_col_cnt} columns</p>
\n' '' ) From 6955392a5b098709a1f4b7e8fa1cc487130b93c0 Mon Sep 17 00:00:00 2001 From: korlamarch Date: Tue, 15 Feb 2022 23:30:47 -0500 Subject: [PATCH 29/56] refactor TakeOperator --- tuplex/core/include/DataSet.h | 3 +- tuplex/core/include/EmptyDataset.h | 2 +- tuplex/core/include/ErrorDataSet.h | 2 +- .../include/logical/LogicalOperatorType.h | 1 - .../core/include/logical/TakeLastOperator.h | 51 ------------- tuplex/core/include/logical/TakeOperator.h | 10 ++- tuplex/core/src/DataSet.cc | 36 ++------- tuplex/core/src/EmptyDataset.cc | 4 +- tuplex/core/src/ErrorDataSet.cc | 4 +- tuplex/core/src/logical/TakeOperator.cc | 6 +- tuplex/core/src/logical/TaskLastOperator.cc | 43 ----------- tuplex/core/src/physical/PhysicalPlan.cc | 4 - tuplex/core/src/physical/StageBuilder.cc | 3 +- tuplex/core/src/physical/TransformStage.cc | 51 ++++--------- tuplex/python/include/PythonDataSet.h | 3 +- tuplex/python/src/PythonBindings.cc | 1 - tuplex/python/src/PythonDataSet.cc | 73 +------------------ tuplex/python/tuplex/dataset.py | 23 +----- 18 files changed, 44 insertions(+), 276 deletions(-) delete mode 100644 tuplex/core/include/logical/TakeLastOperator.h delete mode 100644 tuplex/core/src/logical/TaskLastOperator.cc diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 429d8c6a7..65a766a87 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -263,13 +263,12 @@ namespace tuplex { // these are actions that cause execution virtual std::shared_ptr collect(std::ostream &os = std::cout); - virtual std::shared_ptr take(int64_t numElements, std::ostream &os = std::cout); + virtual std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream &os = std::cout); virtual std::vector collectAsVector(std::ostream &os = std::cout); virtual std::vector takeAsVector(int64_t numElements, std::ostream &os = std::cout); - virtual std::shared_ptr takeLast(int64_t numElements, std::ostream &os = std::cout); /*! * saves dataset to file. 
There are multiple options to control the behavior diff --git a/tuplex/core/include/EmptyDataset.h b/tuplex/core/include/EmptyDataset.h index b3c1ed7af..0f8a1f52c 100644 --- a/tuplex/core/include/EmptyDataset.h +++ b/tuplex/core/include/EmptyDataset.h @@ -70,7 +70,7 @@ namespace tuplex { virtual std::shared_ptr collect(std::ostream& os) override; // take / collect will print out the error only - virtual std::shared_ptr take(int64_t numElements, std::ostream& os) override; + virtual std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream& os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; virtual std::vector collectAsVector(std::ostream& os) override; diff --git a/tuplex/core/include/ErrorDataSet.h b/tuplex/core/include/ErrorDataSet.h index 2f46d8638..34fc60685 100644 --- a/tuplex/core/include/ErrorDataSet.h +++ b/tuplex/core/include/ErrorDataSet.h @@ -90,7 +90,7 @@ namespace tuplex { std::shared_ptr collect(std::ostream& os) override; // take / collect will print out the error only - std::shared_ptr take(int64_t numElements, std::ostream& os) override; + std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream& os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; std::vector collectAsVector(std::ostream& os) override; diff --git a/tuplex/core/include/logical/LogicalOperatorType.h b/tuplex/core/include/logical/LogicalOperatorType.h index b6a1c788b..594252820 100644 --- a/tuplex/core/include/logical/LogicalOperatorType.h +++ b/tuplex/core/include/logical/LogicalOperatorType.h @@ -17,7 +17,6 @@ namespace tuplex { MAP, FILTER, TAKE, // i.e. output to python / in memory - TAKELAST, PARALLELIZE, // i.e. input from python FILEINPUT, RESOLVE, diff --git a/tuplex/core/include/logical/TakeLastOperator.h b/tuplex/core/include/logical/TakeLastOperator.h deleted file mode 100644 index 28896e513..000000000 --- a/tuplex/core/include/logical/TakeLastOperator.h +++ /dev/null @@ -1,51 +0,0 @@ -//--------------------------------------------------------------------------------------------------------------------// -// // -// Tuplex: Blazing Fast Python Data Science // -// // -// // -// (c) 2017 - 2021, Tuplex team // -// Created by Leonhard Spiegelberg first on 1/1/2021 // -// License: Apache 2.0 // -//--------------------------------------------------------------------------------------------------------------------// - -#ifndef TUPLEX_TAKELASTOPERATOR_H -#define TUPLEX_TAKELASTOPERATOR_H - - -#include "LogicalOperator.h" - -namespace tuplex { - class TakeLastOperator : public LogicalOperator { - private: - int64_t _limit; - public: - LogicalOperator *clone() override; - - public: - TakeLastOperator(LogicalOperator *parent, const int64_t numElements); - - std::string name() override { - if(_limit < 0 || std::numeric_limits::max() == _limit) - return "collect"; - return "take"; - } - LogicalOperatorType type() const override { return LogicalOperatorType::TAKELAST; } - - bool isActionable() override { return true; } - - bool isDataSource() override { return false; } - - bool good() const override; - - int64_t limit() { return _limit; } - - - std::vector getSample(const size_t num) const override; - - Schema getInputSchema() const override { return getOutputSchema(); } - - std::vector columns() const override; - }; -} - -#endif //TUPLEX_TAKELASTOPERATOR_H \ No newline at end of file diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h index 
8d0d6dcab..20c035a74 100644 --- a/tuplex/core/include/logical/TakeOperator.h +++ b/tuplex/core/include/logical/TakeOperator.h @@ -17,15 +17,16 @@ namespace tuplex { class TakeOperator : public LogicalOperator { private: - int64_t _limit; + int64_t _limitTop; + int64_t _limitBottom; public: LogicalOperator *clone() override; public: - TakeOperator(LogicalOperator *parent, const int64_t numElements); + TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom); std::string name() override { - if(_limit < 0 || std::numeric_limits::max() == _limit) + if(_limitTop < 0 || std::numeric_limits::max() == _limitTop) return "collect"; return "take"; } @@ -37,8 +38,9 @@ namespace tuplex { bool good() const override; - int64_t limit() { return _limit; } + int64_t limit() { return _limitTop; } + bool limitBottom() { return _limitBottom; } std::vector getSample(const size_t num) const override; diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index 66a6a548c..3de903d1c 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -39,21 +38,21 @@ namespace tuplex { } std::shared_ptr DataSet::collect(std::ostream &os) { - return take(-1, os); + return take(-1, false, os); } - std::shared_ptr DataSet::take(int64_t numElements, std::ostream &os) { + std::shared_ptr DataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) { // error dataset? if (isError()) throw std::runtime_error("is error dataset!"); // negative numbers mean get all elements! - if (numElements < 0) - numElements = std::numeric_limits::max(); + if (numTop < 0) + numTop = std::numeric_limits::max(); // create a take node assert(_context); - LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, numElements)); + LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, numTop, numBottom)); DataSet *dsptr = _context->createDataSet(op->getOutputSchema()); dsptr->_operator = op; op->setDataSet(dsptr); @@ -72,7 +71,7 @@ namespace tuplex { // -1 means to retrieve all elements std::vector DataSet::takeAsVector(int64_t numElements, std::ostream &os) { - auto rs = take(numElements, os); + auto rs = take(numElements, false, os); Timer timer; #warning "limiting should make this hack irrelevant..." @@ -103,29 +102,6 @@ namespace tuplex { return v; } - std::shared_ptr DataSet::takeLast(int64_t numElements, std::ostream &os) { - // error dataset? - if (isError()) - throw std::runtime_error("is error dataset!"); - - // negative numbers mean get all elements! - if (numElements < 0) - numElements = std::numeric_limits::max(); - - // create a take node - assert(_context); - LogicalOperator *op = _context->addOperator(new TakeLastOperator(this->_operator, numElements)); - DataSet *dsptr = _context->createDataSet(op->getOutputSchema()); - dsptr->_operator = op; - op->setDataSet(dsptr); - - // perform action. 
- assert(this->_context); - auto rs = op->compute(*this->_context); - - return rs; - } - void DataSet::tofile(tuplex::FileFormat fmt, const tuplex::URI &uri, const tuplex::UDF &udf, size_t fileCount, size_t shardSize, const std::unordered_map &outputOptions, size_t limit, diff --git a/tuplex/core/src/EmptyDataset.cc b/tuplex/core/src/EmptyDataset.cc index 984fa904f..7504e8499 100644 --- a/tuplex/core/src/EmptyDataset.cc +++ b/tuplex/core/src/EmptyDataset.cc @@ -11,7 +11,7 @@ #include namespace tuplex { - std::shared_ptr EmptyDataset::take(int64_t numElements, std::ostream &os) { + std::shared_ptr EmptyDataset::take(int64_t numTop, int64_t numBottom, std::ostream &os) { return std::make_shared(); } @@ -20,7 +20,7 @@ namespace tuplex { } std::shared_ptr EmptyDataset::collect(std::ostream &os) { - return take(0, os); + return take(0, false, os); } std::vector EmptyDataset::collectAsVector(std::ostream &os) { diff --git a/tuplex/core/src/ErrorDataSet.cc b/tuplex/core/src/ErrorDataSet.cc index 57c03ffba..9d19594f2 100644 --- a/tuplex/core/src/ErrorDataSet.cc +++ b/tuplex/core/src/ErrorDataSet.cc @@ -23,7 +23,7 @@ namespace tuplex { return takeAsVector(0, os); } - std::shared_ptr ErrorDataSet::take(int64_t numElements, std::ostream &os) { + std::shared_ptr ErrorDataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) { // return empty vector and print err message Logger::instance().logger("core").error(this->_error); @@ -31,7 +31,7 @@ namespace tuplex { } std::shared_ptr ErrorDataSet::collect(std::ostream &os) { - return take(0, os); + return take(0, false, os); } void diff --git a/tuplex/core/src/logical/TakeOperator.cc b/tuplex/core/src/logical/TakeOperator.cc index aa7c49668..e588b5e97 100644 --- a/tuplex/core/src/logical/TakeOperator.cc +++ b/tuplex/core/src/logical/TakeOperator.cc @@ -12,13 +12,13 @@ #include namespace tuplex { - TakeOperator::TakeOperator(LogicalOperator *parent, const int64_t numElements) : LogicalOperator::LogicalOperator(parent), _limit(numElements) { + TakeOperator::TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom) : LogicalOperator::LogicalOperator(parent), _limitTop(numTop), _limitBottom(numBottom) { // take schema from parent node setSchema(this->parent()->getOutputSchema()); } bool TakeOperator::good() const { - return _limit >= -1; + return _limitTop >= -1 && _limitBottom >= -1; } std::vector TakeOperator::getSample(const size_t num) const { @@ -33,7 +33,7 @@ namespace tuplex { LogicalOperator *TakeOperator::clone() { // create clone of this operator - auto copy = new TakeOperator(parent()->clone(), _limit); + auto copy = new TakeOperator(parent()->clone(), _limitTop, _limitBottom); copy->setDataSet(getDataSet()); // weak ptr to old dataset... 
copy->copyMembers(this); diff --git a/tuplex/core/src/logical/TaskLastOperator.cc b/tuplex/core/src/logical/TaskLastOperator.cc deleted file mode 100644 index 92295efb3..000000000 --- a/tuplex/core/src/logical/TaskLastOperator.cc +++ /dev/null @@ -1,43 +0,0 @@ -//--------------------------------------------------------------------------------------------------------------------// -// // -// Tuplex: Blazing Fast Python Data Science // -// // -// // -// (c) 2017 - 2021, Tuplex team // -// Created by Leonhard Spiegelberg first on 1/1/2021 // -// License: Apache 2.0 // -//--------------------------------------------------------------------------------------------------------------------// - -#include -#include - -namespace tuplex { - TakeLastOperator::TakeLastOperator(LogicalOperator *parent, const int64_t numElements) : LogicalOperator::LogicalOperator(parent), _limit(numElements) { - // take schema from parent node - setSchema(this->parent()->getOutputSchema()); - } - - bool TakeLastOperator::good() const { - return _limit >= -1; - } - - std::vector TakeLastOperator::getSample(const size_t num) const { - // take sample from parent - return parent()->getSample(num); - } - - std::vector TakeLastOperator::columns() const { - assert(parent()); - return parent()->columns(); - } - - LogicalOperator *TakeLastOperator::clone() { - // create clone of this operator - auto copy = new TakeLastOperator(parent()->clone(), _limit); - - copy->setDataSet(getDataSet()); // weak ptr to old dataset... - copy->copyMembers(this); - assert(getID() == copy->getID()); - return copy; - } -} \ No newline at end of file diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index 87a73a712..17a4c7c0e 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -209,7 +209,6 @@ namespace tuplex { if(ops.back()->type() == LogicalOperatorType::FILEOUTPUT) outputMode = EndPointMode::FILE; else if(ops.back()->type() == LogicalOperatorType::TAKE || - ops.back()->type() == LogicalOperatorType::TAKELAST || ops.back()->type() == LogicalOperatorType::CACHE) { // memory? outputMode = EndPointMode::MEMORY; @@ -384,9 +383,6 @@ namespace tuplex { if(outputNode->type() == LogicalOperatorType::TAKE) { auto top = static_cast(outputNode); builder.setOutputLimit(top->limit()); - } else if (outputNode->type() == LogicalOperatorType::TAKELAST) { - auto top = static_cast(outputNode); - builder.setOutputLimit(top->limit()); } // @TODO: add slowPip builder to this process... 
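With this refactor a single take node carries both limits instead of a separate takeLast operator. A row-level Python model of the new contract (hypothetical helper, not the Tuplex API; the expected values match the TakeTest cases added below):

    def take_rows(rows, n_top, n_bottom=0):
        # First n_top rows plus last n_bottom rows, in original order.
        # Negative n_top means collect everything; overlapping ranges
        # return each row exactly once.
        if n_top < 0 or n_top + n_bottom >= len(rows):
            return list(rows)
        tail = rows[len(rows) - n_bottom:] if n_bottom else []
        return rows[:n_top] + tail

    assert take_rows([1, 2, 3, 4, 5, 6], 1, 1) == [1, 6]
    assert take_rows([1, 2, 3, 4, 5, 6], 2, 1) == [1, 2, 6]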
diff --git a/tuplex/core/src/physical/StageBuilder.cc b/tuplex/core/src/physical/StageBuilder.cc index 72f01e2b8..0bf509ed1 100644 --- a/tuplex/core/src/physical/StageBuilder.cc +++ b/tuplex/core/src/physical/StageBuilder.cc @@ -457,7 +457,8 @@ namespace tuplex { break; } case LogicalOperatorType::TAKE: { - opt_ops.push_back(new TakeOperator(lastParent, dynamic_cast(node)->limit())); + auto takeOp = dynamic_cast(node); + opt_ops.push_back(new TakeOperator(lastParent, takeOp->limit(), takeOp->limitBottom())); opt_ops.back()->setID(node->getID()); break; } diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index 9cd15694a..b61f9cbe2 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -139,46 +139,21 @@ namespace tuplex { } // check output limit, adjust partitions if necessary - // TODO: add reverse outputLimit condition here - if (true) { - size_t numOutputRows = 0; - for (auto partition : partitions) { - numOutputRows += partition->getNumRows(); - if (numOutputRows >= outputLimit()) { - // clip last partition & leave loop - auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); - assert(clipped <= partition->getNumRows()); - partition->setNumRows(clipped); - if (clipped > 0) - limitedPartitions.push_back(partition); - break; - } else { - // put full partition to output set - limitedPartitions.push_back(partition); - } - } - } else { - size_t numOutputRows = 0; - for (auto partitionIt = partitions.rbeing(); - partitionIt != partitions.rend(); partitionIt++) { - numOutputRows += partition->getNumRows(); - if (numOutputRows >= outputLimit()) { - // clip last partition & leave loop - auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); - assert(clipped <= partition->getNumRows()); - - // TODO: do backward clip here instead - partition->setNumRows(clipped); - if (clipped > 0) - limitedPartitions.push_back(partition); - break; - } else { - // put full partition to output set + size_t numOutputRows = 0; + for (auto partition : partitions) { + numOutputRows += partition->getNumRows(); + if (numOutputRows >= outputLimit()) { + // clip last partition & leave loop + auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); + assert(clipped <= partition->getNumRows()); + partition->setNumRows(clipped); + if (clipped > 0) limitedPartitions.push_back(partition); - } + break; + } else { + // put full partition to output set + limitedPartitions.push_back(partition); } - - std::reverse(limitedPartitions.begin(), limitedPartitions.end()); } } diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 58827ea33..23b09314d 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -77,8 +77,7 @@ namespace tuplex { PythonDataSet resolve(const int64_t exceptionCode, const std::string& lambda_code, const std::string& pickled_code, const py::object& closure=py::object()); py::object collect(); - py::object take(const int64_t numRows); - boost::python::object takeLast(const int64_t numRows); + py::object take(const int64_t numTop, const int64_t numBottom); void show(const int64_t numRows=-1); // DataFrame like operations diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index 4d0b1f4e9..6b3683853 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -43,7 +43,6 @@ PYMODULE { .def("show", 
&tuplex::PythonDataSet::show) .def("collect", &tuplex::PythonDataSet::collect) .def("take", &tuplex::PythonDataSet::take) - .def("takeLast", &tuplex::PythonDataSet::takeLast) .def("map", &tuplex::PythonDataSet::map) .def("resolve", &tuplex::PythonDataSet::resolve) .def("ignore", &tuplex::PythonDataSet::ignore) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 2e54deec5..853b910db 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -107,7 +107,7 @@ namespace tuplex { } } - py::object PythonDataSet::take(const int64_t numRows) { + py::object PythonDataSet::take(const int64_t numTop, const int64_t numBottom) { // make sure a dataset is wrapped assert(this->_dataset); @@ -162,7 +162,7 @@ namespace tuplex { // new version, directly interact with the interpreter Timer timer; // build python list object from resultset - auto listObj = resultSetToCPython(rs.get(), numRows); + auto listObj = resultSetToCPython(rs.get(), numTop); Logger::instance().logger("python").info("Data transfer back to python took " + std::to_string(timer.time()) + " seconds"); // Logger::instance().flushAll(); @@ -176,75 +176,6 @@ namespace tuplex { } } - boost::python::object PythonDataSet::takeLast(const int64_t numRows) { - // make sure a dataset is wrapped - assert(this->_dataset); - - // is callee error dataset? if so return list with error string - if (this->_dataset->isError()) { - ErrorDataSet *eds = static_cast(this->_dataset); - boost::python::list L; - L.append(eds->getError()); - // Logger::instance().flushAll(); - Logger::instance().flushToPython(); - return L; - } else { - std::stringstream ss; - - // release GIL & hand over everything to Tuplex - assert(PyGILState_Check()); // make sure this thread holds the GIL! - python::unlockGIL(); - - std::shared_ptr rs; - std::string err_message = ""; - try { - rs = _dataset->takeLast(numRows, ss); - if(!rs) - throw std::runtime_error("invalid result set"); - // if there are more than 1 million (100k in debug mode) elements print message... - if (rs->rowCount() > LARGE_RESULT_SIZE) - Logger::instance().logger("python").info("transferring " - + std::to_string(rs->rowCount()) + - " elements back to Python. This might take a while..."); - } catch(const std::exception& e) { - err_message = e.what(); - Logger::instance().defaultLogger().error(err_message); - } catch(...) { - err_message = "unknown C++ exception occurred, please change type."; - Logger::instance().defaultLogger().error(err_message); - } - - // reqacquire GIL - python::lockGIL(); - - // error? 
then return list of error string - if(!rs || !err_message.empty()) { - // Logger::instance().flushAll(); - Logger::instance().flushToPython(); - auto listObj = PyList_New(1); - PyList_SetItem(listObj, 0, python::PyString_FromString(err_message.c_str())); - auto list = boost::python::object(boost::python::borrowed<>(listObj)); - return list; - } - - // collect results & transfer them back to python - // new version, directly interact with the interpreter - Timer timer; - // build python list object from resultset - auto listObj = resultSetToCPython(rs.get(), numRows); - Logger::instance().logger("python").info("Data transfer back to python took " + std::to_string(timer.time()) + " seconds"); - // Logger::instance().flushAll(); - Logger::instance().flushToPython(); - - // print errors - if (ss.str().length() > 0) - PySys_FormatStdout("%s", ss.str().c_str()); - - return boost::python::object(boost::python::handle<>(listObj)); - } - } - PythonDataSet PythonDataSet::map(const std::string &lambda_code, const std::string &pickled_code, const py::object& closure) { auto& logger = Logger::instance().logger("python"); diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 976a751f4..1046505f2 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -191,7 +191,7 @@ def collect(self): assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' return self._dataSet.collect() - def take(self, nrows=5): + def take(self, nrows=5, nbottom=0): """ action that generates a physical plan, processes data and collects the top results then as list of tuples. Args: @@ -203,27 +203,12 @@ def take(self, nrows=5): assert isinstance(nrows, int), 'num rows must be an integer' assert nrows > 0, 'please specify a number greater than zero' + assert isinstance(nbottom, int), 'num bottom rows must be an integer' + assert nbottom >= 0, 'please specify a number greater than or equal to zero' assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - return self._dataSet.take(nrows) - - def takeLast(self, nrows=5): - """ action that generates a physical plan, processes data and collects the last results then as list of tuples. - - Args: - nrows (int): number of rows to collect. Per default ``5``.
- Returns: - (list): A list of tuples - - """ - - assert isinstance(nrows, int), 'num rows must be an integer' - assert nrows > 0, 'please specify a number greater than zero' - - assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - - return self._dataSet.takeLast(nrows) + return self._dataSet.take(nrows, nbottom) def show(self, nrows=None): """ action that generates a physical plan, processes data and prints results as nicely formatted From 7fa6b175bc337dd56d0e0d2a39f41adcc5788065 Mon Sep 17 00:00:00 2001 From: korlamarch Date: Wed, 16 Feb 2022 12:17:36 -0500 Subject: [PATCH 30/56] Add unit tests --- tuplex/test/core/TakeTest.cc | 125 +++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 tuplex/test/core/TakeTest.cc diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc new file mode 100644 index 000000000..08b648f34 --- /dev/null +++ b/tuplex/test/core/TakeTest.cc @@ -0,0 +1,125 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by Leonhard Spiegelberg first on 1/1/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#include +#include "TestUtils.h" + +class TakeTest : public PyTest {}; + +TEST_F(TakeTest, takeTopTest) { + using namespace tuplex; + auto opt = testOptions(); + Context context(opt); + + auto rs = context.parallelize( + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 0); + + ASSERT_EQ(rs->rowCount(), 1); + auto v = rs->getRows(1); + + EXPECT_EQ(v[0].getInt(0), 1); + + auto rs2 = context.parallelize( + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(3, 0); + + ASSERT_EQ(rs2->rowCount(), 3); + auto v2 = rs2->getRows(3); + + EXPECT_EQ(v2[0].getInt(0), 1); + EXPECT_EQ(v2[1].getInt(0), 2); + EXPECT_EQ(v2[2].getInt(0), 3); + + auto rs3 = context.parallelize( + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(5, 0); + + ASSERT_EQ(rs3->rowCount(), 5); + auto v3 = rs3->getRows(5); + + EXPECT_EQ(v3[0].getString(0), "hello"); + EXPECT_EQ(v3[1].getString(0), "world"); + EXPECT_EQ(v3[2].getString(0), "! :)"); + EXPECT_EQ(v3[3].getString(0), "world"); + EXPECT_EQ(v3[4].getString(0), "hello"); + +} + +TEST_F(TakeTest, takeBottomTest) { + using namespace tuplex; + auto opt = testOptions(); + Context context(opt); + + auto rs = context.parallelize( + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 1); + + ASSERT_EQ(rs->rowCount(), 1); + auto v = rs->getRows(1); + + EXPECT_EQ(v[0].getInt(0), 6); + + auto rs2 = context.parallelize( + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 3); + + ASSERT_EQ(rs2->rowCount(), 3); + auto v2 = rs2->getRows(3); + + EXPECT_EQ(v2[0].getInt(0), 4); + EXPECT_EQ(v2[1].getInt(0), 5); + EXPECT_EQ(v2[2].getInt(0), 6); + + auto rs3 = context.parallelize( + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(0, 5); + + ASSERT_EQ(rs3->rowCount(), 5); + auto v3 = rs3->getRows(5); + + EXPECT_EQ(v3[0].getString(0), "world"); + EXPECT_EQ(v3[1].getString(0), "hello"); + EXPECT_EQ(v3[2].getString(0), "!"); + EXPECT_EQ(v3[3].getString(0), "! 
:)"); + EXPECT_EQ(v3[4].getString(0), "!"); + +} + +TEST_F(TakeTest, takeBothTest) { + using namespace tuplex; + auto opt = testOptions(); + Context context(opt); + + auto rs = context.parallelize( + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 1); + + ASSERT_EQ(rs->rowCount(), 2); + auto v = rs->getRows(2); + + EXPECT_EQ(v[0].getInt(0), 1); + EXPECT_EQ(v[1].getInt(0), 6); + + auto rs2 = context.parallelize( + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(2, 1); + + ASSERT_EQ(rs2->rowCount(), 3); + auto v2 = rs2->getRows(3); + + EXPECT_EQ(v2[0].getInt(0), 1); + EXPECT_EQ(v2[1].getInt(0), 2); + EXPECT_EQ(v2[2].getInt(0), 6); + + auto rs3 = context.parallelize( + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(2, 3); + + ASSERT_EQ(rs3->rowCount(), 5); + auto v3 = rs3->getRows(5); + + EXPECT_EQ(v3[0].getString(0), "hello"); + EXPECT_EQ(v3[1].getString(0), "world"); + EXPECT_EQ(v3[2].getString(0), "!"); + EXPECT_EQ(v3[3].getString(0), "! :)"); + EXPECT_EQ(v3[4].getString(0), "!"); +} \ No newline at end of file From c78a63784fdab0fa7e311b5969c0017d0b981ebf Mon Sep 17 00:00:00 2001 From: korlamarch Date: Wed, 16 Feb 2022 13:08:25 -0500 Subject: [PATCH 31/56] add bottom limit to transform stage (wip) --- tuplex/core/include/logical/TakeOperator.h | 2 +- tuplex/core/src/physical/PhysicalPlan.cc | 2 ++ tuplex/core/src/physical/TransformStage.cc | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h index 20c035a74..b5dd5db6e 100644 --- a/tuplex/core/include/logical/TakeOperator.h +++ b/tuplex/core/include/logical/TakeOperator.h @@ -40,7 +40,7 @@ namespace tuplex { int64_t limit() { return _limitTop; } - bool limitBottom() { return _limitBottom; } + int64_t limitBottom() { return _limitBottom; } std::vector getSample(const size_t num) const override; diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index 17a4c7c0e..3985fe1ab 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -383,6 +383,8 @@ namespace tuplex { if(outputNode->type() == LogicalOperatorType::TAKE) { auto top = static_cast(outputNode); builder.setOutputLimit(top->limit()); + // TODO: work here + ... } // @TODO: add slowPip builder to this process... diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index b61f9cbe2..6eb3f2e1f 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -142,6 +142,8 @@ namespace tuplex { size_t numOutputRows = 0; for (auto partition : partitions) { numOutputRows += partition->getNumRows(); + // TODO(march): work here + ... 
if (numOutputRows >= outputLimit()) { // clip last partition & leave loop auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); From 5628d279cee0a9923a15befcea8d2ebe46169397 Mon Sep 17 00:00:00 2001 From: korlamarch Date: Thu, 24 Feb 2022 23:29:11 -0500 Subject: [PATCH 32/56] more physical stage update (wip) Quick push --- tuplex/core/include/Executor.h | 21 +++- tuplex/core/include/Partition.h | 23 +++-- tuplex/core/include/physical/ResultSet.h | 2 + tuplex/core/include/physical/StageBuilder.h | 10 +- tuplex/core/include/physical/TransformStage.h | 10 +- tuplex/core/include/physical/TransformTask.h | 10 +- tuplex/core/src/Executor.cc | 97 +++++++++++-------- tuplex/core/src/ee/local/LocalBackend.cc | 39 ++++---- tuplex/core/src/physical/PhysicalPlan.cc | 4 +- tuplex/core/src/physical/ResultSet.cc | 3 +- tuplex/core/src/physical/StageBuilder.cc | 5 +- tuplex/core/src/physical/TransformStage.cc | 86 +++++++++++++--- tuplex/core/src/physical/TransformTask.cc | 19 ++-- tuplex/utils/include/mt/ITask.h | 26 ++--- 14 files changed, 224 insertions(+), 131 deletions(-) diff --git a/tuplex/core/include/Executor.h b/tuplex/core/include/Executor.h index 0bca412be..3631f7e7d 100644 --- a/tuplex/core/include/Executor.h +++ b/tuplex/core/include/Executor.h @@ -44,12 +44,19 @@ namespace tuplex { */ class WorkQueue { private: - std::atomic_bool _done; // protects against data races + std::atomic_bool _done{}; // protects against data races ExecutorTaskQueueType _queue; std::mutex _completedTasksMutex; std::vector _completedTasks; - std::atomic_int _numPendingTasks; - std::atomic_int _numCompletedTasks; + std::atomic_int _numPendingTasks{}; + std::atomic_int _numCompletedTasks{}; + + // mapping from order number -> row count if the task is finished + std::mutex _rowsDoneMutex; + std::map _rowsDone; + + std::atomic_int _frontRowsLimit{}; + std::atomic_int _bottomRowsLimit{}; public: WorkQueue(); @@ -74,6 +81,14 @@ namespace tuplex { size_t numCompletedTasks() const { return _numCompletedTasks; } + size_t frontRowsLimit() const { + return _frontRowsLimit; + }; + + size_t bottomRowsLimit() const { + return _bottomRowsLimit; + }; + /*! * stop working on this queue & dump all tasks */ diff --git a/tuplex/core/include/Partition.h b/tuplex/core/include/Partition.h index 5a66023fd..24b79cc8f 100644 --- a/tuplex/core/include/Partition.h +++ b/tuplex/core/include/Partition.h @@ -69,6 +69,7 @@ namespace tuplex { void loadFromFile(const URI& uri); int64_t _numRows; + int64_t _numSkip; // number of rows to skip, currently only used at the output (Result set) uint64_t _bytesWritten; Schema _schema; //! Schema of the partition. May be optimized away later. @@ -157,7 +158,7 @@ namespace tuplex { * return how much capacity is left, i.e. 
how many bytes can be actually written * @return */ - size_t capacity() { return _size - sizeof(int64_t); } + size_t capacity() const { return _size - sizeof(int64_t); } uniqueid_t uuid() const { return _uuid; } @@ -248,21 +249,19 @@ namespace tuplex { _mutex.unlock(); } - void setNumLastRows(const size_t numRows) { - // TODO: set another value instead + size_t getNumSkip() { + size_t res = 0; _mutex.lock(); - - _numRows = numRows; - - // save to memptr - if(_arena) { - *((int64_t*)_arena) = numRows; - } - + res = num_skip; _mutex.unlock(); + return res; } - + void setNumSkip(const size_t numSkip) { + _mutex.lock(); + _numSkip = numSkip; + _mutex.unlock(); + } int64_t getDataSetID() const { return _dataSetID; } diff --git a/tuplex/core/include/physical/ResultSet.h b/tuplex/core/include/physical/ResultSet.h index e94b8f1ae..5e69fef3a 100644 --- a/tuplex/core/include/physical/ResultSet.h +++ b/tuplex/core/include/physical/ResultSet.h @@ -36,6 +36,8 @@ namespace tuplex { size_t _rowsRetrieved; size_t _totalRowCounter; // used for merging in rows! size_t _maxRows; + size_t _maxRowsTop; + size_t _maxRowsBottom; Schema _schema; void removeFirstPartition(); diff --git a/tuplex/core/include/physical/StageBuilder.h b/tuplex/core/include/physical/StageBuilder.h index 63b94bd57..e678ead3d 100644 --- a/tuplex/core/include/physical/StageBuilder.h +++ b/tuplex/core/include/physical/StageBuilder.h @@ -76,8 +76,9 @@ namespace tuplex { void addFileInput(FileInputOperator* csvop); void addFileOutput(FileOutputOperator* fop); - inline void setOutputLimit(size_t limit) { - _outputLimit = limit; + inline void setOutputLimit(size_t topLimit, size_t bottomLimit) { + _outputTopLimit = topLimit; + _outputBottomLimit = bottomLimit; } TransformStage* build(PhysicalPlan* plan, IBackend* backend); @@ -134,7 +135,8 @@ namespace tuplex { FileFormat _outputFileFormat; int64_t _outputNodeID; int64_t _inputNodeID; - size_t _outputLimit; + size_t _outputTopLimit; + size_t _outputBottomLimit; LogicalOperator* _inputNode; std::vector _columnsToRead; @@ -157,7 +159,7 @@ namespace tuplex { int64_t outputDataSetID() const; inline bool hasOutputLimit() const { - return _outputLimit < std::numeric_limits::max(); + return _outputTopLimit < std::numeric_limits::max() || _outputBottomLimit > 0; } inline char csvOutputDelimiter() const { diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index 22d7f5fb4..e63eaec31 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -111,14 +111,15 @@ namespace tuplex { * @param outputLimit */ void setOutputLimit(size_t outputLimit) { - _outputLimit = outputLimit; + _outputTopLimit = outputLimit; // @TODO: move this logic to physical plan! // pushdown limit //pushDownOutputLimit(); } - size_t outputLimit() const { return _outputLimit; } + size_t outputTopLimit() const { return _outputTopLimit; } + size_t outputBottomLimit() const { return _outputBottomLimit; } size_t inputLimit() const { return _inputLimit; } /*! @@ -442,7 +443,8 @@ namespace tuplex { std::vector _inputPartitions; //! memory input partitions for this task. size_t _inputLimit; //! limit number of input rows (inf per default) - size_t _outputLimit; //! output limit, set e.g. by take, to_csv etc. (inf per default) + size_t _outputTopLimit; //! output limit, set e.g. by take, to_csv etc. (inf per default) + size_t _outputBottomLimit; //! output limit, set e.g. by take, to_csv etc. (0 per default) std::shared_ptr _rs; //! 
result set @@ -479,7 +481,7 @@ namespace tuplex { python::Type _hashOutputBucketType; bool hasOutputLimit() const { - return _outputLimit < std::numeric_limits::max(); + return _outputTopLimit < std::numeric_limits::max(); } }; } diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index 2868ba668..d065e86d3 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -182,7 +182,8 @@ namespace tuplex { void sinkOutputToHashTable(HashTableFormat fmt, int64_t outputDataSetID); HashTableSink hashTableSink() const { return _htable; } // needs to be freed manually! - void setOutputLimit(size_t limit) { _outLimit = limit; resetOutputLimitCounter(); } + void setOutputTopLimit(size_t limit) { _outTopLimit = limit; resetOutputLimitCounter(); } + void setOutputBottomLimit(size_t limit) { _outBottomLimit = limit; resetOutputLimitCounter(); } void setOutputSkip(size_t numRowsToSkip) { _outSkipRows = numRowsToSkip; } void execute() override; @@ -249,7 +250,9 @@ namespace tuplex { double wallTime() const override { return _wallTime; } size_t output_rows_written() const { return _numOutputRowsWritten; } - size_t output_limit() const { return _outLimit; } + size_t output_top_limit() const { return _outTopLimit; } + size_t output_bottom_limit() const { return _outBottomLimit; } + private: void resetSinks(); void resetSources(); @@ -276,7 +279,8 @@ namespace tuplex { Buffer _outPrefix; std::unordered_map _outOptions; - size_t _outLimit; // limits how many rows to write at max + size_t _outTopLimit; // limits how many rows to write at max + size_t _outBottomLimit; // limits how many last rows to write at max size_t _outSkipRows; // how many rows at start to skip // memory source variables diff --git a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc index 845b78e6a..1cc818010 100644 --- a/tuplex/core/src/Executor.cc +++ b/tuplex/core/src/Executor.cc @@ -32,8 +32,12 @@ namespace tuplex { std::vector WorkQueue::popCompletedTasks() { TRACE_LOCK("workQueue"); - std::lock_guard lock(_completedTasksMutex); + _taskDoneMutex.lock(); + _taskDone.clear(); + _taskDoneMutex.unlock(); + + std::lock_guard lock(_completedTasksMutex); // move leads to circular dependency in gcc and thus a bug on travis-ci. Therefore, just // use the below hack to fool the compiler into actually copying the vectors // // move to reset completed tasks and return array @@ -78,59 +82,66 @@ namespace tuplex { bool WorkQueue::workTask(Executor& executor, bool nonBlocking) { IExecutorTask *task = nullptr; - if(nonBlocking) { - // @Todo: This should be put into a function "work" on the workQueue... - // dequeue from general working queue - if(_queue.try_dequeue(task)) { - if(!task) - return false; - task->setOwner(&executor); - task->setThreadNumber(executor.threadNumber()); // redundant? + // dequeue from general working queue + // Note: is this TODO: outdated? + // @Todo: This should be put into a function "work" on the workQueue... 
+ if (nonBlocking) { + if(!_queue.try_dequeue(task)) { + return false; + } + } else { + _queue.wait_dequeue(task); + } - //executor.logger().info("started task..."); - // process task - task->execute(); - // save which thread executed this task - task->setID(std::this_thread::get_id()); + if(!task) { + return false; + } + // if reach the top limit already, then don't compute the rest + size_t numTopCompleted; + TRACE_LOCK("rowsDone"); + _rowsDoneMutex.lock(); + size_t frontRowsDone = 0; + for (size_t i = 0; _rowsDone.count(i) != 0; i++) { + frontRowsDone += _rowsDone[i]; + if (frontRowsDone >= _queue.frontRowsLimit()) { + // skip execution _numPendingTasks.fetch_add(-1, std::memory_order_release); - - // add task to done list - TRACE_LOCK("completedTasks"); - _completedTasksMutex.lock(); - _completedTasks.push_back(std::move(task)); - _completedTasksMutex.unlock(); - _numCompletedTasks.fetch_add(1, std::memory_order_release); - TRACE_UNLOCK("completedTasks"); + _rowsDoneMutex.unlock(); + TRACE_UNLOCK("rowsDone"); return true; } - } else { - _queue.wait_dequeue(task); + } + _rowsDoneMutex.unlock(); + TRACE_UNLOCK("rowsDone"); - if(!task) - return false; + task->setOwner(&executor); + task->setThreadNumber(executor.threadNumber()); // redundant? - task->setOwner(&executor); - task->setThreadNumber(executor.threadNumber()); // redundant? + // executor.logger().info("started task..."); + // process task + task->execute(); + // save which thread executed this task + task->setID(std::this_thread::get_id()); - // process task - task->execute(); - // save which thread executed this task - task->setID(std::this_thread::get_id()); + _numPendingTasks.fetch_add(-1, std::memory_order_release); - // add task to done list - TRACE_LOCK("completedTasks"); - _completedTasksMutex.lock(); - _completedTasks.push_back(std::move(task)); - _completedTasksMutex.unlock(); - _numCompletedTasks.fetch_add(1, std::memory_order_release); - TRACE_UNLOCK("completedTasks"); + // add task to done list + TRACE_LOCK("completedTasks"); + _completedTasksMutex.lock(); + _completedTasks.push_back(std::move(task)); + _completedTasksMutex.unlock(); + _numCompletedTasks.fetch_add(1, std::memory_order_release); + TRACE_UNLOCK("completedTasks"); - _numPendingTasks.fetch_add(-1, std::memory_order_release); - return true; - } - return false; + TRACE_LOCK("rowsDone"); + _rowsDoneMutex.lock(); + _rowsDone[task->getOrder()] += task->getNumOutputRows(); + _rowsDoneMutex.unlock(); + TRACE_UNLOCK("rowsDone"); + + return true; } void WorkQueue::workUntilAllTasksFinished(tuplex::Executor &executor, bool flushPeriodicallyToPython) { diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index bed96ec5a..5a1311436 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -486,6 +486,7 @@ namespace tuplex { // check what type of input the pipeline has (memory or files) if(tstage->fileInputMode()) { + // TODO(march): deal with file input // files // input is multiple files, use split file strategy here. // and issue tasks to executor workqueue! 
@@ -550,7 +551,7 @@ namespace tuplex { task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - task->setOutputLimit(tstage->outputLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); // add to tasks tasks.emplace_back(std::move(task)); } else { @@ -584,7 +585,7 @@ namespace tuplex { } task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - task->setOutputLimit(tstage->outputLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); // add to tasks tasks.emplace_back(std::move(task)); num_parts++; @@ -621,7 +622,7 @@ namespace tuplex { } task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - task->setOutputLimit(tstage->outputLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); // add to tasks tasks.emplace_back(std::move(task)); @@ -683,7 +684,11 @@ namespace tuplex { task->setInputExceptions(tstage->inputExceptions()); task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); - task->setOutputLimit(tstage->outputLimit()); + task->setOutputTopLimit(tstage->outputTopLimit()); + task->setOutputBottomLimit(tstage->outputBottomLimit()); + if (tstage->outputBottomLimit()) { + // TODO(march): work here (task output limit generation) + } tasks.emplace_back(std::move(task)); numInputRows += partition->getNumRows(); @@ -837,7 +842,6 @@ namespace tuplex { } void LocalBackend::executeTransformStage(tuplex::TransformStage *tstage) { - Timer stageTimer; Timer timer; // for detailed measurements. @@ -937,6 +941,7 @@ namespace tuplex { } } + // TODO(march): work here (transform stage) auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms); auto completedTasks = performTasks(tasks); @@ -1513,23 +1518,21 @@ namespace tuplex { WorkQueue& wq = LocalEngine::instance().getQueue(); wq.clear(); - // check if ord is set, if not issue warning & add - bool orderlessTaskFound = false; + // assign the order for all tasks for(int i = 0; i < tasks.size(); ++i) { - if(tasks[i]->getOrder().size() == 0) { - tasks[i]->setOrder(i); - orderlessTaskFound = true; - } + tasks[i]->setOrder(i); } -#ifndef NDEBUG - if(orderlessTaskFound) { - logger().debug("task without order found, please fix in code."); + // add all tasks to queue + // TODO(march): add task stage (to do striping) + for(size_t i = 0; i <= tasks.size() - i - 1; i++) { + const size_t revI = tasks.size()- i - 1 + wq.addTask(&tasks[i]); + if (revI > i) { + wq.addTask(&tasks[revI]); + } } -#endif - // add all tasks to queue - for(auto& task : tasks) wq.addTask(task); // clear tasks.clear(); @@ -1955,7 +1958,7 @@ namespace tuplex { // now simply go over the partitions and write the full buffers out // check all the params from TrafoStage - size_t limit = tstage->outputLimit(); + size_t limit = tstage->outputTopLimit(); size_t splitSize = tstage->splitSize(); size_t numOutputFiles = tstage->numOutputFiles(); URI uri = tstage->outputURI(); diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index 3985fe1ab..9c22837ad 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -382,9 +382,7 @@ namespace tuplex { // set limit if output node has a limit (currently only TakeOperator) if(outputNode->type() == LogicalOperatorType::TAKE) { auto top = static_cast(outputNode); - builder.setOutputLimit(top->limit()); - // TODO: work here - ... + builder.setOutputLimit(top->limit(), top->limitBottom()); } // @TODO: add slowPip builder to this process... 
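// ---------------------------------------------------------------------------
// Aside -- an illustrative, self-contained sketch (not part of the patch) of
// the semantics behind the two-sided limit wired into setOutputLimit() above:
// the first topLimit rows plus the last bottomLimit rows, as exercised by the
// TakeTest.cc cases earlier in the series. expectedTake() is an invented
// reference helper; the collect()/take-everything special case is ignored here:
#include <cassert>
#include <cstddef>
#include <vector>

template <typename T>
std::vector<T> expectedTake(const std::vector<T>& rows, size_t topLimit, size_t bottomLimit) {
    if (topLimit + bottomLimit >= rows.size())
        return rows;   // head and tail ranges overlap: keep every row
    std::vector<T> out(rows.begin(), rows.begin() + topLimit);     // first topLimit rows
    out.insert(out.end(), rows.end() - bottomLimit, rows.end());   // last bottomLimit rows
    return out;
}

int main() {
    std::vector<int> v{1, 2, 3, 4, 5, 6};
    assert((expectedTake(v, 1, 1) == std::vector<int>{1, 6}));     // cf. takeBothTest
    assert((expectedTake(v, 2, 1) == std::vector<int>{1, 2, 6}));
    assert((expectedTake(v, 0, 3) == std::vector<int>{4, 5, 6}));  // cf. takeBottomTest
    return 0;
}
// ---------------------------------------------------------------------------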
diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index 0f7bf7319..5e15867f7 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -98,7 +98,7 @@ namespace tuplex { Partition *first = _partitions.front(); assert(_schema == first->schema()); - auto numRows = first->getNumRows(); + auto numRows = first->getNumRows() - first->getNumSkip(); _rowsRetrieved += numRows; _partitions.pop_front(); @@ -183,6 +183,7 @@ namespace tuplex { } Row ResultSet::getNextRow() { + // TODO(march): logic in skip row count here // merge rows from objects if(!_pyobjects.empty()) { auto row_number = std::get<0>(_pyobjects.front()); diff --git a/tuplex/core/src/physical/StageBuilder.cc b/tuplex/core/src/physical/StageBuilder.cc index 0bf509ed1..bc814182b 100644 --- a/tuplex/core/src/physical/StageBuilder.cc +++ b/tuplex/core/src/physical/StageBuilder.cc @@ -50,7 +50,7 @@ namespace tuplex { : _stageNumber(stage_number), _isRootStage(rootStage), _allowUndefinedBehavior(allowUndefinedBehavior), _generateParser(generateParser), _normalCaseThreshold(normalCaseThreshold), _sharedObjectPropagation(sharedObjectPropagation), _nullValueOptimization(nullValueOptimization), _updateInputExceptions(updateInputExceptions), - _inputNode(nullptr), _outputLimit(std::numeric_limits::max()) { + _inputNode(nullptr), _outputTopLimit(std::numeric_limits::max()), _outputBottomLimit(0) { } void StageBuilder::generatePythonCode() { @@ -1426,7 +1426,8 @@ namespace tuplex { // no limit operator yet... // get limit - stage->_outputLimit = _outputLimit; + stage->_outputTopLimit = _outputTopLimit; + stage->_outputBottomLimit = _outputBottomLimit; // copy input/output configurations stage->_fileInputParameters = _fileInputParameters; diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index 6eb3f2e1f..af58866dc 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -48,7 +48,8 @@ namespace tuplex { int64_t number, bool allowUndefinedBehavior) : PhysicalStage::PhysicalStage(plan, backend, number), _inputLimit(std::numeric_limits::max()), - _outputLimit(std::numeric_limits::max()), + _outputTopLimit(std::numeric_limits::max()), + _outputBottomLimit(0), _aggMode(AggregateType::AGG_NONE) { // TODO: is this code out of date? + is allowUndefinedBehavior needed here? @@ -129,7 +130,7 @@ namespace tuplex { if (partitions.empty() && interpreterRows.empty() && generalCase.empty()) _rs = emptyResultSet(); else { - std::vector limitedPartitions; + std::vector limitedPartitions, limitedTailPartitions; auto schema = Schema::UNKNOWN; if(!partitions.empty()) { @@ -138,31 +139,92 @@ namespace tuplex { assert(schema == partition->schema()); } - // check output limit, adjust partitions if necessary - size_t numOutputRows = 0; + // check top output limit, adjust partitions if necessary + size_t numTopOutputRows = 0; + Partition* lastTopPart = nullptr; + size_t clippedTop = 0; for (auto partition : partitions) { - numOutputRows += partition->getNumRows(); - // TODO(march): work here - ... 
- if (numOutputRows >= outputLimit()) { + numTopOutputRows += partition->getNumRows(); + lastTopPart = partition; + if (numTopOutputRows >= outputTopLimit()) { // clip last partition & leave loop - auto clipped = outputLimit() - (numOutputRows - partition->getNumRows()); + clippedTop = outputTopLimit() - (numTopOutputRows - partition->getNumRows()); + assert(clippedTop <= partition->getNumRows()); + break; + } else if (partition == *partitions.end()) { + // last partition, mark full row, but don't put to output set yet to avoid double put + clippedTop = partition->getNumRows(); + break; + } else { + // put full partition to output set + limitedPartitions.push_back(partition); + } + } + + // check the bottom output limit, adjust partitions if necessary + size_t numBottomOutputRows = 0; + size_t clippedBottom = 0; + for (auto it = partitions.rbegin(); it != partitions.rend(); it++) { + auto partition = *it; + numBottomOutputRows += partition->getNumRows(); + + if (partition == lastTopPart) { + // the bottom and the top partitions are overlapping + clippedBottom = outputBottomLimit() - (numBottomOutputRows - partition->getNumRows()); + if (clippedTop + clippedBottom >= partition->getNumRows()) { + // if top and bottom range intersect, use full partitions + clippedTop = partition->getNumRows(); + clippedBottom = 0; + } + break; + } else if (numBottomOutputRows >= outputBottomLimit()) { + // clip last partition & leave loop + auto clipped = outputBottomLimit() - (numTopOutputRows - partition->getNumRows()); assert(clipped <= partition->getNumRows()); + partition->setNumSkip(partition->getNumRows() - clippedBottom); partition->setNumRows(clipped); if (clipped > 0) - limitedPartitions.push_back(partition); + limitedTailPartitions.push_back(partition); break; } else { // put full partition to output set - limitedPartitions.push_back(partition); + limitedTailPartitions.push_back(partition); + } + } + + // push the middle partition + if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) { + assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); + + // TODO(march): to work on this (split into two partitions) + // split into two partitions with both top and bottom are in the same partition + Partition* lastBottomPart = nullptr; + if (clippedBottom != 0) { + lastBottomPart = new Partition(lastTopPart); + lastBottomPart->setNumSkip(lastBottomPart->getNumRows() - clippedBottom); + lastBottomPart->setNumRows(clippedBottom); + } + + lastTopPart->setNumRows(clippedTop); + + limitedPartitions.push_back(lastTopPart); + + if (lastBottomPart != nullptr) { + limitedPartitions.push_back(lastBottomPart); } } + + // merge the head and tail partitions + std::reverse(limitedTailPartitions.begin(), limitedTailPartitions.end()); + limitedPartitions.insert(limitedPartitions.end(), limitedTailPartitions.begin(), limitedTailPartitions.end()); } // put ALL partitions to result set + // TODO(march): handle overlapping case + // TODO(march): maybe do top/bottom limit at the level instead? _rs = std::make_shared(schema, limitedPartitions, generalCase, partitionToExceptionsMap, interpreterRows, - outputLimit()); + outputTopLimit() + outputBottomLimit()); } } diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index c560c4af4..a65aa7f11 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -19,10 +19,12 @@ namespace tuplex { // atomic var to count output rows! 
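// ---------------------------------------------------------------------------
// Aside -- an illustrative sketch (not part of the patch) of the limit
// protocol the write callbacks below rely on: the generated pipeline invokes
// a callback per produced row and stops consuming input once the callback
// returns a nonzero exception code, so a shared atomic counter can turn
// "limit reached" into an early exit. All names here are invented:
#include <atomic>
#include <cstdint>

static std::atomic<int64_t> g_rowsWritten{0};
static const int64_t EC_LIMIT_REACHED = 1;   // stand-in for OUTPUT_LIMIT_REACHED

// called once per output row by the (hypothetical) generated pipeline
int64_t limitedWriteCallback(int64_t limit) {
    if (g_rowsWritten >= limit)
        return EC_LIMIT_REACHED;   // signals the pipeline to stop
    ++g_rowsWritten;
    return 0;                      // success, keep producing rows
}
// ---------------------------------------------------------------------------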
- static std::atomic_int64_t g_totalOutputRows; + static std::atomic_int64_t g_totalTopOutputRows; + static std::atomic_int64_t g_totalBottomOutputRows; void TransformTask::resetOutputLimitCounter() { - g_totalOutputRows = 0; + g_totalTopOutputRows = 0; + g_totalBottomOutputRows = 0; } } @@ -41,7 +43,8 @@ extern "C" { static int64_t limited_w2mCallback(tuplex::TransformTask* task, uint8_t* buf, int64_t bufSize) { // i.e. check here how many output rows, if already limit reached - jump to goto! - if(tuplex::g_totalOutputRows >= task->output_limit()) { + // TODO(march): comment this out + if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); } @@ -49,10 +52,10 @@ extern "C" { assert(dynamic_cast(task)); auto rc = task->writeRowToMemory(buf, bufSize); if(0 == rc) - tuplex::g_totalOutputRows++; + tuplex::g_totalTopOutputRows++; // i.e. check here how many output rows, if already limit reached - jump to goto! - if(tuplex::g_totalOutputRows >= task->output_limit()) { + if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); } return rc; @@ -513,7 +516,8 @@ namespace tuplex { _outputFilePath = URI::INVALID; _outFile.reset(nullptr); _outPrefix.reset(); - _outLimit = std::numeric_limits::max(); // write all rows + _outTopLimit = std::numeric_limits::max(); // write all rows + _outBottomLimit = 0; _outSkipRows = 0; // skip no rows // reset memory sink @@ -619,6 +623,7 @@ namespace tuplex { auto functor = reinterpret_cast(_functor); + // TODO(march): question here? // go over all input partitions. for(const auto &inputPartition : _inputPartitions) { // lock ptr, extract number of rows ==> store them @@ -678,7 +683,7 @@ namespace tuplex { // skip rows? limit rows?? - if(_numOutputRowsWritten >= _outSkipRows && _numOutputRowsWritten < (_outLimit - _outSkipRows)) { + if(_numOutputRowsWritten >= _outSkipRows && _numOutputRowsWritten < (_outTopLimit - _outSkipRows)) { if(_outFile->write(buf, bufSize) != VirtualFileSystemStatus::VFS_OK) return ecToI32(ExceptionCode::IOERROR); } diff --git a/tuplex/utils/include/mt/ITask.h b/tuplex/utils/include/mt/ITask.h index 8434896a7..01f7137f1 100644 --- a/tuplex/utils/include/mt/ITask.h +++ b/tuplex/utils/include/mt/ITask.h @@ -29,7 +29,7 @@ namespace tuplex { std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread. //! Per default object is constructed that does not represent a thread - std::vector _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages + size_t _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages public: ITask() {}; @@ -51,33 +51,21 @@ namespace tuplex { _id = id; } - void setOrder(size_t order) { _orderNumbers = std::vector{order}; } - -// size_t getOrder(const size_t nth = 0) const { -// return _orderNumbers[nth]; -// } - std::vector getOrder() const { return _orderNumbers; } - - void setOrder(const std::vector& order) { + void setOrder(size_t order) { _orderNumbers = order; } + size_t getOrder() const { + return _orderNumbers; + } + /*! * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones * @param other * @return */ bool compareAscOrder(const ITask& other) const { - // make sure they have the same length - assert(_orderNumbers.size() == other._orderNumbers.size()); - - // this < other? 
- // compare one by one - for(int i = 0; i < other._orderNumbers.size(); ++i) { - if(_orderNumbers[i] >= other._orderNumbers[i]) - return false; - } - return true; + return _orderNumbers[i] < other._orderNumbers[i]; } }; } From a506d88be56f5f69e1ed2833907e0867c2dda734 Mon Sep 17 00:00:00 2001 From: korlamarch Date: Wed, 9 Mar 2022 13:00:17 -0500 Subject: [PATCH 33/56] Rework LocalBackend and TransformTask to support top and bottom limit --- tuplex/core/include/Executor.h | 6 - tuplex/core/include/Partition.h | 14 -- tuplex/core/include/ee/local/LocalBackend.h | 3 + tuplex/core/include/physical/TransformStage.h | 12 +- tuplex/core/src/Executor.cc | 24 --- tuplex/core/src/ee/local/LocalBackend.cc | 182 ++++++++++++++++-- tuplex/core/src/physical/PhysicalPlan.cc | 2 +- tuplex/core/src/physical/ResultSet.cc | 1 - tuplex/core/src/physical/TransformStage.cc | 86 +-------- tuplex/core/src/physical/TransformTask.cc | 54 ++++-- tuplex/utils/include/mt/ITask.h | 85 ++++---- 11 files changed, 270 insertions(+), 199 deletions(-) diff --git a/tuplex/core/include/Executor.h b/tuplex/core/include/Executor.h index 3631f7e7d..7eaaee244 100644 --- a/tuplex/core/include/Executor.h +++ b/tuplex/core/include/Executor.h @@ -51,12 +51,6 @@ namespace tuplex { std::atomic_int _numPendingTasks{}; std::atomic_int _numCompletedTasks{}; - // mapping from order number -> row count if the task is finished - std::mutex _rowsDoneMutex; - std::map _rowsDone; - - std::atomic_int _frontRowsLimit{}; - std::atomic_int _bottomRowsLimit{}; public: WorkQueue(); diff --git a/tuplex/core/include/Partition.h b/tuplex/core/include/Partition.h index 24b79cc8f..8bf112051 100644 --- a/tuplex/core/include/Partition.h +++ b/tuplex/core/include/Partition.h @@ -69,7 +69,6 @@ namespace tuplex { void loadFromFile(const URI& uri); int64_t _numRows; - int64_t _numSkip; // number of rows to skip, currently only used at the output (Result set) uint64_t _bytesWritten; Schema _schema; //! Schema of the partition. May be optimized away later. @@ -249,19 +248,6 @@ namespace tuplex { _mutex.unlock(); } - size_t getNumSkip() { - size_t res = 0; - _mutex.lock(); - res = num_skip; - _mutex.unlock(); - return res; - } - - void setNumSkip(const size_t numSkip) { - _mutex.lock(); - _numSkip = numSkip; - _mutex.unlock(); - } int64_t getDataSetID() const { return _dataSetID; } diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index 77d375aed..0dbfafdc9 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -88,6 +88,9 @@ namespace tuplex { MessageHandler& logger() const { return Logger::instance().logger("local ee"); } + void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, TransformStage* tstage); + Partition* newPartitionWithSkipRows(Partition* p_in, int numToSkip, TransformStage* tstage); + // write output (may be already in correct format!) void writeOutput(TransformStage* tstage, std::vector& sortedTasks); diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index e63eaec31..f489f1f6c 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -107,11 +107,13 @@ namespace tuplex { std::unordered_map partitionToExceptionsMap() { return _partitionToExceptionsMap; } /*! 
- * sets maximum number of rows this pipeline will produce - * @param outputLimit + * sets maximum number of top rows this pipeline will produce + * @param topLimit + * @param bottomLimit */ - void setOutputLimit(size_t outputLimit) { - _outputTopLimit = outputLimit; + inline void setOutputLimit(size_t topLimit, size_t bottomLimit) { + _outputTopLimit = topLimit; + _outputBottomLimit = bottomLimit; // @TODO: move this logic to physical plan! // pushdown limit @@ -481,7 +483,7 @@ namespace tuplex { python::Type _hashOutputBucketType; bool hasOutputLimit() const { - return _outputTopLimit < std::numeric_limits::max(); + return _outputTopLimit < std::numeric_limits::max() && _outputBottomLimit != 0; } }; } diff --git a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc index 1cc818010..388199e4d 100644 --- a/tuplex/core/src/Executor.cc +++ b/tuplex/core/src/Executor.cc @@ -98,24 +98,6 @@ namespace tuplex { return false; } - // if reach the top limit already, then don't compute the rest - size_t numTopCompleted; - TRACE_LOCK("rowsDone"); - _rowsDoneMutex.lock(); - size_t frontRowsDone = 0; - for (size_t i = 0; _rowsDone.count(i) != 0; i++) { - frontRowsDone += _rowsDone[i]; - if (frontRowsDone >= _queue.frontRowsLimit()) { - // skip execution - _numPendingTasks.fetch_add(-1, std::memory_order_release); - _rowsDoneMutex.unlock(); - TRACE_UNLOCK("rowsDone"); - return true; - } - } - _rowsDoneMutex.unlock(); - TRACE_UNLOCK("rowsDone"); - task->setOwner(&executor); task->setThreadNumber(executor.threadNumber()); // redundant? @@ -135,12 +117,6 @@ namespace tuplex { _numCompletedTasks.fetch_add(1, std::memory_order_release); TRACE_UNLOCK("completedTasks"); - TRACE_LOCK("rowsDone"); - _rowsDoneMutex.lock(); - _rowsDone[task->getOrder()] += task->getNumOutputRows(); - _rowsDoneMutex.unlock(); - TRACE_UNLOCK("rowsDone"); - return true; } diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 5a1311436..dbceaa1b9 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -486,7 +486,6 @@ namespace tuplex { // check what type of input the pipeline has (memory or files) if(tstage->fileInputMode()) { - // TODO(march): deal with file input // files // input is multiple files, use split file strategy here. // and issue tasks to executor workqueue! 
@@ -686,9 +685,6 @@ namespace tuplex {
                     task->setStageID(tstage->getID());
                     task->setOutputTopLimit(tstage->outputTopLimit());
                     task->setOutputBottomLimit(tstage->outputBottomLimit());
-                    if (tstage->outputBottomLimit()) {
-                        // TODO(march): work here (task output limit generation)
-                    }
 
                     tasks.emplace_back(std::move(task));
                     numInputRows += partition->getNumRows();
@@ -698,6 +694,31 @@ namespace tuplex {
             }
         }
 
+        // assign the order for all tasks
+        for(size_t i = 0; i < tasks.size(); ++i) {
+            tasks[i]->setOrder(i);
+        }
+
+        if (tstage->hasOutputLimit()) {
+            if (tstage->outputTopLimit() > 0 && tstage->outputBottomLimit() > 0) {
+                // do task striping for output limit on both ends
+                std::vector<IExecutorTask*> newTasks;
+                for(size_t i = 0; i < tasks.size() - i; i++) {
+                    const size_t rev_i = tasks.size() - 1 - i;
+                    newTasks.push_back(tasks[i]);
+                    if (i < rev_i) {
+                        newTasks.push_back(tasks[rev_i]);
+                    }
+                }
+                assert(tasks.size() == newTasks.size());
+                tasks.swap(newTasks);
+            } else if (tstage->outputBottomLimit() > 0) {
+                // bottom limit only, just reverse the task order
+                std::reverse(tasks.begin(), tasks.end());
+            }
+            // if only the top limit is set, the existing order is already correct
+        }
+
         return tasks;
     }
 
@@ -941,8 +962,8 @@ namespace tuplex {
             }
         }
 
-        // TODO(march): work here (transform stage)
         auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms);
+
         auto completedTasks = performTasks(tasks);
 
         // Note: this doesn't work yet because of the globals.
@@ -1175,6 +1196,10 @@ namespace tuplex {
                     rowDelta += taskNonConformingRows.size();
                 }
 
+                if (tstage->hasOutputLimit()) {
+                    trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage);
+                }
+
                 tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts);
                 break;
             }
@@ -1518,21 +1543,29 @@ namespace tuplex {
         WorkQueue& wq = LocalEngine::instance().getQueue();
         wq.clear();
 
-        // assign the order for all tasks
+        // check if ord is set, if not issue warning & add
+        bool orderlessTaskFound = false;
         for(int i = 0; i < tasks.size(); ++i) {
-            tasks[i]->setOrder(i);
+            if(tasks[i]->getOrder().size() == 0) {
+                tasks[i]->setOrder(i);
+                orderlessTaskFound = true;
+            }
         }
 
-        // add all tasks to queue
-        // TODO(march): add task stage (to do striping)
-        for(size_t i = 0; i <= tasks.size() - i - 1; i++) {
-            const size_t revI = tasks.size()- i - 1
-            wq.addTask(&tasks[i]);
-            if (revI > i) {
-                wq.addTask(&tasks[revI]);
-            }
+#ifndef NDEBUG
+        if(orderlessTaskFound) {
+            logger().debug("task without order found, please fix in code.");
         }
+#endif
+
+        for (int i = 0; i < tasks.size(); i++) {
+            // the take limit only works with a uniform task order
+            assert(tasks[i]->getOrder(0) == i);
+        }
 
+        // add all tasks to queue
+        for(auto& task : tasks) wq.addTask(task);
+
         // clear
         tasks.clear();
 
@@ -2083,4 +2116,123 @@ namespace tuplex {
         Logger::instance().defaultLogger().info("writing output took " + std::to_string(timer.time()) + "s");
         tstage->setFileResult(ecounts);
     }
+
+    void LocalBackend::trimPartitionsToLimit(std::vector<Partition*> &partitions,
+                                             size_t topLimit,
+                                             size_t bottomLimit,
+                                             TransformStage* tstage) {
+        std::vector<Partition*> limitedPartitions, limitedTailPartitions;
+
+        // check top output limit, adjust partitions if necessary
+        size_t numTopOutputRows = 0;
+        Partition* lastTopPart = nullptr;
+        size_t clippedTop = 0;
+        for (auto partition : partitions) {
+            numTopOutputRows += partition->getNumRows();
+            lastTopPart = partition;
+            if (numTopOutputRows >= topLimit) {
+                // clip last partition & leave loop
+                clippedTop = topLimit - (numTopOutputRows - partition->getNumRows());
+                assert(clippedTop <= partition->getNumRows());
+                break;
+            } else if (partition == partitions.back()) {
+                // last partition, keep all rows, but don't push it yet to avoid a double push
+                clippedTop = partition->getNumRows();
+                break;
+            } else {
+                // put full partition to output set
+                limitedPartitions.push_back(partition);
+            }
+        }
+
+        // check the bottom output limit, adjust partitions if necessary
+        size_t numBottomOutputRows = 0;
+        size_t clippedBottom = 0;
+        for (auto it = partitions.rbegin(); it != partitions.rend(); it++) {
+            auto partition = *it;
+            numBottomOutputRows += partition->getNumRows();
+
+            if (partition == lastTopPart) {
+                // the bottom and the top partitions are overlapping
+                clippedBottom = bottomLimit - (numBottomOutputRows - partition->getNumRows());
+                if (clippedTop + clippedBottom >= partition->getNumRows()) {
+                    // if top and bottom range intersect, use full partitions
+                    clippedTop = partition->getNumRows();
+                    clippedBottom = 0;
+                }
+                break;
+            } else if (numBottomOutputRows >= bottomLimit) {
+                // clip last partition & leave loop
+                auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows());
+                assert(clipped <= partition->getNumRows());
+                Partition* newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage);
+                if (newPart) {
+                    partition->invalidate();
+                    partition = newPart;
+                }
+                assert(partition->getNumRows() == clipped);
+                if (clipped > 0)
+                    limitedTailPartitions.push_back(partition);
+                break;
+            } else {
+                // put full partition to output set
+                limitedTailPartitions.push_back(partition);
+            }
+        }
+
+        // push the middle partition
+        if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) {
+            assert(clippedTop + clippedBottom <= lastTopPart->getNumRows());
+
+            // split into two partitions when both the top and the bottom range fall into the same partition
+            Partition* lastBottomPart = nullptr;
+
+            if (clippedBottom != 0) {
+                lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, tstage);
+            }
+
+            lastTopPart->setNumRows(clippedTop);
+
+            limitedPartitions.push_back(lastTopPart);
+
+            if (lastBottomPart != nullptr) {
+                limitedPartitions.push_back(lastBottomPart);
+            }
+        }
+
+        // merge the head and tail partitions
+        partitions.clear();
+        partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end());
+        partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend());
+    }
+
+    Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, int numToSkip, TransformStage* tstage) {
+        if(!numToSkip)
+            return nullptr;
+
+        auto ptr = p_in->lockRaw();
+        auto num_rows = *((int64_t*) ptr);
+        assert(numToSkip < num_rows);
+
+        Partition *p_out = _driver->allocWritablePartition(num_rows - numToSkip + sizeof(int64_t),
+                                                           tstage->outputSchema(), tstage->outputDataSetID(),
+                                                           tstage->context().id());
+
+        ptr += sizeof(int64_t);
+        size_t numBytesToSkip = 0;
+
+        for(unsigned i = 0; i < numToSkip; ++i) {
+            Row r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip);
+            ptr += r.serializedLength();
+            numBytesToSkip += r.serializedLength();
+        }
+
+        auto ptr_out = p_out->lockRaw();
+        *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip;
+        ptr_out += sizeof(int64_t);
+        memcpy(ptr_out, ptr, p_in->size() - numBytesToSkip);
+        p_out->unlock();
+
+        p_in->unlock();
+
+        return p_out;
+    }
 } // namespace tuplex
\ No newline at end of file
diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc
index 9c22837ad..ff67e4add 100644
--- a/tuplex/core/src/physical/PhysicalPlan.cc
+++ b/tuplex/core/src/physical/PhysicalPlan.cc
@@ -240,7 +240,7 @@ namespace tuplex {
             // user wants to merge exceptions in order.
             bool updateInputExceptions = hasFilter && hasInputExceptions && _context.getOptions().OPT_MERGE_EXCEPTIONS_INORDER();
 
-            // create trafostage via builder pattern
+            // create transform stage via builder pattern
             auto builder = codegen::StageBuilder(_num_stages++,
                                                  isRootStage,
                                                  _context.getOptions().UNDEFINED_BEHAVIOR_FOR_OPERATORS(),
diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc
index 5e15867f7..e31e78cec 100644
--- a/tuplex/core/src/physical/ResultSet.cc
+++ b/tuplex/core/src/physical/ResultSet.cc
@@ -183,7 +183,6 @@ namespace tuplex {
     }
 
     Row ResultSet::getNextRow() {
-        // TODO(march): logic in skip row count here
         // merge rows from objects
         if(!_pyobjects.empty()) {
             auto row_number = std::get<0>(_pyobjects.front());
diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc
index af58866dc..6e216ac5c 100644
--- a/tuplex/core/src/physical/TransformStage.cc
+++ b/tuplex/core/src/physical/TransformStage.cc
@@ -130,99 +130,23 @@ namespace tuplex {
         if (partitions.empty() && interpreterRows.empty() && generalCase.empty())
             _rs = emptyResultSet();
         else {
-            std::vector<Partition*> limitedPartitions, limitedTailPartitions;
             auto schema = Schema::UNKNOWN;
 
             if(!partitions.empty()) {
+                size_t totalRowsCount = 0;
                 schema = partitions.front()->schema();
                 for (auto partition : partitions) {
                     assert(schema == partition->schema());
+                    totalRowsCount += partition->getNumRows();
                 }
 
-                // check top output limit, adjust partitions if necessary
-                size_t numTopOutputRows = 0;
-                Partition* lastTopPart = nullptr;
-                size_t clippedTop = 0;
-                for (auto partition : partitions) {
-                    numTopOutputRows += partition->getNumRows();
-                    lastTopPart = partition;
-                    if (numTopOutputRows >= outputTopLimit()) {
-                        // clip last partition & leave loop
-                        clippedTop = outputTopLimit() - (numTopOutputRows - partition->getNumRows());
-                        assert(clippedTop <= partition->getNumRows());
-                        break;
-                    } else if (partition == *partitions.end()) {
-                        // last partition, mark full row, but don't put to output set yet to avoid double put
-                        clippedTop = partition->getNumRows();
-                        break;
-                    } else {
-                        // put full partition to output set
-                        limitedPartitions.push_back(partition);
-                    }
-                }
-
-                // check the bottom output limit, adjust partitions if necessary
-                size_t numBottomOutputRows = 0;
-                size_t clippedBottom = 0;
-                for (auto it = partitions.rbegin(); it != partitions.rend(); it++) {
-                    auto partition = *it;
-                    numBottomOutputRows += partition->getNumRows();
-
-                    if (partition == lastTopPart) {
-                        // the bottom and the top partitions are overlapping
-                        clippedBottom = outputBottomLimit() - (numBottomOutputRows - partition->getNumRows());
-                        if (clippedTop + clippedBottom >= partition->getNumRows()) {
-                            // if top and bottom range intersect, use full partitions
-                            clippedTop = partition->getNumRows();
-                            clippedBottom = 0;
-                        }
-                        break;
-                    } else if (numBottomOutputRows >= outputBottomLimit()) {
-                        // clip last partition & leave loop
-                        auto clipped = outputBottomLimit() - (numTopOutputRows - partition->getNumRows());
-                        assert(clipped <= partition->getNumRows());
-                        partition->setNumSkip(partition->getNumRows() - clippedBottom);
-                        partition->setNumRows(clipped);
-                        if (clipped > 0)
-                            limitedTailPartitions.push_back(partition);
-                        break;
-                    } else {
-                        // put full 
partition to output set - limitedTailPartitions.push_back(partition); - } - } - - // push the middle partition - if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) { - assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); - - // TODO(march): to work on this (split into two partitions) - // split into two partitions with both top and bottom are in the same partition - Partition* lastBottomPart = nullptr; - if (clippedBottom != 0) { - lastBottomPart = new Partition(lastTopPart); - lastBottomPart->setNumSkip(lastBottomPart->getNumRows() - clippedBottom); - lastBottomPart->setNumRows(clippedBottom); - } - - lastTopPart->setNumRows(clippedTop); - - limitedPartitions.push_back(lastTopPart); - - if (lastBottomPart != nullptr) { - limitedPartitions.push_back(lastBottomPart); - } + if (hasOutputLimit()) { + assert(totalRowsCount == _outputTopLimit + _outputBottomLimit); } - - // merge the head and tail partitions - std::reverse(limitedTailPartitions.begin(), limitedTailPartitions.end()); - limitedPartitions.insert(limitedPartitions.end(), limitedTailPartitions.begin(), limitedTailPartitions.end()); } // put ALL partitions to result set - // TODO(march): handle overlapping case - // TODO(march): maybe do top/bottom limit at the level instead? - _rs = std::make_shared(schema, limitedPartitions, + _rs = std::make_shared(schema, partitions, generalCase, partitionToExceptionsMap, interpreterRows, outputTopLimit() + outputBottomLimit()); } diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index a65aa7f11..49d104bcc 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -22,9 +22,14 @@ namespace tuplex { static std::atomic_int64_t g_totalTopOutputRows; static std::atomic_int64_t g_totalBottomOutputRows; + // mapping from order number -> row count if the task is finished + static std::mutex g_rowsDoneMutex; + static std::map g_rowsDone; + void TransformTask::resetOutputLimitCounter() { g_totalTopOutputRows = 0; g_totalBottomOutputRows = 0; + g_rowsDone.clear(); } } @@ -42,23 +47,9 @@ extern "C" { } static int64_t limited_w2mCallback(tuplex::TransformTask* task, uint8_t* buf, int64_t bufSize) { - // i.e. check here how many output rows, if already limit reached - jump to goto! - // TODO(march): comment this out - if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { - return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); - } - assert(task); assert(dynamic_cast(task)); - auto rc = task->writeRowToMemory(buf, bufSize); - if(0 == rc) - tuplex::g_totalTopOutputRows++; - - // i.e. check here how many output rows, if already limit reached - jump to goto! - if(tuplex::g_totalTopOutputRows >= task->output_top_limit()) { - return tuplex::ecToI64(tuplex::ExceptionCode::OUTPUT_LIMIT_REACHED); - } - return rc; + return task->writeRowToMemory(buf, bufSize); } static int64_t limited_w2fCallback(tuplex::TransformTask* task, uint8_t* buf, int64_t bufSize) { @@ -623,9 +614,36 @@ namespace tuplex { auto functor = reinterpret_cast(_functor); - // TODO(march): question here? // go over all input partitions. 
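// ---------------------------------------------------------------------------
// Aside -- an illustrative sketch (not part of the patch) of the early-exit
// test performed against g_rowsDone inside the loop below: a task may skip its
// work once a contiguous prefix of task orders has already produced the top
// limit and a contiguous suffix has produced the bottom limit. Unlike the WIP
// code with its hard-coded upper bound of 100, this sketch assumes the total
// task count is known and passed in:
#include <cstddef>
#include <map>

bool enoughRowsDone(const std::map<size_t, size_t>& rowsDone, size_t numTasks,
                    size_t topLimit, size_t bottomLimit) {
    size_t top = 0;
    bool topReached = (topLimit == 0);
    for (size_t i = 0; !topReached && rowsDone.count(i) != 0; ++i) {
        top += rowsDone.at(i);
        topReached = (top >= topLimit);
    }
    size_t bottom = 0;
    bool bottomReached = (bottomLimit == 0);
    for (size_t i = numTasks; !bottomReached && i > 0; --i) {
        auto it = rowsDone.find(i - 1);
        if (it == rowsDone.end())
            break;   // suffix of finished tasks is no longer contiguous
        bottom += it->second;
        bottomReached = (bottom >= bottomLimit);
    }
    return topReached && bottomReached;
}
// ---------------------------------------------------------------------------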
for(const auto &inputPartition : _inputPartitions) {
+            size_t numTopCompleted = 0;
+            size_t numBottomCompleted = 0;
+            bool isTopLimitReached = false;
+            bool isBottomLimitReached = false;
+
+            tuplex::g_rowsDoneMutex.lock();
+            for (size_t i = 0; tuplex::g_rowsDone.count(i) != 0; i++) {
+                numTopCompleted += tuplex::g_rowsDone[i];
+                if (numTopCompleted >= _outTopLimit) {
+                    isTopLimitReached = true;
+                    break;
+                }
+            }
+            // TODO: what is the max task number here
+            for (size_t i = 100; tuplex::g_rowsDone.count(i) != 0; i--) {
+                numBottomCompleted += tuplex::g_rowsDone[i];
+                if (numBottomCompleted >= _outBottomLimit) {
+                    isBottomLimitReached = true;
+                    break;
+                }
+            }
+            tuplex::g_rowsDoneMutex.unlock();
+
+            if (isTopLimitReached && isBottomLimitReached) {
+                // skip the execution, enough is done
+                break;
+            }
+
             // lock ptr, extract number of rows ==> store them
             // lock raw & call functor!
             int64_t inSize = inputPartition->size();
 
             // delete partition if desired...
             if(_invalidateSourceAfterUse)
                 inputPartition->invalidate();
+
+            tuplex::g_rowsDoneMutex.lock();
+            tuplex::g_rowsDone[getOrder(0)] += getNumOutputRows();
+            tuplex::g_rowsDoneMutex.unlock();
         }
 
 #ifndef NDEBUG
diff --git a/tuplex/utils/include/mt/ITask.h b/tuplex/utils/include/mt/ITask.h
index 8434896a7..01f7137f1 100644
--- a/tuplex/utils/include/mt/ITask.h
+++ b/tuplex/utils/include/mt/ITask.h
@@ -21,52 +21,65 @@
 
 namespace tuplex {
 
+/*!
+ * interface for defining tasks that can be run via a threadpool
+ */
+class ITask {
+private:
+    std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread.
+//! Per default object is constructed that does not represent a thread
+
+    std::vector<size_t> _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages
+
+public:
+    ITask() {};
+    ITask(const ITask& other) : _id(other._id), _orderNumbers(other._orderNumbers) {}
+    virtual ~ITask() = default;
+    ITask(ITask&& other) = default;
+    ITask& operator = (ITask&& other) = default;
+
     /*!
-     * interface for defining tasks that can be run via a threadpool
+     * interface to run a task
      */
-    class ITask {
-    private:
-        std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread.
-//! Per default object is constructed that does not represent a thread
+    virtual void execute() = 0;
 
-        size_t _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages
+    std::thread::id getID() {
+        return _id;
+    }
 
-    public:
-        ITask() {};
-        ITask(const ITask& other) : _id(other._id), _orderNumbers(other._orderNumbers) {}
-        virtual ~ITask() = default;
-        ITask(ITask&& other) = default;
-        ITask& operator = (ITask&& other) = default;
+    void setID(const std::thread::id& id) {
+        _id = id;
+    }
 
-        /*!
-         * interface to run a task
-         */
-        virtual void execute() = 0;
+    void setOrder(size_t order) { _orderNumbers = std::vector<size_t>{order}; }
 
-        std::thread::id getID() {
-            return _id;
-        }
+    size_t getOrder(const size_t nth) const {
+        return _orderNumbers[nth];
+    }
 
-        void setID(const std::thread::id& id) {
-            _id = id;
-        }
+    std::vector<size_t> getOrder() const { return _orderNumbers; }
 
-        void setOrder(size_t order) {
-            _orderNumbers = order;
-        }
+    void setOrder(const std::vector<size_t>& order) {
+        _orderNumbers = order;
+    }
 
-        size_t getOrder() const {
-            return _orderNumbers;
-        }
+    /*! 
+ * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones + * @param other + * @return + */ + bool compareAscOrder(const ITask& other) const { + // make sure they have the same length + assert(_orderNumbers.size() == other._orderNumbers.size()); - /*! - * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones - * @param other - * @return - */ - bool compareAscOrder(const ITask& other) const { - return _orderNumbers[i] < other._orderNumbers[i]; + // this < other? + // compare one by one + for(int i = 0; i < other._orderNumbers.size(); ++i) { + if(_orderNumbers[i] >= other._orderNumbers[i]) + return false; } - }; + return true; + } +}; } #endif //TUPLEX_ITASK_H \ No newline at end of file From 26ed614138e593bf38dae7a44c25ec6dbe278bf2 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 25 Mar 2022 00:16:11 -0400 Subject: [PATCH 34/56] Address Review Comments Address Review Comments (2) --- tuplex/core/include/DataSet.h | 5 +- tuplex/core/include/EmptyDataset.h | 4 +- tuplex/core/include/ErrorDataSet.h | 4 +- tuplex/core/include/Executor.h | 10 +- tuplex/core/include/ee/local/LocalBackend.h | 2 +- tuplex/core/include/logical/TakeOperator.h | 12 +-- tuplex/core/include/physical/ResultSet.h | 2 - tuplex/core/include/physical/StageBuilder.h | 5 +- tuplex/core/include/physical/TransformStage.h | 8 +- tuplex/core/src/DataSet.cc | 12 +-- tuplex/core/src/EmptyDataset.cc | 6 +- tuplex/core/src/ErrorDataSet.cc | 6 +- tuplex/core/src/Executor.cc | 4 - tuplex/core/src/ee/local/LocalBackend.cc | 19 ++-- tuplex/core/src/logical/TakeOperator.cc | 6 +- tuplex/core/src/physical/PhysicalPlan.cc | 2 +- tuplex/core/src/physical/ResultSet.cc | 2 +- tuplex/core/src/physical/StageBuilder.cc | 2 +- tuplex/python/src/PythonDataSet.cc | 2 +- tuplex/python/tuplex/dataset.py | 15 +-- tuplex/utils/include/mt/ITask.h | 94 +++++++++---------- 21 files changed, 104 insertions(+), 118 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 65a766a87..f6bb97f2c 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -263,13 +263,12 @@ namespace tuplex { // these are actions that cause execution virtual std::shared_ptr collect(std::ostream &os = std::cout); - virtual std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream &os = std::cout); + virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); virtual std::vector collectAsVector(std::ostream &os = std::cout); - virtual std::vector takeAsVector(int64_t numElements, std::ostream &os = std::cout); + virtual std::vector takeAsVector(size_t numElements, std::ostream &os = std::cout); - /*! * saves dataset to file. There are multiple options to control the behavior * ==> 1.) files can be split across multiple ones. 
Specify number of files to split rows to diff --git a/tuplex/core/include/EmptyDataset.h b/tuplex/core/include/EmptyDataset.h index 0f8a1f52c..6fc3219a4 100644 --- a/tuplex/core/include/EmptyDataset.h +++ b/tuplex/core/include/EmptyDataset.h @@ -70,13 +70,13 @@ namespace tuplex { virtual std::shared_ptr collect(std::ostream& os) override; // take / collect will print out the error only - virtual std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream& os) override; + virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream& os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; virtual std::vector collectAsVector(std::ostream& os) override; // take / collect will print out the error only - virtual std::vector takeAsVector(int64_t numElements, std::ostream& os) override; + virtual std::vector takeAsVector(size_t numElements, std::ostream& os) override; DataSet& cache(const Schema::MemoryLayout& memoryLayout, bool storeSpecialized) override { return *this; diff --git a/tuplex/core/include/ErrorDataSet.h b/tuplex/core/include/ErrorDataSet.h index 34fc60685..cf283ebd1 100644 --- a/tuplex/core/include/ErrorDataSet.h +++ b/tuplex/core/include/ErrorDataSet.h @@ -90,13 +90,13 @@ namespace tuplex { std::shared_ptr collect(std::ostream& os) override; // take / collect will print out the error only - std::shared_ptr take(int64_t numTop, int64_t numBottom, std::ostream& os) override; + std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream& os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; std::vector collectAsVector(std::ostream& os) override; // take / collect will print out the error only - std::vector takeAsVector(int64_t numElements, std::ostream& os) override; + std::vector takeAsVector(size_t numElements, std::ostream& os) override; }; } diff --git a/tuplex/core/include/Executor.h b/tuplex/core/include/Executor.h index 7eaaee244..b6b7edac1 100644 --- a/tuplex/core/include/Executor.h +++ b/tuplex/core/include/Executor.h @@ -44,7 +44,7 @@ namespace tuplex { */ class WorkQueue { private: - std::atomic_bool _done{}; // protects against data races + std::atomic_bool _done; // protects against data races ExecutorTaskQueueType _queue; std::mutex _completedTasksMutex; std::vector _completedTasks; @@ -75,14 +75,6 @@ namespace tuplex { size_t numCompletedTasks() const { return _numCompletedTasks; } - size_t frontRowsLimit() const { - return _frontRowsLimit; - }; - - size_t bottomRowsLimit() const { - return _bottomRowsLimit; - }; - /*! * stop working on this queue & dump all tasks */ diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index 0dbfafdc9..d7a5ec25b 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -89,7 +89,7 @@ namespace tuplex { MessageHandler& logger() const { return Logger::instance().logger("local ee"); } void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, TransformStage* tstage); - Partition* newPartitionWithSkipRows(Partition* p_in, int numToSkip, TransformStage* tstage); + Partition* newPartitionWithSkipRows(Partition* p_in, size_t numToSkip, TransformStage* tstage); // write output (may be already in correct format!) 
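// ---------------------------------------------------------------------------
// Aside -- an illustrative sketch (not part of the patch) of the row
// arithmetic behind trimPartitionsToLimit() declared above, reduced to
// per-partition row counts: keep partitions from the front until the top
// limit is covered, keep partitions from the back until the bottom limit is
// covered (leading rows of the first kept tail partition are dropped, cf.
// newPartitionWithSkipRows), and overlapping ranges collapse to keeping
// everything. trimmedCounts() is an invented name:
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// returns how many rows each partition keeps after trimming
std::vector<size_t> trimmedCounts(const std::vector<size_t>& parts,
                                  size_t topLimit, size_t bottomLimit) {
    size_t total = std::accumulate(parts.begin(), parts.end(), size_t(0));
    if (topLimit + bottomLimit >= total)
        return parts;                    // ranges overlap: keep all rows
    std::vector<size_t> kept(parts.size(), 0);
    size_t need = topLimit;
    for (size_t i = 0; i < parts.size() && need > 0; ++i) {   // head share
        kept[i] = std::min(parts[i], need);
        need -= kept[i];
    }
    need = bottomLimit;
    for (size_t i = parts.size(); i-- > 0 && need > 0; ) {    // tail share
        size_t take = std::min(parts[i], need);
        kept[i] += take;                 // head and tail may meet in one partition
        need -= take;
    }
    return kept;
}
// ---------------------------------------------------------------------------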
void writeOutput(TransformStage* tstage, std::vector& sortedTasks); diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h index b5dd5db6e..fe5f1b0f2 100644 --- a/tuplex/core/include/logical/TakeOperator.h +++ b/tuplex/core/include/logical/TakeOperator.h @@ -17,16 +17,16 @@ namespace tuplex { class TakeOperator : public LogicalOperator { private: - int64_t _limitTop; - int64_t _limitBottom; + size_t _topLimit; + size_t _bottomLimit; public: LogicalOperator *clone() override; public: - TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom); + TakeOperator(LogicalOperator *parent, size_t topLimit, size_t bottomLimit); std::string name() override { - if(_limitTop < 0 || std::numeric_limits::max() == _limitTop) + if(_topLimit == 0 && _bottomLimit == 0) return "collect"; return "take"; } @@ -38,9 +38,9 @@ namespace tuplex { bool good() const override; - int64_t limit() { return _limitTop; } + size_t topLimit() const { return _topLimit; } - int64_t limitBottom() { return _limitBottom; } + size_t bottomLimit() const { return _bottomLimit; } std::vector getSample(const size_t num) const override; diff --git a/tuplex/core/include/physical/ResultSet.h b/tuplex/core/include/physical/ResultSet.h index 5e69fef3a..e94b8f1ae 100644 --- a/tuplex/core/include/physical/ResultSet.h +++ b/tuplex/core/include/physical/ResultSet.h @@ -36,8 +36,6 @@ namespace tuplex { size_t _rowsRetrieved; size_t _totalRowCounter; // used for merging in rows! size_t _maxRows; - size_t _maxRowsTop; - size_t _maxRowsBottom; Schema _schema; void removeFirstPartition(); diff --git a/tuplex/core/include/physical/StageBuilder.h b/tuplex/core/include/physical/StageBuilder.h index e678ead3d..83e63208a 100644 --- a/tuplex/core/include/physical/StageBuilder.h +++ b/tuplex/core/include/physical/StageBuilder.h @@ -76,7 +76,7 @@ namespace tuplex { void addFileInput(FileInputOperator* csvop); void addFileOutput(FileOutputOperator* fop); - inline void setOutputLimit(size_t topLimit, size_t bottomLimit) { + inline void setOutputLimit(size_t topLimit, size_t bottomLimit = 0) { _outputTopLimit = topLimit; _outputBottomLimit = bottomLimit; } @@ -158,8 +158,9 @@ namespace tuplex { size_t number() const { return _stageNumber; } int64_t outputDataSetID() const; + // default case: both _outputTopLimit and _outputBottomLimit is zero = take everything inline bool hasOutputLimit() const { - return _outputTopLimit < std::numeric_limits::max() || _outputBottomLimit > 0; + return _outputTopLimit != 0 || _outputBottomLimit != 0; } inline char csvOutputDelimiter() const { diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index f489f1f6c..05c7df448 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -393,6 +393,10 @@ namespace tuplex { */ void setDataAggregationMode(const AggregateType& t) { _aggMode = t; } + // default case: both _outputTopLimit and _outputBottomLimit is zero = take everything + bool hasOutputLimit() const { + return _outputTopLimit != 0 || _outputBottomLimit != 0; + } private: /*! 
* creates a new TransformStage with generated code @@ -481,10 +485,6 @@ namespace tuplex { // for hash output, the key and bucket type python::Type _hashOutputKeyType; python::Type _hashOutputBucketType; - - bool hasOutputLimit() const { - return _outputTopLimit < std::numeric_limits::max() && _outputBottomLimit != 0; - } }; } #endif //TUPLEX_TRANSFORMSTAGE_H \ No newline at end of file diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index 3de903d1c..c11482f86 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -38,21 +38,17 @@ namespace tuplex { } std::shared_ptr DataSet::collect(std::ostream &os) { - return take(-1, false, os); + return take(0, 0, os); } - std::shared_ptr DataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) { + std::shared_ptr DataSet::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { // error dataset? if (isError()) throw std::runtime_error("is error dataset!"); - // negative numbers mean get all elements! - if (numTop < 0) - numTop = std::numeric_limits::max(); - // create a take node assert(_context); - LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, numTop, numBottom)); + LogicalOperator *op = _context->addOperator(new TakeOperator(this->_operator, topLimit, bottomLimit)); DataSet *dsptr = _context->createDataSet(op->getOutputSchema()); dsptr->_operator = op; op->setDataSet(dsptr); @@ -70,7 +66,7 @@ namespace tuplex { } // -1 means to retrieve all elements - std::vector DataSet::takeAsVector(int64_t numElements, std::ostream &os) { + std::vector DataSet::takeAsVector(size_t numElements, std::ostream &os) { auto rs = take(numElements, false, os); Timer timer; diff --git a/tuplex/core/src/EmptyDataset.cc b/tuplex/core/src/EmptyDataset.cc index 7504e8499..3664a591a 100644 --- a/tuplex/core/src/EmptyDataset.cc +++ b/tuplex/core/src/EmptyDataset.cc @@ -11,16 +11,16 @@ #include namespace tuplex { - std::shared_ptr EmptyDataset::take(int64_t numTop, int64_t numBottom, std::ostream &os) { + std::shared_ptr EmptyDataset::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { return std::make_shared(); } - std::vector EmptyDataset::takeAsVector(int64_t numElements, std::ostream &os) { + std::vector EmptyDataset::takeAsVector(size_t numElements, std::ostream &os) { return std::vector{}; } std::shared_ptr EmptyDataset::collect(std::ostream &os) { - return take(0, false, os); + return take(0, 0, os); } std::vector EmptyDataset::collectAsVector(std::ostream &os) { diff --git a/tuplex/core/src/ErrorDataSet.cc b/tuplex/core/src/ErrorDataSet.cc index 9d19594f2..c87999e5f 100644 --- a/tuplex/core/src/ErrorDataSet.cc +++ b/tuplex/core/src/ErrorDataSet.cc @@ -12,7 +12,7 @@ namespace tuplex { - std::vector ErrorDataSet::takeAsVector(int64_t numElements, std::ostream &os) { + std::vector ErrorDataSet::takeAsVector(size_t numElements, std::ostream &os) { // return empty vector and print err message Logger::instance().logger("core").error(this->_error); @@ -23,7 +23,7 @@ namespace tuplex { return takeAsVector(0, os); } - std::shared_ptr ErrorDataSet::take(int64_t numTop, int64_t numBottom, std::ostream &os) { + std::shared_ptr ErrorDataSet::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { // return empty vector and print err message Logger::instance().logger("core").error(this->_error); @@ -31,7 +31,7 @@ namespace tuplex { } std::shared_ptr ErrorDataSet::collect(std::ostream &os) { - return take(0, false, os); + return take(0, 0, os); } void diff --git 
a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc index 388199e4d..acfdd0aa6 100644 --- a/tuplex/core/src/Executor.cc +++ b/tuplex/core/src/Executor.cc @@ -33,10 +33,6 @@ namespace tuplex { std::vector WorkQueue::popCompletedTasks() { TRACE_LOCK("workQueue"); - _taskDoneMutex.lock(); - _taskDone.clear(); - _taskDoneMutex.unlock(); - std::lock_guard lock(_completedTasksMutex); // move leads to circular dependency in gcc and thus a bug on travis-ci. Therefore, just // use the below hack to fool the compiler into actually copying the vectors diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index dbceaa1b9..022d5a036 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -551,6 +551,7 @@ namespace tuplex { task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); task->setOutputTopLimit(tstage->outputTopLimit()); + task->setOutputBottomLimit(tstage->outputBottomLimit()); // add to tasks tasks.emplace_back(std::move(task)); } else { @@ -585,6 +586,7 @@ namespace tuplex { task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); task->setOutputTopLimit(tstage->outputTopLimit()); + task->setOutputBottomLimit(tstage->outputBottomLimit()); // add to tasks tasks.emplace_back(std::move(task)); num_parts++; @@ -622,6 +624,7 @@ namespace tuplex { task->sinkExceptionsToMemory(inputSchema); task->setStageID(tstage->getID()); task->setOutputTopLimit(tstage->outputTopLimit()); + task->setOutputBottomLimit(tstage->outputBottomLimit()); // add to tasks tasks.emplace_back(std::move(task)); @@ -1197,7 +1200,7 @@ namespace tuplex { } if (tstage->hasOutputLimit()) { - trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit()); + trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage); } tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts); @@ -1560,7 +1563,7 @@ namespace tuplex { for (int i = 0; i < tasks.size(); i++) { // take limit only work with uniform order - assert(task.getOrder(0) == i); + assert(tasks[i]->getOrder(0) == i); } // add all tasks to queue @@ -2165,9 +2168,9 @@ namespace tuplex { // clip last partition & leave loop auto clipped = bottomLimit - (numTopOutputRows - partition->getNumRows()); assert(clipped <= partition->getNumRows()); - Partition newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); + Partition* newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); partition->invalidate(); - parition = newPart; + partition = newPart; assert(partition->getNumRows() == clipped); if (clipped > 0) limitedTailPartitions.push_back(partition); @@ -2199,12 +2202,12 @@ namespace tuplex { } // merge the head and tail partitions - partitions.clear() + partitions.clear(); partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end()); partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend()); } - Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, int numToSkip, TransformStage* tstage) { + Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage* tstage) { if(!numToSkip) return nullptr; @@ -2220,7 +2223,7 @@ namespace tuplex { size_t numBytesToSkip = 0; for(unsigned i = 0; i < numToSkip; ++i) { - Rows r = 
Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip); + Row r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip); ptr += r.serializedLength(); numBytesToSkip += r.serializedLength(); } @@ -2228,7 +2231,7 @@ namespace tuplex { auto ptr_out = p_out->lockRaw(); *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip; ptr_out += sizeof(int64_t); - memcpy(ptr_out, ptr, p_in->size() - numBytesToSkip); + memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); p_out->unlock(); p_in->unlock(); diff --git a/tuplex/core/src/logical/TakeOperator.cc b/tuplex/core/src/logical/TakeOperator.cc index e588b5e97..49a4452b4 100644 --- a/tuplex/core/src/logical/TakeOperator.cc +++ b/tuplex/core/src/logical/TakeOperator.cc @@ -12,13 +12,13 @@ #include namespace tuplex { - TakeOperator::TakeOperator(LogicalOperator *parent, const int64_t numTop, const int64_t numBottom) : LogicalOperator::LogicalOperator(parent), _limitTop(numTop), _limitBottom(numBottom) { + TakeOperator::TakeOperator(LogicalOperator *parent, size_t topLimit, size_t bottomLimit) : LogicalOperator::LogicalOperator(parent), _topLimit(topLimit), _bottomLimit(bottomLimit) { // take schema from parent node setSchema(this->parent()->getOutputSchema()); } bool TakeOperator::good() const { - return _limitTop >= -1 && _limitBottom >= -1; + return _topLimit >= 0 && _bottomLimit >= 0; } std::vector TakeOperator::getSample(const size_t num) const { @@ -33,7 +33,7 @@ namespace tuplex { LogicalOperator *TakeOperator::clone() { // create clone of this operator - auto copy = new TakeOperator(parent()->clone(), _limitTop, _limitBottom); + auto copy = new TakeOperator(parent()->clone(), _topLimit, _bottomLimit); copy->setDataSet(getDataSet()); // weak ptr to old dataset... copy->copyMembers(this); diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index ff67e4add..f289064d5 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -382,7 +382,7 @@ namespace tuplex { // set limit if output node has a limit (currently only TakeOperator) if(outputNode->type() == LogicalOperatorType::TAKE) { auto top = static_cast(outputNode); - builder.setOutputLimit(top->limit(), top->limitBottom()); + builder.setOutputLimit(top->topLimit(), top->bottomLimit()); } // @TODO: add slowPip builder to this process... 
diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index e31e78cec..0f7bf7319 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -98,7 +98,7 @@ namespace tuplex { Partition *first = _partitions.front(); assert(_schema == first->schema()); - auto numRows = first->getNumRows() - first->getNumSkip(); + auto numRows = first->getNumRows(); _rowsRetrieved += numRows; _partitions.pop_front(); diff --git a/tuplex/core/src/physical/StageBuilder.cc b/tuplex/core/src/physical/StageBuilder.cc index bc814182b..78bc8dea4 100644 --- a/tuplex/core/src/physical/StageBuilder.cc +++ b/tuplex/core/src/physical/StageBuilder.cc @@ -458,7 +458,7 @@ namespace tuplex { } case LogicalOperatorType::TAKE: { auto takeOp = dynamic_cast(node); - opt_ops.push_back(new TakeOperator(lastParent, takeOp->limit(), takeOp->limitBottom())); + opt_ops.push_back(new TakeOperator(lastParent, takeOp->topLimit(), takeOp->bottomLimit())); opt_ops.back()->setID(node->getID()); break; } diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 853b910db..66f94e33f 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -130,7 +130,7 @@ namespace tuplex { std::shared_ptr rs; std::string err_message = ""; try { - rs = _dataset->take(numRows, ss); + rs = _dataset->take(numTop, numBottom, ss); if(!rs) throw std::runtime_error("invalid result set"); // if there are more than 1 million (100k in debug mode) elements print message... diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 1046505f2..850b4ed83 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -191,24 +191,25 @@ def collect(self): assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' return self._dataSet.collect() - def take(self, nrows=5, nbottom=0): + def take(self, limitTop=5, limitBottom=0): """ action that generates a physical plan, processes data and collects the top results then as list of tuples. Args: - nrows (int): number of rows to collect. Per default ``5``. + limitTop (int): number of top rows to collect. Per default ``5``. + limitBottom (int): number of bottom rows to collect. Per default ``0``. Returns: (list): A list of tuples """ - assert isinstance(nrows, int), 'num rows must be an integer' - assert nrows > 0, 'please specify a number greater than zero' - assert isinstance(nbottom, int), 'num bottom last must be an integer' - assert nbottom >= 0, 'please specify a number greater or equal to zero' + assert isinstance(limitTop, int), 'num rows must be an integer' + assert limitTop > 0, 'please specify a number greater than zero' + assert isinstance(limitBottom, int), 'num bottom last must be an integer' + assert limitBottom >= 0, 'please specify a number greater or equal to zero' assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - return self._dataSet.take(nrows, nbottom) + return self._dataSet.take(limitTop, limitBottom) def show(self, nrows=None): """ action that generates a physical plan, processes data and prints results as nicely formatted diff --git a/tuplex/utils/include/mt/ITask.h b/tuplex/utils/include/mt/ITask.h index a5ca4058f..6c85d2d36 100644 --- a/tuplex/utils/include/mt/ITask.h +++ b/tuplex/utils/include/mt/ITask.h @@ -21,65 +21,65 @@ namespace tuplex { -/*! 
- * interface for defining tasks that can be run via a threadpool - */ -class ITask { -private: - std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread. + /*! + * interface for defining tasks that can be run via a threadpool + */ + class ITask { + private: + std::thread::id _id; //! the id of the thread that executed the task. Used to specifically execute a task on a specific thread. //! Per default object is constructed that does not represent a thread - std::vector _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages + std::vector _orderNumbers; //! for sorting tasks when doing async processing, allows for multiple stages -public: - ITask() {}; - ITask(const ITask& other) : _id(other._id), _orderNumbers(other._orderNumbers) {} - virtual ~ITask() = default; - ITask(ITask&& other) = default; - ITask& operator = (ITask&& other) = default; + public: + ITask() {}; + ITask(const ITask& other) : _id(other._id), _orderNumbers(other._orderNumbers) {} + virtual ~ITask() = default; + ITask(ITask&& other) = default; + ITask& operator = (ITask&& other) = default; - /*! - * interface to run a task - */ - virtual void execute() = 0; + /*! + * interface to run a task + */ + virtual void execute() = 0; - std::thread::id getID() { - return _id; - } + std::thread::id getID() { + return _id; + } - void setID(const std::thread::id& id) { - _id = id; - } + void setID(const std::thread::id& id) { + _id = id; + } - void setOrder(size_t order) { _orderNumbers = std::vector{order}; } + void setOrder(size_t order) { _orderNumbers = std::vector{order}; } - size_t getOrder(const size_t nth) const { - return _orderNumbers[nth]; - } + size_t getOrder(size_t nth) const { + return _orderNumbers[nth]; + } - std::vector getOrder() const { return _orderNumbers; } + std::vector getOrder() const { return _orderNumbers; } - void setOrder(const std::vector& order) { - _orderNumbers = order; - } + void setOrder(const std::vector& order) { + _orderNumbers = order; + } - /*! - * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones - * @param other - * @return - */ - bool compareAscOrder(const ITask& other) const { - // make sure they have the same length - assert(_orderNumbers.size() == other._orderNumbers.size()); + /*! + * compare the ordering numbers of two tasks to restore initial data order after processing multiple ones + * @param other + * @return + */ + bool compareAscOrder(const ITask& other) const { + // make sure they have the same length + assert(_orderNumbers.size() == other._orderNumbers.size()); - // this < other? - // compare one by one - for(int i = 0; i < other._orderNumbers.size(); ++i) { - if(_orderNumbers[i] >= other._orderNumbers[i]) - return false; + // this < other? 
+ // compare one by one + for(int i = 0; i < other._orderNumbers.size(); ++i) { + if(_orderNumbers[i] >= other._orderNumbers[i]) + return false; + } + return true; } - return true; - } -}; + }; } #endif //TUPLEX_ITASK_H \ No newline at end of file From 2cdd269c11dd4cd87d0958846cb2338b7c4e06c8 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 25 Mar 2022 00:52:36 -0400 Subject: [PATCH 35/56] Debugging Tests --- tuplex/core/src/ee/local/LocalBackend.cc | 7 ++++++- tuplex/core/src/physical/ResultSet.cc | 3 +-- tuplex/test/core/TakeTest.cc | 4 +++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 022d5a036..d51ef4523 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -884,7 +884,12 @@ namespace tuplex { // special case: skip stage, i.e. empty code and mem2mem if(tstage->code().empty() && !tstage->fileInputMode() && !tstage->fileOutputMode()) { auto pyObjects = inputExceptionsToPythonObjects(tstage->inputExceptions(), tstage->normalCaseInputSchema()); - tstage->setMemoryResult(tstage->inputPartitions(), std::vector{}, std::unordered_map(), pyObjects); + + auto output_par = tstage->inputPartitions(); + if (tstage->hasOutputLimit()) { + trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage); + } + tstage->setMemoryResult(output_par, std::vector{}, std::unordered_map(), pyObjects); pyObjects.clear(); // skip stage Logger::instance().defaultLogger().info("[Transform Stage] skipped stage " + std::to_string(tstage->number()) + " because there is nothing todo here."); diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index 0f7bf7319..bfd656dc8 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -138,8 +138,7 @@ namespace tuplex { auto num_rows = first->getNumRows(); // how many left to retrieve? auto num_to_retrieve_from_partition = std::min(limit - i, num_rows - _curRowCounter); - if(num_to_retrieve_from_partition <= 0) - break; + assert(num_to_retrieve_from_partition >= 0); // make sure partition schema matches stored schema assert(_schema == first->schema()); diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 08b648f34..2d8f81f2f 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -122,4 +122,6 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v3[2].getString(0), "!"); EXPECT_EQ(v3[3].getString(0), "! :)"); EXPECT_EQ(v3[4].getString(0), "!"); -} \ No newline at end of file +} + +// TODO(march): test empty code when reusing partitions. 
This might not work if user reused dataset \ No newline at end of file From c203de49f1dcd75062703703c8778eae2bd768e7 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 6 Apr 2022 23:55:59 -0400 Subject: [PATCH 36/56] Change definition of take all --- tuplex/core/include/DataSet.h | 2 +- tuplex/core/include/logical/TakeOperator.h | 2 +- tuplex/core/include/physical/StageBuilder.h | 3 +-- tuplex/core/include/physical/TransformStage.h | 2 +- tuplex/core/include/physical/TransformTask.h | 2 -- tuplex/core/src/DataSet.cc | 14 +++++++------- tuplex/core/src/physical/TransformTask.cc | 3 +-- tuplex/python/include/PythonDataSet.h | 2 +- tuplex/python/src/PythonDataSet.cc | 17 ++++++++++++++--- 9 files changed, 27 insertions(+), 20 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index f6bb97f2c..86ca23b6a 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -128,7 +128,7 @@ namespace tuplex { * @param numRows how many rows to print, i.e. top numRows are printed.xs * @param os ostream where to print table to */ - virtual void show(const int64_t numRows = -1, std::ostream &os = std::cout); + virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); // named dataset management functions /*! diff --git a/tuplex/core/include/logical/TakeOperator.h b/tuplex/core/include/logical/TakeOperator.h index fe5f1b0f2..f3841236b 100644 --- a/tuplex/core/include/logical/TakeOperator.h +++ b/tuplex/core/include/logical/TakeOperator.h @@ -26,7 +26,7 @@ namespace tuplex { TakeOperator(LogicalOperator *parent, size_t topLimit, size_t bottomLimit); std::string name() override { - if(_topLimit == 0 && _bottomLimit == 0) + if(_topLimit == std::numeric_limits::max() || _bottomLimit == std::numeric_limits::max()) return "collect"; return "take"; } diff --git a/tuplex/core/include/physical/StageBuilder.h b/tuplex/core/include/physical/StageBuilder.h index 83e63208a..1c322b9a6 100644 --- a/tuplex/core/include/physical/StageBuilder.h +++ b/tuplex/core/include/physical/StageBuilder.h @@ -158,9 +158,8 @@ namespace tuplex { size_t number() const { return _stageNumber; } int64_t outputDataSetID() const; - // default case: both _outputTopLimit and _outputBottomLimit is zero = take everything inline bool hasOutputLimit() const { - return _outputTopLimit != 0 || _outputBottomLimit != 0; + return _outputTopLimit != std::numeric_limits::max() && _outputBottomLimit != std::numeric_limits::max(); } inline char csvOutputDelimiter() const { diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index 05c7df448..e1e45c97b 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -395,7 +395,7 @@ namespace tuplex { // default case: both _outputTopLimit and _outputBottomLimit is zero = take everything bool hasOutputLimit() const { - return _outputTopLimit != 0 || _outputBottomLimit != 0; + return _outputTopLimit != std::numeric_limits::max() && _outputBottomLimit != std::numeric_limits::max(); } private: /*! 
diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index d065e86d3..e2b8bc5b6 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -184,7 +184,6 @@ namespace tuplex { void setOutputTopLimit(size_t limit) { _outTopLimit = limit; resetOutputLimitCounter(); } void setOutputBottomLimit(size_t limit) { _outBottomLimit = limit; resetOutputLimitCounter(); } - void setOutputSkip(size_t numRowsToSkip) { _outSkipRows = numRowsToSkip; } void execute() override; bool hasFileSink() const { return _outputFilePath != URI::INVALID; } @@ -281,7 +280,6 @@ namespace tuplex { size_t _outTopLimit; // limits how many rows to write at max size_t _outBottomLimit; // limits how many last rows to write at max - size_t _outSkipRows; // how many rows at start to skip // memory source variables std::vector _inputPartitions; diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index c11482f86..d54edb567 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -38,7 +38,7 @@ namespace tuplex { } std::shared_ptr DataSet::collect(std::ostream &os) { - return take(0, 0, os); + return take(std::numeric_limits::max(), 0, os); } std::shared_ptr DataSet::take(size_t topLimit, size_t bottomLimit, std::ostream &os) { @@ -62,18 +62,14 @@ namespace tuplex { // collect functions std::vector DataSet::collectAsVector(std::ostream &os) { - return takeAsVector(-1, os); + return takeAsVector(std::numeric_limits::max(), os); } - // -1 means to retrieve all elements std::vector DataSet::takeAsVector(size_t numElements, std::ostream &os) { auto rs = take(numElements, false, os); Timer timer; #warning "limiting should make this hack irrelevant..." - if (numElements < 0) - numElements = std::numeric_limits::max(); - // std::vector v; // while (rs->hasNextRow() && v.size() < numElements) { // v.push_back(rs->getNextRow()); @@ -730,10 +726,14 @@ namespace tuplex { } - void DataSet::show(const int64_t numRows, std::ostream &os) { + void DataSet::show(int64_t numRows, std::ostream &os) { assert(_context); // get rows + if (numRows < 0) { + numRows = std::numeric_limits::max(); + } + auto rows = takeAsVector(numRows, os); if (rows.empty()) { return; diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index 49d104bcc..377385deb 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -509,7 +509,6 @@ namespace tuplex { _outPrefix.reset(); _outTopLimit = std::numeric_limits::max(); // write all rows _outBottomLimit = 0; - _outSkipRows = 0; // skip no rows // reset memory sink _output.reset(); @@ -705,7 +704,7 @@ namespace tuplex { // skip rows? limit rows?? 
- if(_numOutputRowsWritten >= _outSkipRows && _numOutputRowsWritten < (_outTopLimit - _outSkipRows)) { + if(_numOutputRowsWritten < _outTopLimit) { if(_outFile->write(buf, bufSize) != VirtualFileSystemStatus::VFS_OK) return ecToI32(ExceptionCode::IOERROR); } diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 23b09314d..ede482d9c 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -77,7 +77,7 @@ namespace tuplex { PythonDataSet resolve(const int64_t exceptionCode, const std::string& lambda_code, const std::string& pickled_code, const py::object& closure=py::object()); py::object collect(); - py::object take(const int64_t numTop, const int64_t numBottom); + py::object take(const int64_t topLimit, const int64_t bottomLimit); void show(const int64_t numRows=-1); // DataFrame like operations diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 66f94e33f..f6079a143 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -107,7 +107,7 @@ namespace tuplex { } } - py::object PythonDataSet::take(const int64_t numTop, const int64_t numBottom) { + py::object PythonDataSet::take(const int64_t topLimit, const int64_t bottomLimit) { // make sure a dataset is wrapped assert(this->_dataset); @@ -129,8 +129,19 @@ namespace tuplex { std::shared_ptr rs; std::string err_message = ""; + + size_t castedTopLimit = 0; + if (topLimit < 0) { + castedTopLimit = std::numeric_limits::max(); + } + + size_t castedBottomLimit = 0; + if (bottomLimit < 0) { + castedBottomLimit = std::numeric_limits::max(); + } + try { - rs = _dataset->take(numTop, numBottom, ss); + rs = _dataset->take(castedTopLimit, castedBottomLimit, ss); if(!rs) throw std::runtime_error("invalid result set"); // if there are more than 1 million (100k in debug mode) elements print message... 
@@ -162,7 +173,7 @@ namespace tuplex { // new version, directly interact with the interpreter Timer timer; // build python list object from resultset - auto listObj = resultSetToCPython(rs.get(), numTop); + auto listObj = resultSetToCPython(rs.get(), castedTopLimit); Logger::instance().logger("python").info("Data transfer back to python took " + std::to_string(timer.time()) + " seconds"); // Logger::instance().flushAll(); From 664cd14a5ef0b1d2bba2723b4fc914a395d1765e Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 6 Apr 2022 23:56:59 -0400 Subject: [PATCH 37/56] Random take test with some debugging --- tuplex/core/src/ee/local/LocalBackend.cc | 35 ++++----- tuplex/core/src/physical/TransformStage.cc | 6 -- tuplex/test/core/TakeTest.cc | 87 +++++++++++++++++++++- 3 files changed, 101 insertions(+), 27 deletions(-) diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index d51ef4523..0b8157ecc 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -2143,7 +2143,7 @@ namespace tuplex { clippedTop = topLimit - (numTopOutputRows - partition->getNumRows()); assert(clippedTop <= partition->getNumRows()); break; - } else if (partition == *partitions.end()) { + } else if (partition == partitions.back()) { // last partition, mark full row, but don't put to output set yet to avoid double put clippedTop = partition->getNumRows(); break; @@ -2171,14 +2171,14 @@ namespace tuplex { break; } else if (numBottomOutputRows >= bottomLimit) { // clip last partition & leave loop - auto clipped = bottomLimit - (numTopOutputRows - partition->getNumRows()); + auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); assert(clipped <= partition->getNumRows()); - Partition* newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); + if (clipped > 0) { + Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); + assert(newPart->getNumRows() == clipped); + limitedTailPartitions.push_back(newPart); + } partition->invalidate(); - partition = newPart; - assert(partition->getNumRows() == clipped); - if (clipped > 0) - limitedTailPartitions.push_back(partition); break; } else { // put full partition to output set @@ -2197,9 +2197,12 @@ namespace tuplex { lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, tstage); } - lastTopPart->setNumRows(clippedTop); - - limitedPartitions.push_back(lastTopPart); + if (clippedTop != 0) { + lastTopPart->setNumRows(clippedTop); + limitedPartitions.push_back(lastTopPart); + } else { + lastTopPart->invalidate(); + } if (lastBottomPart != nullptr) { limitedPartitions.push_back(lastBottomPart); @@ -2213,17 +2216,10 @@ namespace tuplex { } Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage* tstage) { - if(!numToSkip) - return nullptr; - auto ptr = p_in->lockRaw(); auto num_rows = *((int64_t*) ptr); assert(numToSkip < num_rows); - Partition *p_out = _driver->allocWritablePartition(num_rows - numToSkip + sizeof(int64_t), - tstage->outputSchema(), tstage->outputDataSetID(), - tstage->context().id()); - ptr += sizeof(int64_t); size_t numBytesToSkip = 0; @@ -2233,6 +2229,11 @@ namespace tuplex { numBytesToSkip += r.serializedLength(); } + Partition *p_out = _driver->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), + tstage->outputSchema(), tstage->outputDataSetID(), + tstage->context().id()); 
+ assert(p_out->capacity() >= p_in->size() - numBytesToSkip); + auto ptr_out = p_out->lockRaw(); *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip; ptr_out += sizeof(int64_t); diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index 6e216ac5c..060365697 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -133,15 +133,9 @@ namespace tuplex { auto schema = Schema::UNKNOWN; if(!partitions.empty()) { - size_t totalRowsCount = 0; schema = partitions.front()->schema(); for (auto partition : partitions) { assert(schema == partition->schema()); - totalRowsCount += partition->getNumRows(); - } - - if (hasOutputLimit()) { - assert(totalRowsCount == _outputTopLimit + _outputBottomLimit); } } diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 2d8f81f2f..3990fcd07 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -8,13 +8,51 @@ // License: Apache 2.0 // //--------------------------------------------------------------------------------------------------------------------// +#include + #include #include "TestUtils.h" +using namespace tuplex; +using namespace std; + class TakeTest : public PyTest {}; +/** + * Randomly generate a vector of rows for testing + * @param N the size of vector + * @return a vector of size N, containing the random data + */ +vector generateTestData(size_t N, uint64_t seed) { + mt19937 gen(seed); //Standard mersenne_twister_engine seeded with rd() + uniform_int_distribution<> distrib(1, 100000000); + + vector data; + data.reserve(N); + + for (int i = 0; i < N; i++) { + data.emplace_back(distrib(gen), distrib(gen), distrib(gen)); + } + + return data; +} + +vector generateReferenceData(const vector& input, size_t topLimit, size_t bottomLimit) { + vector output; + for(size_t i = 0; i < topLimit && i < input.size(); i++) { + output.push_back(input[i]); + } + size_t start_bottom = input.size() >= bottomLimit ? input.size() - bottomLimit : 0; + start_bottom = max(topLimit, start_bottom); + + for(size_t i = start_bottom; i < input.size(); i++) { + output.push_back(input[i]); + } + + return output; +} + TEST_F(TakeTest, takeTopTest) { - using namespace tuplex; auto opt = testOptions(); Context context(opt); @@ -51,7 +89,6 @@ TEST_F(TakeTest, takeTopTest) { } TEST_F(TakeTest, takeBottomTest) { - using namespace tuplex; auto opt = testOptions(); Context context(opt); @@ -88,7 +125,6 @@ TEST_F(TakeTest, takeBottomTest) { } TEST_F(TakeTest, takeBothTest) { - using namespace tuplex; auto opt = testOptions(); Context context(opt); @@ -124,4 +160,47 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v3[4].getString(0), "!"); } -// TODO(march): test empty code when reusing partitions. 
This might not work if user reused dataset \ No newline at end of file +TEST_F(TakeTest, takeBigTest) { + mt19937 data_seed_gen(4242); + + const std::vector test_size{1, 10, 100, 1001, 10001}; + const std::vector limit_values{0, 1, 5, 11, 600, 10000}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + + for(auto& part_size : partition_sizes) { + auto opt = testOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for(auto data_size : test_size) { + for (auto top_limit: limit_values) { + for (auto bottom_limit: limit_values) { + std::cout << "testing with partition size:" << part_size << " data size:" + << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + + auto data = generateTestData(data_size, data_seed_gen()); + auto ref_data = generateReferenceData(data, top_limit, bottom_limit); + + auto res = context.parallelize(data).take(top_limit, bottom_limit); + ASSERT_EQ(ref_data.size(), res->rowCount()); + for (Row &r: ref_data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } + } + } +} + +// TODO(march): with map, filter function +//TEST_F(TakeTest, takeMapFilterTest) { +// srand(4242); +//} + +// TODO(march): with file input +// context.csv("../resources/"); + +// TODO(march): collect operator \ No newline at end of file From 5048a9b81d35d5c97b14f2970e6316bc23b575a5 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Thu, 7 Apr 2022 23:28:54 -0400 Subject: [PATCH 38/56] Polish the python interface --- tuplex/core/include/DataSet.h | 8 ++ tuplex/core/src/DataSet.cc | 109 +++++++++++++++++++++++++ tuplex/python/include/PythonDataSet.h | 1 + tuplex/python/src/PythonBindings.cc | 1 + tuplex/python/src/PythonDataSet.cc | 49 +++++++++++ tuplex/python/tuplex/dataset.py | 112 +++++++------------------- 6 files changed, 196 insertions(+), 84 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 86ca23b6a..9510427e2 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -130,6 +130,14 @@ namespace tuplex { */ virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); + /*! + * Displays a formatted HTML table of a small portion of the data + * @param topLimit how many top rows to print + * @param bottomLimit how many bottom rows to print + * @param os ostream where to print table to + */ + virtual void showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); + // named dataset management functions /*! * map Column using a UDF diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index d54edb567..210b3ec60 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -756,6 +756,115 @@ namespace tuplex { printTable(os, headers, rows); } + void printHTMLRow(std::ostream &os, size_t ind, const Row& r) { + os << " \n"; + os << fmt::format(" {}\n", ind); + for (auto& s : r.getAsStrings()) { + os << fmt::format(" {}\n", s); + } + os << " \n"; + } + + void DataSet::showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os) { + std::string HTML_TEMPLATE = + "
\n" + "\n" + "\n" + " \n" + " \n" + "{}" + " \n" + " \n" + " \n" + "{}" + " \n" + "
\n" + "

{} columns

\n" + "
"; + + assert(_context); + + auto rows = take(topLimit, bottomLimit); + + if (rows->rowCount() == 0) { + os << fmt::format(HTML_TEMPLATE, "\n", "\n", 0); + return; + } + + std::stringstream headers_stream, body_stream; + size_t numColumns = 0; + assert(rows->rowCount() <= topLimit + bottomLimit); + + // construct tables + if (rows->rowCount() < topLimit + bottomLimit) { + // the data is small so we get everything (no need to render ...) + for (size_t i = 0; rows->hasNextRow(); i++) { + Row r = rows->getNextRow(); + if (i == 0) { + // we set num columns based on the first row + numColumns = r.getNumColumns(); + } + + printHTMLRow(body_stream, i, r); + } + } else { + // some data is not processed because of limiting + size_t i; + for (i = 0; rows->hasNextRow() && i < topLimit; i++) { + Row r = rows->getNextRow(); + if (i == 0) { + // we set num columns based on the first row + numColumns = r.getNumColumns(); + } + + printHTMLRow(body_stream, i, r); + } + + // add the ... + body_stream << " \n"; + body_stream << " ...\n"; + for(int j = 0; j < numColumns; j++) { + body_stream << " ...\n"; + body_stream << " \n"; + } + + while (rows->hasNextRow()) { + Row r = rows->getNextRow(); + printHTMLRow(body_stream, i, r); + } + } + + assert(numColumns != 0); + + // construct headers + std::vector headers(numColumns); + if (!_columnNames.empty()) { + assert(numColumns == _columnNames.size()); + for (auto &c_name: _columnNames) { + headers_stream << fmt::format(" {}\n", c_name); + } + } else { + // default to generic name if column name doesn't exist + for (int i = 0; i < numColumns; ++i) { + headers_stream << fmt::format(" Column {}\n", i); + } + } + + os << fmt::format(HTML_TEMPLATE, headers_stream.str(), body_stream.str(), numColumns); + } + Schema DataSet::schema() const { if(!_operator) return Schema::UNKNOWN; diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index ede482d9c..4761ac7f0 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -79,6 +79,7 @@ namespace tuplex { py::object collect(); py::object take(const int64_t topLimit, const int64_t bottomLimit); void show(const int64_t numRows=-1); + std::string showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit); // DataFrame like operations PythonDataSet mapColumn(const std::string& column, const std::string& lambda_code, const std::string& pickled_code, const py::object& closure=py::object()); diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index 6b3683853..ab239a1a2 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -41,6 +41,7 @@ PYMODULE { py::class_(m, "_DataSet") .def("show", &tuplex::PythonDataSet::show) + .def("showHTMLPreview", &tuplex::PythonDataSet::showHTMLPreview) .def("collect", &tuplex::PythonDataSet::collect) .def("take", &tuplex::PythonDataSet::take) .def("map", &tuplex::PythonDataSet::map) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index f6079a143..1f543e5d2 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -880,6 +880,55 @@ namespace tuplex { } } + std::string PythonDataSet::showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit) { + // make sure a dataset is wrapped + assert(this->_dataset); + + // is callee error dataset? 
if so return list with error string + if (this->_dataset->isError()) { + auto errset = dynamic_cast(this->_dataset); + assert(errset); + return "Error: " + errset->getError(); + } else { + // release GIL & hand over everything to Tuplex + assert(PyGILState_Check()); // make sure this thread holds the GIL! + python::unlockGIL(); + + std::stringstream ss; + std::string err_message; + + size_t castedTopLimit = 0; + if (topLimit < 0) { + castedTopLimit = std::numeric_limits::max(); + } + + size_t castedBottomLimit = 0; + if (bottomLimit < 0) { + castedBottomLimit = std::numeric_limits::max(); + } + + try { + this->_dataset->showHTMLPreview(castedTopLimit, castedBottomLimit, ss); + } catch (const std::exception &e) { + err_message = e.what(); + Logger::instance().defaultLogger().error(err_message); + } catch (...) { + err_message = "unknown C++ exception occurred, please change type."; + Logger::instance().defaultLogger().error(err_message); + } + + // reacquire GIL + python::lockGIL(); + Logger::instance().flushToPython(); + + if (!ss.str().empty() && err_message.empty()) { + return ss.str(); + } else { + return "Error occurred: " + err_message; + } + } + } + PyObject* PythonDataSet::anyToCPythonWithPyObjects(ResultSet* rs, size_t maxRowCount) { assert(rs); diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 850b4ed83..4d02cf4d4 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -28,87 +28,8 @@ class DataSet: def __init__(self): self._dataSet = None - def getColumnSize(self): - data = self.collect() - if len(data) == 0: - return 0, 0 - else: - return len(data[0]) - - def revTake(self, nRows = 5): - return self.collect()[-nRows:] - def _repr_html_(self): - rows_list = self.take() - total_col_cnt = self.getColumnSize() - print('rowlist') - print(rows_list) - if len(rows_list) == 0: - header = '\n' - rows = '\n' - else: - header = '\n' - - if self.columns != None: - for x in self.columns: - header += f' {x}\n' - else: - for i in range(len(rows_list[0])): - header += f' column {i + 1}\n' - - rows = '' - for i, r in enumerate(rows_list): - rows += ' \n' - rows += f' {i}\n' - for data in r: - rows += f' {data}\n' - rows += ' \n' - - # add the ... - rows += ' \n' - rows += ' ...\n' - for i in range(total_col_cnt): - rows += ' ...\n' - rows += ' \n' - - lastData = self.revTake() - for i, r in enumerate(lastData): - rows += ' \n' - rows += f' {0 - len(lastData) + i}\n' - for data in r: - rows += f' {data}\n' - rows += ' \n' - - html_template = ( - '
\n' - '\n' - '\n' - ' \n' - ' \n' - f'{header}' - ' \n' - ' \n' - ' \n' - f'{rows}' - ' \n' - '
\n' - f'

{total_col_cnt} columns

\n' - '
' - ) - - return html_template + return self._dataSet.showHTMLPreview() def unique(self): """ removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement. @@ -201,11 +122,14 @@ def take(self, limitTop=5, limitBottom=0): (list): A list of tuples """ + assert limitTop is None or isinstance(limitTop, int), 'num rows must be an integer or None' + assert limitBottom is None or isinstance(limitBottom, int), 'num bottom last must be an integer or None' - assert isinstance(limitTop, int), 'num rows must be an integer' - assert limitTop > 0, 'please specify a number greater than zero' - assert isinstance(limitBottom, int), 'num bottom last must be an integer' - assert limitBottom >= 0, 'please specify a number greater or equal to zero' + if limitTop is None or limitTop < 0: + limitTop = -1 + + if limitBottom is None or limitBottom < 0: + limitBottom = -1 assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' @@ -227,6 +151,26 @@ def show(self, nrows=None): self._dataSet.show(nrows) + def showHTMLPreview(self, topLimit=5, bottomLimit=5): + """ action that generates a physical plan, processes data and return a subset of results as nicely formatted + HTML table to stdout. + + Args: + topLimit (int): number of top rows to collect. If ``None`` all rows will be collected + bottomLimit (int): number of bottom rows to collect. If ``None`` all rows will be collected + + Returns: + string: an HTML table showing a preview of the data + """ + assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + + if topLimit is None or topLimit < 0: + topLimit = -1 + if bottomLimit is None or bottomLimit < 0: + bottomLimit = -1 + + return self._dataSet.showHTMLPreview(topLimit, bottomLimit) + def resolve(self, eclass, ftor): """ Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. From 3658d84f0a20ff46f9b8bc81f1ba3c405638136a Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 8 Apr 2022 01:24:03 -0400 Subject: [PATCH 39/56] Address PR comments Add two more testcases Address PR feedbacks --- tuplex/core/include/DataSet.h | 35 +++- tuplex/core/include/EmptyDataset.h | 14 +- tuplex/core/include/LocalEngine.h | 59 ++++-- tuplex/core/include/ee/IBackend.h | 15 +- tuplex/core/include/ee/local/LocalBackend.h | 36 +++- tuplex/core/include/physical/TransformStage.h | 6 +- tuplex/core/include/physical/TransformTask.h | 41 +++- tuplex/core/src/DataSet.cc | 1 - tuplex/core/src/LocalEngine.cc | 38 +++- tuplex/core/src/ee/local/LocalBackend.cc | 72 ++++--- tuplex/core/src/physical/PhysicalPlan.cc | 2 +- tuplex/core/src/physical/ResultSet.cc | 12 +- tuplex/core/src/physical/TransformStage.cc | 9 +- tuplex/core/src/physical/TransformTask.cc | 86 ++++---- tuplex/python/tuplex/dataset.py | 128 +++++++++++- tuplex/test/core/ContextBasics.cc | 56 +++++- tuplex/test/core/ResultSetTest.cc | 5 +- tuplex/test/core/TakeTest.cc | 185 +++++++++++++++--- 18 files changed, 643 insertions(+), 157 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 9510427e2..3a5f450ac 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -125,7 +125,7 @@ namespace tuplex { /*! * action that displays tuples as nicely formatted table - * @param numRows how many rows to print, i.e. top numRows are printed.xs + * @param numRows how many rows to print, i.e. 
top numRows are printed.xs, -1 means print all rows * @param os ostream where to print table to */ virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); @@ -260,21 +260,48 @@ namespace tuplex { * @param memoryLayout * @return */ - virtual DataSet& cache(const Schema::MemoryLayout& memoryLayout, bool storeSpecialized); - DataSet& cache(bool storeSpecialized=true) { return cache(Schema::MemoryLayout::ROW, storeSpecialized); } + virtual DataSet &cache(const Schema::MemoryLayout &memoryLayout, bool storeSpecialized); + + DataSet &cache(bool storeSpecialized = true) { return cache(Schema::MemoryLayout::ROW, storeSpecialized); } /*! * helper setter without checks, to update internal column names. */ void setColumns(const std::vector &columnNames) { _columnNames = columnNames; } - // these are actions that cause execution + /*! + * Execute the pipeline and return all outputs + * @param os the logging output + * @return the output of the execution + */ virtual std::shared_ptr collect(std::ostream &os = std::cout); + /*! + * Execute the pipeline and take a subset of the output from the top and bottom rows. + * If both top and bottom rows limit exist, then the top and bottom rows will be concatenated. + * In the case where topLimit + bottomLimit exceeds the output size, all rows will be taken. + * To take all rows, pass in either topLimit=size_t::max(), bottomLimit=size_t::max(), or both. + * @param topLimit number of top rows to take. size_t::max() means taking all rows + * @param bottomLimit number of bottom rows to take. size_t::max() means taking all rows + * @param os the logging output + * @return result of the execution, trim to the size of top and bottom limit. + */ virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); + /*! + * Execute the pipeline and return all outputs as vector + * @param os the logging output + * @return the output of the execution in vector + */ virtual std::vector collectAsVector(std::ostream &os = std::cout); + /*! + * Execute the pipeline and take a subset of the output from the top rows, return as vector + * In the case where numElements exceeds the output size, all rows will be taken. + * @param numElements number of top rows to take. size_t::max() means taking all rows + * @param os the logging output + * @return result of the execution in vector, trim to the size of numElements + */ virtual std::vector takeAsVector(size_t numElements, std::ostream &os = std::cout); /*! 
diff --git a/tuplex/core/include/EmptyDataset.h b/tuplex/core/include/EmptyDataset.h index 6fc3219a4..585b70881 100644 --- a/tuplex/core/include/EmptyDataset.h +++ b/tuplex/core/include/EmptyDataset.h @@ -67,18 +67,20 @@ namespace tuplex { virtual DataSet& aggregateByKey(const UDF& aggCombine, const UDF& aggUDF, const Row& aggInitial, const std::vector &keyColumns) override { return *this; } //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; - virtual std::shared_ptr collect(std::ostream& os) override; + virtual std::shared_ptr collect(std::ostream &os) override; // take / collect will print out the error only - virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream& os) override; + virtual std::shared_ptr take(size_t topLimit, size_t bottomLimit, std::ostream &os) override; //virtual void show(const int64_t numRows=-1, std::ostream& os=std::cout) override; - virtual std::vector collectAsVector(std::ostream& os) override; + virtual std::vector collectAsVector(std::ostream &os) override; - // take / collect will print out the error only - virtual std::vector takeAsVector(size_t numElements, std::ostream& os) override; + /*! + * take / collect will print out the error only, return empty rows + */ + virtual std::vector takeAsVector(size_t numElements, std::ostream &os) override; - DataSet& cache(const Schema::MemoryLayout& memoryLayout, bool storeSpecialized) override { + DataSet &cache(const Schema::MemoryLayout &memoryLayout, bool storeSpecialized) override { return *this; } }; diff --git a/tuplex/core/include/LocalEngine.h b/tuplex/core/include/LocalEngine.h index 66ed3a1e8..740a40b4d 100644 --- a/tuplex/core/include/LocalEngine.h +++ b/tuplex/core/include/LocalEngine.h @@ -16,7 +16,28 @@ #include #include "RESTInterface.h" + namespace tuplex { + struct ExecutorConfig { + size_t _size; // size in bytes that each executor should have + size_t _blockSize; // size of individual blocks used (can be used for coarse or fine grained parallelism) + size_t _runTimeMemory; + size_t _runTimeMemoryDefaultBlockSize; + URI _cache_path; + + bool operator==(const ExecutorConfig &rhs) const { + return _size == rhs._size && + _blockSize == rhs._blockSize && + _runTimeMemory == rhs._runTimeMemory && + _runTimeMemoryDefaultBlockSize == rhs._runTimeMemoryDefaultBlockSize && + _cache_path == rhs._cache_path; + } + + bool operator!=(const ExecutorConfig &rhs) const { + return !(rhs == *this); + } + }; + /*! * local execution engine. Provides local executors for a context * THIS IS NOT THREADSAFE. Should be only accessed by driver thread. @@ -25,16 +46,18 @@ namespace tuplex { private: // non-detached executor that serves as the driver - std::unique_ptr _driver; + std::shared_ptr _driver; + ExecutorConfig _driver_cfg; std::vector> _executors; - std::map _refCounts; //! reference counts for each executor + std::map _refCounts; //! 
reference counts for each executor + + LocalEngine(const LocalEngine &); - LocalEngine(const LocalEngine&); - void operator = (const LocalEngine&); + void operator=(const LocalEngine &); // The local task queue - WorkQueue _queue; + WorkQueue _queue; protected: LocalEngine(); @@ -63,25 +86,25 @@ namespace tuplex { * @param cache_path directory where subfolders will be created for all executors to be started * @return array of executor references */ - std::vector getExecutors(const size_t num, - const size_t size, - const size_t blockSize, - const size_t runTimeMemory, - const size_t runTimeMemoryDefaultBlockSize, - const URI& cache_path); + std::vector getExecutors(const size_t num, + const size_t size, + const size_t blockSize, + const size_t runTimeMemory, + const size_t runTimeMemoryDefaultBlockSize, + const URI &cache_path); /*! * releases executors (invoked by context) * @param executors * @param ctx */ - void freeExecutors(const std::vector& executors, const Context* ctx=nullptr); + void freeExecutors(const std::vector &executors, const Context *ctx = nullptr); - Executor* getDriver(const size_t size, - const size_t blockSize, - const size_t runTimeMemory, - const size_t runTimeMemoryDefaultBlockSize, - const URI& cache_path); + std::shared_ptr getDriver(const size_t size, + const size_t blockSize, + const size_t runTimeMemory, + const size_t runTimeMemoryDefaultBlockSize, + const URI &cache_path); void release(); @@ -89,7 +112,7 @@ namespace tuplex { * retrieves the global work queue for local executors * @return */ - WorkQueue& getQueue() { return _queue; } + WorkQueue &getQueue() { return _queue; } }; } #endif //TUPLEX_LOCALENGINE_H \ No newline at end of file diff --git a/tuplex/core/include/ee/IBackend.h b/tuplex/core/include/ee/IBackend.h index e7a80e5bb..1a543df8f 100644 --- a/tuplex/core/include/ee/IBackend.h +++ b/tuplex/core/include/ee/IBackend.h @@ -29,19 +29,22 @@ namespace tuplex { class IBackend { public: IBackend() = delete; - IBackend(const IBackend& other) = delete; - IBackend(const Context& context) : _context(context) {} + + IBackend(const IBackend &other) = delete; + + IBackend(const Context &context) : _context(context) {} // driver, i.e. where to store local data. - virtual Executor* driver() = 0; - virtual void execute(PhysicalStage* stage) = 0; + virtual Executor *driver() = 0; + + virtual void execute(PhysicalStage *stage) = 0; virtual ~IBackend() {} // virtual destructor needed b.c. of smart pointers - virtual const Context& context() const { return _context; } + virtual const Context &context() const { return _context; } private: - const Context& _context; + const Context &_context; }; inline std::unordered_map, size_t> merge_ecounts(std::unordered_map, size_t> lhs, diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index d7a5ec25b..3d73a5d9f 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -40,14 +40,15 @@ namespace tuplex { * constructor for convenience * @param context */ - explicit LocalBackend(const Context& context); + explicit LocalBackend(const Context &context); - Executor* driver() override; // for local execution + Executor *driver() override; // for local execution + + void execute(PhysicalStage *stage) override; - void execute(PhysicalStage* stage) override; private: - Executor *_driver; //! driver from local backend... - std::vector _executors; //! drivers to be used + std::shared_ptr _driver; //! driver from local backend... 
+ std::vector _executors; //! drivers to be used std::unique_ptr _compiler; HistoryServerConnection _historyConn; @@ -88,9 +89,6 @@ namespace tuplex { MessageHandler& logger() const { return Logger::instance().logger("local ee"); } - void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, TransformStage* tstage); - Partition* newPartitionWithSkipRows(Partition* p_in, size_t numToSkip, TransformStage* tstage); - // write output (may be already in correct format!) void writeOutput(TransformStage* tstage, std::vector& sortedTasks); @@ -187,6 +185,28 @@ namespace tuplex { * @return */ extern URI outputURI(const UDF& udf, const URI& baseURI, int64_t partNo, FileFormat fmt); + + /*! + * Trim list of partitions so that it includes up to the first n rows and the last m rows + * if n + m > number of rows in input partitions, the partitions will remain unchanged + * @param partitions [in,out] the list of partitions to trim + * @param topLimit n, the number of top rows to include + * @param bottomLimit m, the number of bottom rows to include + * @param tstage pointer to transform stage, might be used to generate new partition + * @param exec pointer to executor, might be used to allocate new partition + */ + extern void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, + TransformStage *tstage, Executor *exec); + + /*! + * Create a newly allocated partition with the same data as the specified partition, but with the first n rows removed + * @param p_in the input partition + * @param numToSkip number of rows to remove from the new partition + * @param tstage pointer to transform stage, used to generate new partition + * @param exec pointer to executor, used to allocate new partition + * @return the new partition + */ + extern Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec); } #endif //TUPLEX_LOCALBACKEND_H \ No newline at end of file diff --git a/tuplex/core/include/physical/TransformStage.h b/tuplex/core/include/physical/TransformStage.h index e1e45c97b..ff4ece1dd 100644 --- a/tuplex/core/include/physical/TransformStage.h +++ b/tuplex/core/include/physical/TransformStage.h @@ -107,9 +107,9 @@ namespace tuplex { std::unordered_map partitionToExceptionsMap() { return _partitionToExceptionsMap; } /*! - * sets maximum number of top rows this pipeline will produce - * @param topLimit - * @param bottomLimit + * sets maximum number of rows this pipeline will produce + * @param topLimit number of top rows to produce, 0 means none, and size_t::max means everything + * @param bottomLimit number of bottom rows to produce, 0 means none, and size_t::max means everything */ inline void setOutputLimit(size_t topLimit, size_t bottomLimit) { _outputTopLimit = topLimit; diff --git a/tuplex/core/include/physical/TransformTask.h b/tuplex/core/include/physical/TransformTask.h index e2b8bc5b6..8ac5ba6df 100644 --- a/tuplex/core/include/physical/TransformTask.h +++ b/tuplex/core/include/physical/TransformTask.h @@ -180,15 +180,32 @@ namespace tuplex { void setOutputPrefix(const char* buf, size_t bufSize); // extra prefix to write first to output. void sinkOutputToHashTable(HashTableFormat fmt, int64_t outputDataSetID); + HashTableSink hashTableSink() const { return _htable; } // needs to be freed manually! 
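        // [Editorial sketch, not part of the original patch] How the limit API below is
        // meant to be driven by the stage executor; the names match this patch, while the
        // surrounding wiring is elided:
        //
        //     // once per stage, before any task runs:
        //     TransformTask::setMaxOrderAndResetLimits(tasks.size() - 1);
        //     // on every task, e.g. for DataSet.take(5, 5):
        //     task->setOutputTopLimit(5);
        //     task->setOutputBottomLimit(5);
        //     // tasks then short-circuit via limitReached() once both ends are covered.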
-        void setOutputTopLimit(size_t limit) { _outTopLimit = limit; resetOutputLimitCounter(); }
-        void setOutputBottomLimit(size_t limit) { _outBottomLimit = limit; resetOutputLimitCounter(); }
+        void setOutputTopLimit(size_t limit) {
+            _outTopLimit = limit;
+        }
+
+        void setOutputBottomLimit(size_t limit) {
+            _outBottomLimit = limit;
+        }
+
+        /*!
+         * Set the maximum task order number that the current stage executes and reset the row counter.
+         * This is used to detect that the row limits have been reached and to stop execution early.
+         * @param maxOrder maximum task order number in the pipeline; infinity (the default) disables the bottomLimit short circuit
+         */
+        static void setMaxOrderAndResetLimits(size_t maxOrder = std::numeric_limits<size_t>::max());
+
         void execute() override;
 
         bool hasFileSink() const { return _outputFilePath != URI::INVALID; }
+
         bool hasFileSource() const { return _inputFilePath != URI::INVALID; }
+
         bool hasMemorySink() const { return _outputSchema != Schema::UNKNOWN; }
+
         bool hasMemorySource() const { return !_inputPartitions.empty(); }
 
         bool hasHashTableSink() const { return _htableFormat != HashTableFormat::UNKNOWN; }
         HashTableFormat hashTableFormat() const { return _htableFormat; }
@@ -207,8 +224,6 @@ namespace tuplex {
         static codegen::i64_hash_row_f writeInt64HashTableAggregateCallback();
         static codegen::write_row_f aggCombineCallback();
 
-        static void resetOutputLimitCounter();
-
         // must be public because of C++ issues -.-
         int64_t writeRowToMemory(uint8_t* buf, int64_t bufSize);
         int64_t writeRowToFile(uint8_t* buf, int64_t bufSize);
@@ -310,12 +325,26 @@ namespace tuplex {
         inline int64_t contextID() const { return _contextID; }
 
         inline void unlockAllMemorySinks() {
             // output partition existing? if so unlock
-            _output.unlock();
-            _exceptions.unlock();
+            _output.unlock();
+            _exceptions.unlock();
         }
 
+        /*!
+         * Check whether the stage has reached both the top and the bottom limit. To use this, one must call
+         * setMaxOrderAndResetLimits before execution and set both the top and the bottom limit.
+         * @return true if both limits are reached
+         */
+        bool limitReached() const;
+
+        /*!
+         * Update the global stage limit counter; should only be called once, at the end of a task
+         */
+        void updateLimits();
+
         void processMemorySourceWithExp();
+
         void processMemorySource();
+
         void processFileSource();
 
         // exceptions
diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc
index 210b3ec60..b62946ae4 100644
--- a/tuplex/core/src/DataSet.cc
+++ b/tuplex/core/src/DataSet.cc
@@ -849,7 +849,6 @@ namespace tuplex {
         assert(numColumns != 0);
 
         // construct headers
-        std::vector<std::string> headers(numColumns);
         if (!_columnNames.empty()) {
             assert(numColumns == _columnNames.size());
             for (auto &c_name: _columnNames) {
diff --git a/tuplex/core/src/LocalEngine.cc b/tuplex/core/src/LocalEngine.cc
index 02c060a90..c9c6d506b 100644
--- a/tuplex/core/src/LocalEngine.cc
+++ b/tuplex/core/src/LocalEngine.cc
@@ -98,7 +98,8 @@ namespace tuplex {
             exec->processQueue(true);
 
             std::stringstream ss;
-            ss<<"started local executor "<<exec->name()<<" ("<<sizeToMemString(size)<<")";
+            ss << "started local executor " << exec->name() << " (" << sizeToMemString(size) << ", "
+               << sizeToMemString(blockSize) << " default partition size)";
             logger.info(ss.str());
         }
 
@@ -107,23 +108,44 @@ namespace tuplex {
         return execs;
     }
 
-    Executor* LocalEngine::getDriver(const size_t size, const size_t blockSize, const size_t runTimeMemory,
-                                     const size_t runTimeMemoryDefaultBlockSize, const tuplex::URI &cache_path) {
-        // lazy start driver
-        if(!_driver) {
+    std::shared_ptr<Executor>
+    LocalEngine::getDriver(const size_t size, const size_t blockSize, const size_t runTimeMemory,
+                           const size_t runTimeMemoryDefaultBlockSize, const tuplex::URI &cache_path) {
+        ExecutorConfig new_cfg = ExecutorConfig{
+                ._size = size,
+                ._blockSize = blockSize,
+                ._runTimeMemory = runTimeMemory,
+                ._runTimeMemoryDefaultBlockSize = runTimeMemoryDefaultBlockSize,
+                ._cache_path = cache_path
+        };
+
+        if (!_driver || _driver_cfg != new_cfg) {
+            if (_driver) {
+                Logger::instance().logger("local execution engine").info(
+                        "driver already exist, starting new driver with updated config");
+                _driver->release(); // TODO(march): test whether we need this
+            }
+
+            // lazy start driver
             URI uri = URI(cache_path.toString() + "/" + "driver");
-            _driver = std::make_unique<Executor>(size, blockSize, runTimeMemory, runTimeMemoryDefaultBlockSize, uri, "driver");
+            _driver = std::make_shared<Executor>(size, blockSize, runTimeMemory, runTimeMemoryDefaultBlockSize, uri,
+                                                 "driver");
+            _driver_cfg = new_cfg;
 
+            // TODO(march): this could be a problem, if multiple driver with number = 0
+            // TODO(march): write a test for two drivers existing together (thread number 0)
+            // TODO(march): make a comment about potential issue here
             // driver always has thread number 0!
             _driver->setThreadNumber(0);
 
             std::stringstream ss;
-            ss<<"started driver ("<<sizeToMemString(size)<<")";
+            ss << "started driver (" << sizeToMemString(size) << ", "
+               << sizeToMemString(blockSize) << " default partition size)";
             Logger::instance().logger("local execution engine").info(ss.str());
         }
 
         return _driver;
     }
 
     void LocalEngine::freeExecutors(const std::vector<Executor *> &executors, const Context *ctx) {
diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc
index 0b8157ecc..351d55b88 100644
--- a/tuplex/core/src/ee/local/LocalBackend.cc
+++ b/tuplex/core/src/ee/local/LocalBackend.cc
@@ -116,14 +116,14 @@ namespace tuplex {
     }
 
     Executor *LocalBackend::driver() {
-        assert(_driver);
-        return _driver;
+        assert(_driver);
+        return _driver.get();
     }
 
     void LocalBackend::execute(tuplex::PhysicalStage *stage) {
         assert(stage);
 
-        if(!stage)
+        if (!stage)
             return;
 
         // history server connection should be established
@@ -651,7 +651,6 @@ namespace tuplex {
         // --> issue for each memory partition a transform task and put it into local workqueue
         assert(tstage->inputMode() == EndPointMode::MEMORY);
 
-        // restrict after input limit
         size_t numInputRows = 0;
         auto inputPartitions = tstage->inputPartitions();
@@ -702,9 +701,17 @@ namespace tuplex {
             tasks[i]->setOrder(i);
         }
 
+        TransformTask::setMaxOrderAndResetLimits(tasks.size() - 1);
+
         if (tstage->hasOutputLimit()) {
+            // There are 3 possible cases here:
+            // 1. both top and bottom limit
+            // 2. only top limit
+            // 3. only bottom limit
             if (tstage->outputTopLimit() > 0 && tstage->outputBottomLimit() > 0) {
-                // do task striping for output limit on both ends
+                // case 1: do task striping for output limit on both ends.
+                // We execute in striping order instead of ascending or descending order;
+                // this is an optimization for small limits, to avoid executing all partitions.
                 vector<IExecutorTask*> newTasks;
                 for(size_t i = 0; i < tasks.size() - i; i++) {
                     const size_t rev_i = tasks.size() - 1 - i;
                     newTasks.push_back(tasks[i]);
                     if (i != rev_i)
                         newTasks.push_back(tasks[rev_i]);
                 }
 
                 assert(tasks.size() == newTasks.size());
                 tasks.swap(newTasks);
             } else if (tstage->outputBottomLimit() > 0) {
-                // bottom limit only, just reverse the task order
+                // case 3: bottom limit only, just reverse the task order.
+                // We execute the last partitions first, since the top rows are not needed,
+                // which speeds up execution.
                 std::reverse(tasks.begin(), tasks.end());
             }
-            // if top limit only, do nothing since the order is already good
+            // case 2: top limit only: do nothing, since the order is already good
+            // (the tasks are generated in ascending order)
         }
 
         return tasks;
@@ -887,7 +897,8 @@ namespace tuplex {
                 auto output_par = tstage->inputPartitions();
 
                 if (tstage->hasOutputLimit()) {
-                    trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage);
+                    trimPartitionsToLimit(output_par, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage,
+                                          _driver.get());
                 }
                 tstage->setMemoryResult(output_par, std::vector{}, std::unordered_map(), pyObjects);
                 pyObjects.clear();
@@ -971,7 +982,6 @@ namespace tuplex {
         }
 
         auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms);
-
         auto completedTasks = performTasks(tasks);
 
         // Note: this doesn't work yet because of the globals.
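// [Editorial note, not part of the original patch] Worked example of the task striping
// introduced in the hunk above, assuming five tasks with orders 0..4: each iteration
// emits one task from the front and one from the back, so the execution order becomes
// 0, 4, 1, 3, 2. A self-contained sketch of the same reordering:
//
//     std::vector<int> order{0, 1, 2, 3, 4};
//     std::vector<int> striped;
//     for (size_t i = 0; i < order.size() - i; ++i) {
//         striped.push_back(order[i]);            // next task from the front
//         const size_t rev_i = order.size() - 1 - i;
//         if (rev_i != i)
//             striped.push_back(order[rev_i]);    // next task from the back
//     }
//     // striped == {0, 4, 1, 3, 2}: front tasks satisfy the top limit early, back
//     // tasks satisfy the bottom limit early, so small limits rarely require
//     // running the middle partitions at all.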
@@ -1205,7 +1215,9 @@ namespace tuplex { } if (tstage->hasOutputLimit()) { - trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage); + // the function expect the output to be sorted in ascending order (guaranteed by sortTasks()) + trimPartitionsToLimit(output, tstage->outputTopLimit(), tstage->outputBottomLimit(), tstage, + _driver.get()); } tstage->setMemoryResult(output, generalOutput, partitionToExceptionsMap, nonConformingRows, remainingExceptions, ecounts); @@ -1565,12 +1577,6 @@ namespace tuplex { logger().debug("task without order found, please fix in code."); } #endif - - for (int i = 0; i < tasks.size(); i++) { - // take limit only work with uniform order - assert(tasks[i]->getOrder(0) == i); - } - // add all tasks to queue for(auto& task : tasks) wq.addTask(task); @@ -2125,17 +2131,18 @@ namespace tuplex { tstage->setFileResult(ecounts); } - void LocalBackend::trimPartitionsToLimit(std::vector &partitions, + void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, - TransformStage* tstage) { + TransformStage* tstage, + Executor *exec) { std::vector limitedPartitions, limitedTailPartitions; // check top output limit, adjust partitions if necessary size_t numTopOutputRows = 0; - Partition* lastTopPart = nullptr; + Partition *lastTopPart = nullptr; size_t clippedTop = 0; - for (auto partition : partitions) { + for (auto partition: partitions) { numTopOutputRows += partition->getNumRows(); lastTopPart = partition; if (numTopOutputRows >= topLimit) { @@ -2174,7 +2181,8 @@ namespace tuplex { auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); assert(clipped <= partition->getNumRows()); if (clipped > 0) { - Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage); + Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage, + exec); assert(newPart->getNumRows() == clipped); limitedTailPartitions.push_back(newPart); } @@ -2191,10 +2199,11 @@ namespace tuplex { assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); // split into two partitions with both top and bottom are in the same partition - Partition* lastBottomPart = nullptr; + Partition *lastBottomPart = nullptr; if (clippedBottom != 0) { - lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, tstage); + lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, + tstage, exec); } if (clippedTop != 0) { @@ -2215,27 +2224,28 @@ namespace tuplex { partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend()); } - Partition* LocalBackend::newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage* tstage) { + Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec) { auto ptr = p_in->lockRaw(); - auto num_rows = *((int64_t*) ptr); + auto num_rows = *((int64_t *) ptr); assert(numToSkip < num_rows); ptr += sizeof(int64_t); size_t numBytesToSkip = 0; - for(unsigned i = 0; i < numToSkip; ++i) { - Row r = Row::fromMemory(tstage->outputSchema(), ptr, p_in->capacity() - numBytesToSkip); + Deserializer ds(tstage->outputSchema()); + for (unsigned i = 0; i < numToSkip; ++i) { + Row r = Row::fromMemory(ds, ptr, p_in->capacity() - numBytesToSkip); ptr += r.serializedLength(); numBytesToSkip += r.serializedLength(); } - Partition *p_out = 
_driver->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), - tstage->outputSchema(), tstage->outputDataSetID(), - tstage->context().id()); + Partition *p_out = exec->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), + tstage->outputSchema(), tstage->outputDataSetID(), + tstage->context().id()); assert(p_out->capacity() >= p_in->size() - numBytesToSkip); auto ptr_out = p_out->lockRaw(); - *((int64_t*) ptr_out) = p_in->getNumRows() - numToSkip; + *((int64_t *) ptr_out) = p_in->getNumRows() - numToSkip; ptr_out += sizeof(int64_t); memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); p_out->unlock(); diff --git a/tuplex/core/src/physical/PhysicalPlan.cc b/tuplex/core/src/physical/PhysicalPlan.cc index f289064d5..e88189447 100644 --- a/tuplex/core/src/physical/PhysicalPlan.cc +++ b/tuplex/core/src/physical/PhysicalPlan.cc @@ -240,7 +240,7 @@ namespace tuplex { // user wants to merge exceptions in order. bool updateInputExceptions = hasFilter && hasInputExceptions && _context.getOptions().OPT_MERGE_EXCEPTIONS_INORDER(); - // create transfrom stage via builder pattern + // create transform stage via builder pattern auto builder = codegen::StageBuilder(_num_stages++, isRootStage, _context.getOptions().UNDEFINED_BEHAVIOR_FOR_OPERATORS(), diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index bfd656dc8..0eb6d95ad 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -127,15 +127,19 @@ namespace tuplex { return vector{}; Deserializer ds(_schema); - for(int i = 0; i < limit;) { + for (size_t i = 0; i < limit;) { // all exhausted - if(_partitions.empty()) + if (_partitions.empty()) break; // get number of rows in first partition Partition *first = _partitions.front(); auto num_rows = first->getNumRows(); + + assert(num_rows >= _curRowCounter); + assert(limit >= i); + // how many left to retrieve? auto num_to_retrieve_from_partition = std::min(limit - i, num_rows - _curRowCounter); assert(num_to_retrieve_from_partition >= 0); @@ -145,8 +149,8 @@ namespace tuplex { // thread safe version (slow) // get next element of partition - const uint8_t* ptr = first->lock(); - for(int j = 0; j < num_to_retrieve_from_partition; ++j) { + const uint8_t *ptr = first->lock(); + for (size_t j = 0; j < num_to_retrieve_from_partition; ++j) { auto row = Row::fromMemory(ds, ptr + _byteCounter, first->capacity() - _byteCounter); _byteCounter += row.serializedLength(); _curRowCounter++; diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index 060365697..b0e2e70ab 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -139,10 +139,17 @@ namespace tuplex { } } + size_t maxRows; + if (hasOutputLimit()) { + maxRows = outputTopLimit() + outputBottomLimit(); + } else { + maxRows = std::numeric_limits::max(); + } + // put ALL partitions to result set _rs = std::make_shared(schema, partitions, generalCase, partitionToExceptionsMap, interpreterRows, - outputTopLimit() + outputBottomLimit()); + maxRows); } } diff --git a/tuplex/core/src/physical/TransformTask.cc b/tuplex/core/src/physical/TransformTask.cc index 377385deb..2de71e4fe 100644 --- a/tuplex/core/src/physical/TransformTask.cc +++ b/tuplex/core/src/physical/TransformTask.cc @@ -18,18 +18,22 @@ #include namespace tuplex { - // atomic var to count output rows! 
-    static std::atomic_int64_t g_totalTopOutputRows;
-    static std::atomic_int64_t g_totalBottomOutputRows;
+    // This is the logic to stop execution once both the topLimit and the bottomLimit have been reached.
+    // Here, we assume that task orders start at zero and count up by 1, e.g. 0, 1, 2, ..., n.
+    // To implement the limit, we maintain a mapping from task order to the number of rows done in that task
+    // (an entry appears once a task has finished processing and holds its number of output rows).
+    // We can then find out how many top rows are done by scanning g_rowsDone[0], g_rowsDone[1], ...
+    // until we reach an order with no entry yet, i.e. a task that has not finished.
+    // Likewise, we can find the bottom rows done by scanning g_rowsDone[g_maxOrder], g_rowsDone[g_maxOrder - 1], ...
 
     // mapping from order number -> row count if the task is finished
     static std::mutex g_rowsDoneMutex;
-    static std::map<size_t, size_t> g_rowsDone;
+    static std::unordered_map<size_t, size_t> g_rowsDone;
+    static std::atomic_size_t g_maxOrder;
 
-    void TransformTask::resetOutputLimitCounter() {
-        g_totalTopOutputRows = 0;
-        g_totalBottomOutputRows = 0;
+    void TransformTask::setMaxOrderAndResetLimits(size_t maxOrder) {
         g_rowsDone.clear();
+        g_maxOrder = maxOrder;
     }
 }
 
@@ -602,25 +606,16 @@ namespace tuplex {
 #endif
     }
 
-    void TransformTask::processMemorySource() {
-        assert(!_inputPartitions.empty());
-        assert(_functor);
-
-        _numInputRowsRead = 0;
-        _numOutputRowsWritten = 0;
-
-        int64_t num_normal_rows = 0, num_bad_rows = 0;
-
-        auto functor = reinterpret_cast(_functor);
+    bool TransformTask::limitReached() const {
+        size_t numTopCompleted = 0;
+        size_t numBottomCompleted = 0;
+        bool isTopLimitReached = false;
+        bool isBottomLimitReached = false;
 
-        // go over all input partitions.
-        for(const auto &inputPartition : _inputPartitions) {
-            size_t numTopCompleted = 0;
-            size_t numBottomCompleted = 0;
-            bool isTopLimitReached = false;
-            bool isBottomLimitReached = false;
-
-            tuplex::g_rowsDoneMutex.lock();
+        tuplex::g_rowsDoneMutex.lock();
+        if (_outTopLimit == 0) {
+            isTopLimitReached = true;
+        } else {
             for (size_t i = 0; tuplex::g_rowsDone.count(i) != 0; i++) {
                 numTopCompleted += tuplex::g_rowsDone[i];
                 if (numTopCompleted >= _outTopLimit) {
@@ -628,17 +623,44 @@ namespace tuplex {
                     isTopLimitReached = true;
                     break;
                 }
             }
-            // TODO: what is the max task number here
-            for (size_t i = 100; tuplex::g_rowsDone.count(i) != 0; i--) {
+        }
+
+        if (_outBottomLimit == 0) {
+            isBottomLimitReached = true;
+        } else {
+            for (size_t i = tuplex::g_maxOrder; tuplex::g_rowsDone.count(i) != 0; i--) {
                 numBottomCompleted += tuplex::g_rowsDone[i];
-                if (numBottomCompleted >= _outTopLimit) {
+                if (numBottomCompleted >= _outBottomLimit) {
                     isBottomLimitReached = true;
                     break;
                 }
             }
-            tuplex::g_rowsDoneMutex.unlock();
+        }
+        tuplex::g_rowsDoneMutex.unlock();
 
-            if (isTopLimitReached && isBottomLimitReached) {
+        return isTopLimitReached && isBottomLimitReached;
+    }
+
+    void TransformTask::updateLimits() {
+        tuplex::g_rowsDoneMutex.lock();
+        tuplex::g_rowsDone[getOrder(0)] += getNumOutputRows();
+        tuplex::g_rowsDoneMutex.unlock();
+    }
+
+    void TransformTask::processMemorySource() {
+        assert(!_inputPartitions.empty());
+        assert(_functor);
+
+        _numInputRowsRead = 0;
+        _numOutputRowsWritten = 0;
+
+        int64_t num_normal_rows = 0, num_bad_rows = 0;
+
+        auto functor = reinterpret_cast(_functor);
+
+        // go over all input partitions.
+ for(const auto &inputPartition : _inputPartitions) { + if (limitReached()) { // skip the execution, enough is done break; } @@ -665,9 +687,7 @@ namespace tuplex { if(_invalidateSourceAfterUse) inputPartition->invalidate(); - tuplex::g_rowsDoneMutex.lock(); - tuplex::g_rowsDone[getOrder(0)] += getNumOutputRows(); - tuplex::g_rowsDoneMutex.unlock(); + updateLimits(); } #ifndef NDEBUG diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 4d02cf4d4..7eda223a1 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -113,7 +113,8 @@ def collect(self): return self._dataSet.collect() def take(self, limitTop=5, limitBottom=0): - """ action that generates a physical plan, processes data and collects the top results then as list of tuples. + """ action that generates a physical plan, processes data and collects the top and bottom results + then as list of tuples. Args: limitTop (int): number of top rows to collect. Per default ``5``. @@ -135,6 +136,26 @@ def take(self, limitTop=5, limitBottom=0): return self._dataSet.take(limitTop, limitBottom) + def head(self, nrows): + """ action that generates a physical plan, processes data and collects the top results then as list of tuples. + + Args: + nrows (int): number of rows to collect. + Returns: + (list): A list of tuples + """ + return self.take(nrows, 0) + + def tail(self, nrows): + """ action that generates a physical plan, processes data and collects the bottom results then as list of tuples. + + Args: + nrows (int): number of rows to collect. + Returns: + (list): A list of tuples + """ + return self.take(0, nrows) + def show(self, nrows=None): """ action that generates a physical plan, processes data and prints results as nicely formatted ASCII table to stdout. @@ -151,6 +172,15 @@ def show(self, nrows=None): self._dataSet.show(nrows) + def _getHTMLRow(self, ind, row): + row_str = "" + row_str += " \n" + row_str += " {}\n".format(ind) + for col in row: + row_str += " {}\n".format(col) + row_str += " \n" + return row_str + def showHTMLPreview(self, topLimit=5, bottomLimit=5): """ action that generates a physical plan, processes data and return a subset of results as nicely formatted HTML table to stdout. @@ -162,14 +192,108 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): Returns: string: an HTML table showing a preview of the data """ + HTML_TEMPLATE = ( + "
\n" + "\n" + "\n" + " \n" + " \n" + "{}" + " \n" + " \n" + " \n" + "{}" + " \n" + "
\n"
+            "<p>\n"
+            "{} columns\n"
+            "</p>\n"
+            "</div>
") + assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + # TODO(march): edit this top/bottom limit if topLimit is None or topLimit < 0: topLimit = -1 if bottomLimit is None or bottomLimit < 0: bottomLimit = -1 - return self._dataSet.showHTMLPreview(topLimit, bottomLimit) + rows = self.take(topLimit, bottomLimit) + + if len(rows) == 0: + return HTML_TEMPLATE.format("\n", "\n") + + assert topLimit == -1 or bottomLimit == -1 or len(rows) <= topLimit + bottomLimit + + headers_str = "" + body = "" + num_columns = None + + # construct tables + if len(rows) < topLimit + bottomLimit: + # the data is small so we get everything (no need to render ...) + i = 0 + for r in rows: + if i == 0: + # we set num columns based on the first row + num_columns = r.getNumColumns() + body += self._getHTMLRow(i, r) + i += 1 + else: + # some data is not processed because of limiting + i = 0 + for r in rows: + if i >= topLimit: + break + if i == 0: + # we set num columns based on the first row + num_columns = r.getNumColumns() + + body += self._getHTMLRow(i, r) + i += 1 + + # add the ... + body += " \n" + body += " ...\n" + for _ in range(num_columns): + body += " ...\n" + body += " \n" + + for j in range(i, len(rows)): + body += self._getHTMLRow(i, rows[j]) + + assert num_columns is not None + + # construct headers + column_names = self._dataSet.columns() + if column_names is not None: + assert (num_columns == column_names.size()) + for c_name in column_names: + headers_str += " {}\n".format(c_name) + else: + # default to generic name if column name doesn't exist + for i in range(num_columns): + headers_str += " Column {}\n".format(i) + + return HTML_TEMPLATE.format(headers_str, body, num_columns) + + def _getConsoleRow(self, ind, row): + # TODO(march): (work on this) + pass + + def showConsolePreview(self, topLimit=5, bottomLimit=5): + # TODO(march): (work on this) + pass def resolve(self, eclass, ftor): """ Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. diff --git a/tuplex/test/core/ContextBasics.cc b/tuplex/test/core/ContextBasics.cc index fdbdd8d50..0be3c6030 100644 --- a/tuplex/test/core/ContextBasics.cc +++ b/tuplex/test/core/ContextBasics.cc @@ -136,4 +136,58 @@ TEST_F(ContextBasicsTest, JSON) { auto str = ContextOptions::defaults().asJSON(); EXPECT_GT(str.length(), 2); -} \ No newline at end of file +} + +TEST_F(ContextBasicsTest, twoContextTest) { + using namespace tuplex; + + python::initInterpreter(); + python::unlockGIL(); + + ContextOptions co = testOptions(); + co.set("tuplex.partitionSize", "100B"); + co.set("tuplex.executorMemory", "1MB"); + co.set("tuplex.scratchDir", scratchDir + "/context1"); + + // second context with different executor config, should cause the driver to split up + ContextOptions co2 = testOptions(); + co.set("tuplex.partitionSize", "100B"); + co2.set("tuplex.executorMemory", "2MB"); + co2.set("tuplex.scratchDir", scratchDir + "/context2"); + + Context c1(co); + Context c2(co2); + Row row1(Tuple(0), Tuple("hello")); + Row row2(Tuple(1), Tuple("this")); + Row row3(Tuple(2), Tuple("is")); + Row row4(Tuple(3), Tuple("a")); + Row row5(Tuple(4), Tuple("test")); + + for (int t = 0; t < 10; t++) { + auto ds1 = c1.parallelize({row1, row2, row3, row4, row5}) + .map(UDF("lambda x: x[1][0]")); // new code: string index operator! first to raise an exception! 
+ + auto ds2 = c2.parallelize({row1, row2, row3, row4, row5}) + .map(UDF("lambda x: x[1][0]")); // new code: string index operator! first to raise an exception! + + auto v1 = ds1.collectAsVector(); + auto v2 = ds2.collectAsVector(); + + std::vector ref{"hello", "this", "is", "a", "test"}; + + EXPECT_EQ(v1.size(), 5); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(v1[i].getString(0), ref[i]); + } + + EXPECT_EQ(v2.size(), 5); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(v2[i].getString(0), ref[i]); + } + } + + python::lockGIL(); + python::closeInterpreter(); +} + +// TODO(march): multiple context test \ No newline at end of file diff --git a/tuplex/test/core/ResultSetTest.cc b/tuplex/test/core/ResultSetTest.cc index 4acd38921..2ea273062 100644 --- a/tuplex/test/core/ResultSetTest.cc +++ b/tuplex/test/core/ResultSetTest.cc @@ -14,7 +14,7 @@ class ResultSetTest : public PyTest { protected: - tuplex::Executor *driver; + std::shared_ptr driver; tuplex::ContextOptions options; public: // init function @@ -45,7 +45,8 @@ class ResultSetTest : public PyTest { EXPECT_EQ(r.getRowType(), first_type); // now write via partition writer - tuplex::PartitionWriter pw(driver, Schema(Schema::MemoryLayout::ROW, first_type), 0, 0, options.PARTITION_SIZE()); + tuplex::PartitionWriter pw(driver.get(), Schema(Schema::MemoryLayout::ROW, first_type), 0, 0, + options.PARTITION_SIZE()); for(const auto& r : rows) pw.writeRow(r); return pw.getOutputPartitions(); diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 3990fcd07..86173e40b 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -16,7 +16,8 @@ using namespace tuplex; using namespace std; -class TakeTest : public PyTest {}; +class TakeTest : public PyTest { +}; /** * Randomly generate a vector of rows for testing @@ -37,15 +38,15 @@ vector generateTestData(size_t N, uint64_t seed) { return data; } -vector generateReferenceData(const vector& input, size_t topLimit, size_t bottomLimit) { +vector generateReferenceData(const vector &input, size_t topLimit, size_t bottomLimit) { vector output; - for(size_t i = 0; i < topLimit && i < input.size(); i++) { + for (size_t i = 0; i < topLimit && i < input.size(); i++) { output.push_back(input[i]); } size_t start_bottom = input.size() >= bottomLimit ? input.size() - bottomLimit : 0; start_bottom = max(topLimit, start_bottom); - for(size_t i = start_bottom; i < input.size(); i++) { + for (size_t i = start_bottom; i < input.size(); i++) { output.push_back(input[i]); } @@ -57,7 +58,7 @@ TEST_F(TakeTest, takeTopTest) { Context context(opt); auto rs = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 0); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 0); ASSERT_EQ(rs->rowCount(), 1); auto v = rs->getRows(1); @@ -65,7 +66,7 @@ TEST_F(TakeTest, takeTopTest) { EXPECT_EQ(v[0].getInt(0), 1); auto rs2 = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(3, 0); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(3, 0); ASSERT_EQ(rs2->rowCount(), 3); auto v2 = rs2->getRows(3); @@ -75,7 +76,8 @@ TEST_F(TakeTest, takeTopTest) { EXPECT_EQ(v2[2].getInt(0), 3); auto rs3 = context.parallelize( - {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(5, 0); + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! 
:)"), + Row("!")}).take(5, 0); ASSERT_EQ(rs3->rowCount(), 5); auto v3 = rs3->getRows(5); @@ -93,7 +95,7 @@ TEST_F(TakeTest, takeBottomTest) { Context context(opt); auto rs = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 1); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 1); ASSERT_EQ(rs->rowCount(), 1); auto v = rs->getRows(1); @@ -101,7 +103,7 @@ TEST_F(TakeTest, takeBottomTest) { EXPECT_EQ(v[0].getInt(0), 6); auto rs2 = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 3); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(0, 3); ASSERT_EQ(rs2->rowCount(), 3); auto v2 = rs2->getRows(3); @@ -111,7 +113,8 @@ TEST_F(TakeTest, takeBottomTest) { EXPECT_EQ(v2[2].getInt(0), 6); auto rs3 = context.parallelize( - {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(0, 5); + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), + Row("!")}).take(0, 5); ASSERT_EQ(rs3->rowCount(), 5); auto v3 = rs3->getRows(5); @@ -129,7 +132,7 @@ TEST_F(TakeTest, takeBothTest) { Context context(opt); auto rs = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 1); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(1, 1); ASSERT_EQ(rs->rowCount(), 2); auto v = rs->getRows(2); @@ -138,7 +141,7 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v[1].getInt(0), 6); auto rs2 = context.parallelize( - {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(2, 1); + {Row(1), Row(2), Row(3), Row(4), Row(5), Row(6)}).take(2, 1); ASSERT_EQ(rs2->rowCount(), 3); auto v2 = rs2->getRows(3); @@ -148,7 +151,8 @@ TEST_F(TakeTest, takeBothTest) { EXPECT_EQ(v2[2].getInt(0), 6); auto rs3 = context.parallelize( - {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! :)"), Row("!")}).take(2, 3); + {Row("hello"), Row("world"), Row("! :)"), Row("world"), Row("hello"), Row("!"), Row("! 
:)"), + Row("!")}).take(2, 3); ASSERT_EQ(rs3->rowCount(), 5); auto v3 = rs3->getRows(5); @@ -167,12 +171,12 @@ TEST_F(TakeTest, takeBigTest) { const std::vector limit_values{0, 1, 5, 11, 600, 10000}; const std::vector partition_sizes{"256B", "512KB", "1MB"}; - for(auto& part_size : partition_sizes) { + for (auto &part_size: partition_sizes) { auto opt = testOptions(); opt.set("tuplex.partitionSize", part_size); Context context(opt); - for(auto data_size : test_size) { + for (auto data_size: test_size) { for (auto top_limit: limit_values) { for (auto bottom_limit: limit_values) { std::cout << "testing with partition size:" << part_size << " data size:" @@ -195,12 +199,149 @@ TEST_F(TakeTest, takeBigTest) { } } -// TODO(march): with map, filter function -//TEST_F(TakeTest, takeMapFilterTest) { -// srand(4242); -//} +vector generateMapFilterReferenceData(const vector &input, size_t topLimit, size_t bottomLimit) { + if (input.empty()) { + return {}; + } + + assert(input[0].getNumColumns() == 3); + vector intermediate; + for (const Row &r: input) { + int64_t new_a = r.getInt(0) + r.getInt(1); + + if (new_a % 2 == 0) { + intermediate.emplace_back(new_a, r.getInt(2)); + } + } + + return generateReferenceData(intermediate, topLimit, bottomLimit); +} + +TEST_F(TakeTest, takeMapFilterTest) { + mt19937 data_seed_gen(56120); + + const std::vector test_size{1, 10, 100, 1001, 10001}; + const std::vector limit_values{0, 1, 5, 11, 600, 10000}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + + UDF map_udf("lambda a, b, c: ((a + b), c)"); + UDF filter_udf("lambda a, b: a % 2 == 0"); + + for (auto &part_size: partition_sizes) { + auto opt = testOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for (auto data_size: test_size) { + for (auto top_limit: limit_values) { + for (auto bottom_limit: limit_values) { + std::cout << "testing with partition size:" << part_size << " data size:" + << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + + auto data = generateTestData(data_size, data_seed_gen()); + auto ref_data = generateMapFilterReferenceData(data, top_limit, bottom_limit); + + auto ds = context.parallelize(data).map(map_udf).filter(filter_udf); + auto res = ds.take(top_limit, bottom_limit); + ASSERT_EQ(ref_data.size(), res->rowCount()); + for (Row &r: ref_data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } + } + } +} + +TEST_F(TakeTest, collectIdentityTest) { + mt19937 data_seed_gen(123454); + + const std::vector test_size{1, 10, 100, 1001, 10001}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + + for (auto &part_size: partition_sizes) { + auto opt = testOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for (auto data_size: test_size) { + auto data = generateTestData(data_size, data_seed_gen()); + auto res = context.parallelize(data).collect(); + ASSERT_EQ(data.size(), res->rowCount()); + for (Row &r: data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } +} + +TEST_F(TakeTest, fileInputTest) { + const std::vector test_size{1, 10, 100, 1001, 50001}; + const std::vector limit_values{0, 1, 5, 11, 600, 10000}; + const std::vector partition_sizes{"256B", "512KB", "1MB"}; + std::vector> expected_outputs; + + if (!boost::filesystem::exists(scratchDir)) { + boost::filesystem::create_directory(scratchDir); + } + + std::vector fileInputNames; + for (unsigned long N: test_size) { + 
std::vector ref_output; + // write temp file + auto fName = fmt::format("{}/{}-{}.csv", scratchDir, testName, N); + + FILE *fp = fopen(fName.c_str(), "w"); + ASSERT_TRUE(fp); + fprintf(fp, "colA,colStr,colB\n"); + for (int i = 0; i < N; ++i) { + fprintf(fp, "%d,\"hello%d\",%d\n", i, (i * 3) % 7, i % 15); + ref_output.emplace_back(i, fmt::format("hello{}", (i * 3) % 7), (i % 15) * (i % 15)); + } + fclose(fp); + + expected_outputs.push_back(std::move(ref_output)); + fileInputNames.push_back(fName); + } -// TODO(march): with file input -// context.csv("../resources/"); + ASSERT_TRUE(expected_outputs.size() == test_size.size()); + ASSERT_TRUE(fileInputNames.size() == test_size.size()); + + for (auto &part_size: partition_sizes) { + auto opt = microTestOptions(); + opt.set("tuplex.partitionSize", part_size); + Context context(opt); + + for (int t = 0; t < test_size.size(); t++) { + const size_t data_size = test_size[t]; + + for (auto top_limit: limit_values) { + for (auto bottom_limit: limit_values) { + std::cout << "file testing with partition size:" << part_size << " data size:" + << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + + auto ref_output = generateReferenceData(expected_outputs[t], top_limit, bottom_limit); + auto res = context.csv(testName + ".csv") + .mapColumn("colB", UDF("lambda x: x * x")) + .take(top_limit, bottom_limit); + + ASSERT_EQ(ref_output.size(), res->rowCount()); + for (Row &r: ref_output) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); + } + } + } + } + } + } +} -// TODO(march): collect operator \ No newline at end of file +// TODO(march): write test for trimPartitionsToLimit \ No newline at end of file From 72b6580c763f46e9867aa04ee8488fa9bd5400c3 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 15 Apr 2022 02:50:20 -0400 Subject: [PATCH 40/56] Add file testcases --- tuplex/core/src/LocalEngine.cc | 6 ++---- tuplex/test/core/ContextBasics.cc | 4 +--- tuplex/test/core/TakeTest.cc | 22 ++++++++++++---------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/tuplex/core/src/LocalEngine.cc b/tuplex/core/src/LocalEngine.cc index c9c6d506b..91892d44d 100644 --- a/tuplex/core/src/LocalEngine.cc +++ b/tuplex/core/src/LocalEngine.cc @@ -123,7 +123,6 @@ namespace tuplex { if (_driver) { Logger::instance().logger("local execution engine").info( "driver already exist, starting new driver with updated config"); - _driver->release(); // TODO(march): test whether we need this } // lazy start driver @@ -132,10 +131,9 @@ namespace tuplex { "driver"); _driver_cfg = new_cfg; - // TODO(march): this could be a problem, if multiple driver with number = 0 - // TODO(march): write a test for two drivers existing together (thread number 0) - // TODO(march): make a comment about potential issue here // driver always has thread number 0! + // Note: this could be a potential issue if the config change and the old driver is still running + // due to external reference. 
Then there could be two executors with the same number _driver->setThreadNumber(0); std::stringstream ss; diff --git a/tuplex/test/core/ContextBasics.cc b/tuplex/test/core/ContextBasics.cc index 0be3c6030..e85107b40 100644 --- a/tuplex/test/core/ContextBasics.cc +++ b/tuplex/test/core/ContextBasics.cc @@ -188,6 +188,4 @@ TEST_F(ContextBasicsTest, twoContextTest) { python::lockGIL(); python::closeInterpreter(); -} - -// TODO(march): multiple context test \ No newline at end of file +} \ No newline at end of file diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 86173e40b..eda609518 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -282,9 +282,9 @@ TEST_F(TakeTest, collectIdentityTest) { } TEST_F(TakeTest, fileInputTest) { - const std::vector test_size{1, 10, 100, 1001, 50001}; - const std::vector limit_values{0, 1, 5, 11, 600, 10000}; - const std::vector partition_sizes{"256B", "512KB", "1MB"}; + const std::vector test_size{1, 10, 1001, 50001}; + const std::vector limit_values{0, 1, 6, 600, 10000}; + const std::vector partition_sizes{"256B", "1MB"}; std::vector> expected_outputs; if (!boost::filesystem::exists(scratchDir)) { @@ -327,21 +327,23 @@ TEST_F(TakeTest, fileInputTest) { << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; auto ref_output = generateReferenceData(expected_outputs[t], top_limit, bottom_limit); - auto res = context.csv(testName + ".csv") + auto res = context.csv(fileInputNames[t]) .mapColumn("colB", UDF("lambda x: x * x")) .take(top_limit, bottom_limit); ASSERT_EQ(ref_output.size(), res->rowCount()); for (Row &r: ref_output) { Row res_row = res->getNextRow(); - if (!(res_row == r)) { - ASSERT_EQ(res_row, r); - } + ASSERT_EQ(res_row.getInt(0), r.getInt(0)); + ASSERT_EQ(res_row.getString(1), r.getString(1)); + ASSERT_EQ(res_row.getInt(2), r.getInt(2)); + // TODO(march): this doesn't work because schema are different (for some reason infer as opt[int]?) 
+ // if (!(res_row == r)) { + // ASSERT_EQ(res_row, r); + // } } } } } } -} - -// TODO(march): write test for trimPartitionsToLimit \ No newline at end of file +} \ No newline at end of file From 172d6b57690b2cb0b0dadc0c584178b1d1ce862f Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Tue, 19 Apr 2022 23:45:01 -0400 Subject: [PATCH 41/56] Python Dataset Debug --- tuplex/python/src/PythonDataSet.cc | 4 + tuplex/python/tuplex/dataset.py | 128 +++++++++++++++------ tuplex/python/tuplex/utils/table_format.py | 80 +++++++++++++ tuplex/test/core/TakeTest.cc | 8 +- 4 files changed, 178 insertions(+), 42 deletions(-) create mode 100644 tuplex/python/tuplex/utils/table_format.py diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 1f543e5d2..5382ad24d 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -133,11 +133,15 @@ namespace tuplex { size_t castedTopLimit = 0; if (topLimit < 0) { castedTopLimit = std::numeric_limits::max(); + } else { + castedTopLimit = topLimit; } size_t castedBottomLimit = 0; if (bottomLimit < 0) { castedBottomLimit = std::numeric_limits::max(); + } else { + castedBottomLimit = bottomLimit; } try { diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index 7eda223a1..c0b9ef4d0 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -19,6 +19,7 @@ from tuplex.utils.framework import UDFCodeExtractionError from tuplex.utils.source_vault import SourceVault from .exceptions import classToExceptionCode +import tuplex.utils.table_format as table_format # signed 64bit limit max_rows = 9223372036854775807 @@ -29,7 +30,10 @@ def __init__(self): self._dataSet = None def _repr_html_(self): - return self._dataSet.showHTMLPreview() + return self.showHTMLPreview() + + def __repr__(self): + return self.showStrPreview() def unique(self): """ removes duplicates from Dataset (out-of-order). Equivalent to a DISTINCT clause in a SQL-statement. @@ -172,15 +176,6 @@ def show(self, nrows=None): self._dataSet.show(nrows) - def _getHTMLRow(self, ind, row): - row_str = "" - row_str += " \n" - row_str += " {}\n".format(ind) - for col in row: - row_str += " {}\n".format(col) - row_str += " \n" - return row_str - def showHTMLPreview(self, topLimit=5, bottomLimit=5): """ action that generates a physical plan, processes data and return a subset of results as nicely formatted HTML table to stdout. @@ -195,17 +190,17 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): HTML_TEMPLATE = ( "
\n" "\n" "\n" " \n" @@ -222,16 +217,10 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' - # TODO(march): edit this top/bottom limit - if topLimit is None or topLimit < 0: - topLimit = -1 - if bottomLimit is None or bottomLimit < 0: - bottomLimit = -1 - rows = self.take(topLimit, bottomLimit) if len(rows) == 0: - return HTML_TEMPLATE.format("\n", "\n") + return HTML_TEMPLATE.format("\n", "\n", 0) assert topLimit == -1 or bottomLimit == -1 or len(rows) <= topLimit + bottomLimit @@ -246,8 +235,8 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): for r in rows: if i == 0: # we set num columns based on the first row - num_columns = r.getNumColumns() - body += self._getHTMLRow(i, r) + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 + body += table_format.getHTMLRow(i, r) i += 1 else: # some data is not processed because of limiting @@ -257,9 +246,9 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): break if i == 0: # we set num columns based on the first row - num_columns = r.getNumColumns() + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 - body += self._getHTMLRow(i, r) + body += table_format.getHTMLRow(i, r) i += 1 # add the ... @@ -270,14 +259,15 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): body += " \n" for j in range(i, len(rows)): - body += self._getHTMLRow(i, rows[j]) + body += table_format.getHTMLRow(len(rows) - j, rows[j]) assert num_columns is not None # construct headers column_names = self._dataSet.columns() - if column_names is not None: - assert (num_columns == column_names.size()) + headers_str += " \n" + if len(column_names) > 0: + assert (num_columns == len(column_names)) for c_name in column_names: headers_str += " \n".format(c_name) else: @@ -287,13 +277,79 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): return HTML_TEMPLATE.format(headers_str, body, num_columns) - def _getConsoleRow(self, ind, row): - # TODO(march): (work on this) - pass + def showStrPreview(self, topLimit=5, bottomLimit=5): + """ action that generates a physical plan, processes data and return a subset of results as nicely formatted + ASCII table to stdout. + + Args: + topLimit (int): number of top rows to collect. If ``None`` all rows will be collected + bottomLimit (int): number of bottom rows to collect. If ``None`` all rows will be collected + + Returns: + string: an HTML table showing a preview of the data + """ + assert self._dataSet is not None, 'internal API error, datasets must be created via context objects' + + rows = self.take(topLimit, bottomLimit) + + if len(rows) == 0: + return ( + "---\n" + "| |\n" + "---\n" + "0 columns\n") + + assert topLimit == -1 or bottomLimit == -1 or len(rows) <= topLimit + bottomLimit + + str_table = [] + num_columns = None + + # construct tables + if len(rows) < topLimit + bottomLimit: + # the data is small so we get everything (no need to render ...) 
+ i = 0 + for r in rows: + if i == 0: + # we set num columns based on the first row + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 + str_table.append(table_format.getStrTableRow(i, r)) + i += 1 + else: + # some data is not processed because of limiting + i = 0 + for r in rows: + if i >= topLimit: + break + if i == 0: + # we set num columns based on the first row + num_columns = len(r) if isinstance(r, list) or isinstance(r, tuple) else 1 + + str_table.append(table_format.getStrTableRow(i, r)) + i += 1 + + # add the ... + str_table.append(["..."] * (num_columns + 1)) + + for j in range(i, len(rows)): + str_table.append(table_format.getStrTableRow(len(rows) - j, rows[j])) + + assert num_columns is not None + + # construct headers + column_names = self._dataSet.columns() + headers_list = [""] + if len(column_names) > 0: + assert (num_columns == len(column_names)) + for c_name in column_names: + headers_list.append("{}".format(c_name)) + else: + # default to generic name if column name doesn't exist + for i in range(num_columns): + headers_list.append("Column {}".format(i)) + + str_table = [headers_list] + str_table - def showConsolePreview(self, topLimit=5, bottomLimit=5): - # TODO(march): (work on this) - pass + return table_format.generateStrTable(num_columns + 1, str_table) def resolve(self, eclass, ftor): """ Adds a resolver operator to the pipeline. The signature of ftor needs to be identical to the one of the preceding operator. diff --git a/tuplex/python/tuplex/utils/table_format.py b/tuplex/python/tuplex/utils/table_format.py new file mode 100644 index 000000000..bb83118b4 --- /dev/null +++ b/tuplex/python/tuplex/utils/table_format.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +# ----------------------------------------------------------------------------------------------------------------------# +# # +# Tuplex: Blazing Fast Python Data Science # +# # +# # +# (c) 2017 - 2021, Tuplex team # +# Created by Leonhard Spiegelberg first on 4/19/2022 # +# License: Apache 2.0 # +# ----------------------------------------------------------------------------------------------------------------------# + +def getHTMLRow(ind, row): + """ + Given a row, converts all the contents to an HTML row and return + :param ind: the index of that row + :param row: a row output from dataset + :return: an HTML row, representative of the row + """ + row_str = "" + row_str += " \n" + row_str += " \n".format(ind) + if isinstance(row, list) or isinstance(row, tuple): + for col in row: + row_str += " \n".format(col) + else: + row_str += " \n".format(row) + row_str += " \n" + return row_str + + +def getStrTableRow(ind, row): + """ + Given a row, converts all the contents to string and return + :param ind: the index of that row + :param row: a row output from dataset + :return: a list of string, representative of the row + """ + row_str_list = ["{}".format(ind)] + if isinstance(row, list) or isinstance(row, tuple): + for col in row: + row_str_list.append("{}".format(col)) + else: + row_str_list.append("{}".format(row)) + return row_str_list + + +def _getLineDivider(col_width): + out = "" + for w in col_width: + out += "+" + ("-" * (w + 2)) + out += "+\n" + + return out + +def generateStrTable(numCols, strTable): + """ + Given a 2-dimensional list of strings, print a nicely formatted table of the contents in the list + :param numCols: number of columns in the table + :param strTable: 2-dimensional list of strings, as list of list + :return: a nicely formatted table in string + """ + 
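    # [Editorial note, not part of the original patch] A small worked example of the
    # width computation below, assuming a one-column table plus the leading index column:
    #
    #     strTable = [["", "colA"], ["0", "hello"], ["1", "hi"]]
    #     # -> max_col_width becomes [1, 5]; each cell is then padded to its column's
    #     # widest entry, e.g. the row "| 0 | hello |".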
max_col_width = [0] * numCols + + for r in strTable: + for i in range(0, len(r)): + assert (isinstance(r[i], str)) + if len(r[i]) > max_col_width[i]: + max_col_width[i] = len(r[i]) + + output_str = "" + + for r in strTable: + output_str += _getLineDivider(max_col_width) + for i in range(0, len(r)): + output_str += "| {:<{width}} ".format(r[i], width=max_col_width[i]) + output_str += "|\n" + + output_str += _getLineDivider(max_col_width) + "{} columns\n".format(numCols) + + return output_str diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index eda609518..4e4a70f53 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -282,8 +282,8 @@ TEST_F(TakeTest, collectIdentityTest) { } TEST_F(TakeTest, fileInputTest) { - const std::vector test_size{1, 10, 1001, 50001}; - const std::vector limit_values{0, 1, 6, 600, 10000}; + const std::vector test_size{1, 1001, 50001}; + const std::vector limit_values{0, 1, 600, 10000}; const std::vector partition_sizes{"256B", "1MB"}; std::vector> expected_outputs; @@ -337,10 +337,6 @@ TEST_F(TakeTest, fileInputTest) { ASSERT_EQ(res_row.getInt(0), r.getInt(0)); ASSERT_EQ(res_row.getString(1), r.getString(1)); ASSERT_EQ(res_row.getInt(2), r.getInt(2)); - // TODO(march): this doesn't work because schema are different (for some reason infer as opt[int]?) - // if (!(res_row == r)) { - // ASSERT_EQ(res_row, r); - // } } } } From a2d41784b6a00050b96920c0b7f7cb61f2fce206 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Tue, 19 Apr 2022 23:47:17 -0400 Subject: [PATCH 42/56] Remove showHTMLPreview from Dataset in C++ --- tuplex/core/include/DataSet.h | 8 -- tuplex/core/src/DataSet.cc | 108 -------------------------- tuplex/python/include/PythonDataSet.h | 1 - tuplex/python/src/PythonBindings.cc | 1 - tuplex/python/src/PythonDataSet.cc | 49 ------------ 5 files changed, 167 deletions(-) diff --git a/tuplex/core/include/DataSet.h b/tuplex/core/include/DataSet.h index 3a5f450ac..1b11c1f75 100644 --- a/tuplex/core/include/DataSet.h +++ b/tuplex/core/include/DataSet.h @@ -130,14 +130,6 @@ namespace tuplex { */ virtual void show(int64_t numRows = -1, std::ostream &os = std::cout); - /*! - * Displays a formatted HTML table of a small portion of the data - * @param topLimit how many top rows to print - * @param bottomLimit how many bottom rows to print - * @param os ostream where to print table to - */ - virtual void showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os = std::cout); - // named dataset management functions /*! * map Column using a UDF diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index b62946ae4..d54edb567 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -756,114 +756,6 @@ namespace tuplex { printTable(os, headers, rows); } - void printHTMLRow(std::ostream &os, size_t ind, const Row& r) { - os << " \n"; - os << fmt::format(" \n", ind); - for (auto& s : r.getAsStrings()) { - os << fmt::format(" \n", s); - } - os << " \n"; - } - - void DataSet::showHTMLPreview(size_t topLimit, size_t bottomLimit, std::ostream &os) { - std::string HTML_TEMPLATE = - "
\n" - "\n" - "
{}
{}{}{}
{}{}
\n" - " \n" - " \n" - "{}" - " \n" - " \n" - " \n" - "{}" - " \n" - "
\n" - "

{} columns

\n" - "
"; - - assert(_context); - - auto rows = take(topLimit, bottomLimit); - - if (rows->rowCount() == 0) { - os << fmt::format(HTML_TEMPLATE, "\n", "\n", 0); - return; - } - - std::stringstream headers_stream, body_stream; - size_t numColumns = 0; - assert(rows->rowCount() <= topLimit + bottomLimit); - - // construct tables - if (rows->rowCount() < topLimit + bottomLimit) { - // the data is small so we get everything (no need to render ...) - for (size_t i = 0; rows->hasNextRow(); i++) { - Row r = rows->getNextRow(); - if (i == 0) { - // we set num columns based on the first row - numColumns = r.getNumColumns(); - } - - printHTMLRow(body_stream, i, r); - } - } else { - // some data is not processed because of limiting - size_t i; - for (i = 0; rows->hasNextRow() && i < topLimit; i++) { - Row r = rows->getNextRow(); - if (i == 0) { - // we set num columns based on the first row - numColumns = r.getNumColumns(); - } - - printHTMLRow(body_stream, i, r); - } - - // add the ... - body_stream << " \n"; - body_stream << " ...\n"; - for(int j = 0; j < numColumns; j++) { - body_stream << " ...\n"; - body_stream << " \n"; - } - - while (rows->hasNextRow()) { - Row r = rows->getNextRow(); - printHTMLRow(body_stream, i, r); - } - } - - assert(numColumns != 0); - - // construct headers - if (!_columnNames.empty()) { - assert(numColumns == _columnNames.size()); - for (auto &c_name: _columnNames) { - headers_stream << fmt::format(" {}\n", c_name); - } - } else { - // default to generic name if column name doesn't exist - for (int i = 0; i < numColumns; ++i) { - headers_stream << fmt::format(" Column {}\n", i); - } - } - - os << fmt::format(HTML_TEMPLATE, headers_stream.str(), body_stream.str(), numColumns); - } - Schema DataSet::schema() const { if(!_operator) return Schema::UNKNOWN; diff --git a/tuplex/python/include/PythonDataSet.h b/tuplex/python/include/PythonDataSet.h index 4761ac7f0..ede482d9c 100644 --- a/tuplex/python/include/PythonDataSet.h +++ b/tuplex/python/include/PythonDataSet.h @@ -79,7 +79,6 @@ namespace tuplex { py::object collect(); py::object take(const int64_t topLimit, const int64_t bottomLimit); void show(const int64_t numRows=-1); - std::string showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit); // DataFrame like operations PythonDataSet mapColumn(const std::string& column, const std::string& lambda_code, const std::string& pickled_code, const py::object& closure=py::object()); diff --git a/tuplex/python/src/PythonBindings.cc b/tuplex/python/src/PythonBindings.cc index ab239a1a2..6b3683853 100644 --- a/tuplex/python/src/PythonBindings.cc +++ b/tuplex/python/src/PythonBindings.cc @@ -41,7 +41,6 @@ PYMODULE { py::class_(m, "_DataSet") .def("show", &tuplex::PythonDataSet::show) - .def("showHTMLPreview", &tuplex::PythonDataSet::showHTMLPreview) .def("collect", &tuplex::PythonDataSet::collect) .def("take", &tuplex::PythonDataSet::take) .def("map", &tuplex::PythonDataSet::map) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index 5382ad24d..ec972a899 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -884,55 +884,6 @@ namespace tuplex { } } - std::string PythonDataSet::showHTMLPreview(const int64_t topLimit, const int64_t bottomLimit) { - // make sure a dataset is wrapped - assert(this->_dataset); - - // is callee error dataset? 
-        if (this->_dataset->isError()) {
-            auto errset = dynamic_cast<ErrorDataSet*>(this->_dataset);
-            assert(errset);
-            return "Error: " + errset->getError();
-        } else {
-            // release GIL & hand over everything to Tuplex
-            assert(PyGILState_Check()); // make sure this thread holds the GIL!
-            python::unlockGIL();
-
-            std::stringstream ss;
-            std::string err_message;
-
-            size_t castedTopLimit = 0;
-            if (topLimit < 0) {
-                castedTopLimit = std::numeric_limits<size_t>::max();
-            }
-
-            size_t castedBottomLimit = 0;
-            if (bottomLimit < 0) {
-                castedBottomLimit = std::numeric_limits<size_t>::max();
-            }
-
-            try {
-                this->_dataset->showHTMLPreview(castedTopLimit, castedBottomLimit, ss);
-            } catch (const std::exception &e) {
-                err_message = e.what();
-                Logger::instance().defaultLogger().error(err_message);
-            } catch (...) {
-                err_message = "unknown C++ exception occurred, please change type.";
-                Logger::instance().defaultLogger().error(err_message);
-            }
-
-            // reacquire GIL
-            python::lockGIL();
-            Logger::instance().flushToPython();
-
-            if (!ss.str().empty() && err_message.empty()) {
-                return ss.str();
-            } else {
-                return "Error occurred: " + err_message;
-            }
-        }
-    }
-
     PyObject* PythonDataSet::anyToCPythonWithPyObjects(ResultSet* rs, size_t maxRowCount) {
         assert(rs);
 
From 993937d33918ca7e7e3036779ee8d563ff196a89 Mon Sep 17 00:00:00 2001
From: KorlaMarch
Date: Wed, 20 Apr 2022 00:15:03 -0400
Subject: [PATCH 43/56] Separate out partition utils

---
 tuplex/core/include/PartitionUtils.h        |  46 +++++++
 tuplex/core/include/ee/local/LocalBackend.h |  22 ----
 tuplex/core/src/PartitionUtils.cc           | 138 ++++++++++++++++++++
 tuplex/core/src/ee/local/LocalBackend.cc    | 125 +-----------------
 tuplex/python/tuplex/utils/table_format.py  |   2 +-
 tuplex/test/core/TakeTest.cc                |   2 +-
 6 files changed, 187 insertions(+), 148 deletions(-)
 create mode 100644 tuplex/core/include/PartitionUtils.h
 create mode 100644 tuplex/core/src/PartitionUtils.cc

diff --git a/tuplex/core/include/PartitionUtils.h b/tuplex/core/include/PartitionUtils.h
new file mode 100644
index 000000000..d247edcfc
--- /dev/null
+++ b/tuplex/core/include/PartitionUtils.h
@@ -0,0 +1,46 @@
+//--------------------------------------------------------------------------------------------------------------------//
+//                                      Tuplex: Blazing Fast Python Data Science                                       //
+//                                                                                                                      //
+//  (c) 2017 - 2021, Tuplex team                                                                                        //
+//  Created by March Boonyapaluk first on 4/19/2021                                                                    //
+//  License: Apache 2.0                                                                                                 //
+//--------------------------------------------------------------------------------------------------------------------//
+
+#ifndef TUPLEX_PARTITIONUTILS_H
+#define TUPLEX_PARTITIONUTILS_H
+
+#include
+#include
+#include
+
+namespace tuplex {
+    /*!
+     * Trim a list of partitions so that it includes up to the first n rows and the last m rows;
+     * if n + m > number of rows in the input partitions, the partitions will remain unchanged
+     * @param partitions [in,out] the list of partitions to trim
+     * @param topLimit n, the number of top rows to include
+     * @param bottomLimit m, the number of bottom rows to include
+     * @param tstage pointer to transform stage, might be used to generate a new partition
+     * @param exec pointer to executor, might be used to allocate a new partition
+     */
+    void trimPartitionsToLimit(std::vector<Partition*> &partitions, size_t topLimit, size_t bottomLimit,
+                               TransformStage *tstage, Executor *exec);
+
+    /*!
+ * Create a newly allocated partition with the same data as the specified partition, but with the first n rows removed + * @param p_in the input partition + * @param numToSkip number of rows to remove from the new partition + * @param tstage pointer to transform stage, used to generate new partition + * @param exec pointer to executor, used to allocate new partition + * @return the new partition + */ + Partition *newPartitionWithSkipRows(Partition *p_in, + size_t numToSkip, + TransformStage *tstage, + Executor *exec); + +} + +#endif //TUPLEX_PARTITIONUTILS_H diff --git a/tuplex/core/include/ee/local/LocalBackend.h b/tuplex/core/include/ee/local/LocalBackend.h index 3d73a5d9f..7f42ff1cb 100644 --- a/tuplex/core/include/ee/local/LocalBackend.h +++ b/tuplex/core/include/ee/local/LocalBackend.h @@ -185,28 +185,6 @@ namespace tuplex { * @return */ extern URI outputURI(const UDF& udf, const URI& baseURI, int64_t partNo, FileFormat fmt); - - /*! - * Trim list of partitions so that it includes up to the first n rows and the last m rows - * if n + m > number of rows in input partitions, the partitions will remain unchanged - * @param partitions [in,out] the list of partitions to trim - * @param topLimit n, the number of top rows to include - * @param bottomLimit m, the number of bottom rows to include - * @param tstage pointer to transform stage, might be used to generate new partition - * @param exec pointer to executor, might be used to allocate new partition - */ - extern void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, - TransformStage *tstage, Executor *exec); - - /*! - * Create a newly allocated partition with the same data as the specified partition, but with the first n rows removed - * @param p_in the input partition - * @param numToSkip number of rows to remove from the new partition - * @param tstage pointer to transform stage, used to generate new partition - * @param exec pointer to executor, used to allocate new partition - * @return the new partition - */ - extern Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec); } #endif //TUPLEX_LOCALBACKEND_H \ No newline at end of file diff --git a/tuplex/core/src/PartitionUtils.cc b/tuplex/core/src/PartitionUtils.cc new file mode 100644 index 000000000..745332c93 --- /dev/null +++ b/tuplex/core/src/PartitionUtils.cc @@ -0,0 +1,138 @@ +//--------------------------------------------------------------------------------------------------------------------// +// // +// Tuplex: Blazing Fast Python Data Science // +// // +// // +// (c) 2017 - 2021, Tuplex team // +// Created by March Boonyapaluk first on 4/19/2021 // +// License: Apache 2.0 // +//--------------------------------------------------------------------------------------------------------------------// + +#include "PartitionUtils.h" + +namespace tuplex { + + void trimPartitionsToLimit(std::vector &partitions, + size_t topLimit, + size_t bottomLimit, + TransformStage* tstage, + Executor *exec) { + std::vector limitedPartitions, limitedTailPartitions; + + // check top output limit, adjust partitions if necessary + size_t numTopOutputRows = 0; + Partition *lastTopPart = nullptr; + size_t clippedTop = 0; + for (auto partition: partitions) { + numTopOutputRows += partition->getNumRows(); + lastTopPart = partition; + if (numTopOutputRows >= topLimit) { + // clip last partition & leave loop + clippedTop = topLimit - (numTopOutputRows - partition->getNumRows()); + assert(clippedTop <= 
partition->getNumRows()); + break; + } else if (partition == partitions.back()) { + // last partition, mark full row, but don't put to output set yet to avoid double put + clippedTop = partition->getNumRows(); + break; + } else { + // put full partition to output set + limitedPartitions.push_back(partition); + } + } + + // check the bottom output limit, adjust partitions if necessary + size_t numBottomOutputRows = 0; + size_t clippedBottom = 0; + for (auto it = partitions.rbegin(); it != partitions.rend(); it++) { + auto partition = *it; + numBottomOutputRows += partition->getNumRows(); + + if (partition == lastTopPart) { + // the bottom and the top partitions are overlapping + clippedBottom = bottomLimit - (numBottomOutputRows - partition->getNumRows()); + if (clippedTop + clippedBottom >= partition->getNumRows()) { + // if top and bottom range intersect, use full partitions + clippedTop = partition->getNumRows(); + clippedBottom = 0; + } + break; + } else if (numBottomOutputRows >= bottomLimit) { + // clip last partition & leave loop + auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); + assert(clipped <= partition->getNumRows()); + if (clipped > 0) { + Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage, + exec); + assert(newPart->getNumRows() == clipped); + limitedTailPartitions.push_back(newPart); + } + partition->invalidate(); + break; + } else { + // put full partition to output set + limitedTailPartitions.push_back(partition); + } + } + + // push the middle partition + if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) { + assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); + + // split into two partitions with both top and bottom are in the same partition + Partition *lastBottomPart = nullptr; + + if (clippedBottom != 0) { + lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, + tstage, exec); + } + + if (clippedTop != 0) { + lastTopPart->setNumRows(clippedTop); + limitedPartitions.push_back(lastTopPart); + } else { + lastTopPart->invalidate(); + } + + if (lastBottomPart != nullptr) { + limitedPartitions.push_back(lastBottomPart); + } + } + + // merge the head and tail partitions + partitions.clear(); + partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end()); + partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend()); + } + + Partition *newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec) { + auto ptr = p_in->lockRaw(); + auto num_rows = *((int64_t *) ptr); + assert(numToSkip < num_rows); + + ptr += sizeof(int64_t); + size_t numBytesToSkip = 0; + + Deserializer ds(tstage->outputSchema()); + for (unsigned i = 0; i < numToSkip; ++i) { + Row r = Row::fromMemory(ds, ptr, p_in->capacity() - numBytesToSkip); + ptr += r.serializedLength(); + numBytesToSkip += r.serializedLength(); + } + + Partition *p_out = exec->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), + tstage->outputSchema(), tstage->outputDataSetID(), + tstage->context().id()); + assert(p_out->capacity() >= p_in->size() - numBytesToSkip); + + auto ptr_out = p_out->lockRaw(); + *((int64_t *) ptr_out) = p_in->getNumRows() - numToSkip; + ptr_out += sizeof(int64_t); + memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); + p_out->unlock(); + + p_in->unlock(); + + return p_out; + } +} // namespace tuplex \ No newline at end of file 
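For orientation before the same code is deleted from LocalBackend.cc below, here is a tiny pure-Python model of what trimPartitionsToLimit is specified to do. The function name and the list-of-lists representation are illustrative only; the real implementation clips and reallocates Partition buffers in place:

    def trim_to_limit_model(partitions, top_limit, bottom_limit):
        # partitions: list of lists of rows; returns the surviving row layout
        rows = [row for part in partitions for row in part]
        if top_limit + bottom_limit >= len(rows):
            return partitions  # limits cover all rows: unchanged, per the doc comment
        head = rows[:top_limit]
        tail = rows[len(rows) - bottom_limit:] if bottom_limit > 0 else []
        return [part for part in (head, tail) if part]

    # 30 rows across 3 partitions, keep first 4 and last 3 -> 7 rows survive
    parts = [list(range(10)), list(range(10, 20)), list(range(20, 30))]
    assert sum(len(p) for p in trim_to_limit_model(parts, 4, 3)) == 7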
diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 351d55b88..676a4e3b3 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -27,7 +27,7 @@ #include #include #include -#include +#include "PartitionUtils.h" namespace tuplex { @@ -2131,127 +2131,4 @@ namespace tuplex { tstage->setFileResult(ecounts); } - void trimPartitionsToLimit(std::vector &partitions, - size_t topLimit, - size_t bottomLimit, - TransformStage* tstage, - Executor *exec) { - std::vector limitedPartitions, limitedTailPartitions; - - // check top output limit, adjust partitions if necessary - size_t numTopOutputRows = 0; - Partition *lastTopPart = nullptr; - size_t clippedTop = 0; - for (auto partition: partitions) { - numTopOutputRows += partition->getNumRows(); - lastTopPart = partition; - if (numTopOutputRows >= topLimit) { - // clip last partition & leave loop - clippedTop = topLimit - (numTopOutputRows - partition->getNumRows()); - assert(clippedTop <= partition->getNumRows()); - break; - } else if (partition == partitions.back()) { - // last partition, mark full row, but don't put to output set yet to avoid double put - clippedTop = partition->getNumRows(); - break; - } else { - // put full partition to output set - limitedPartitions.push_back(partition); - } - } - - // check the bottom output limit, adjust partitions if necessary - size_t numBottomOutputRows = 0; - size_t clippedBottom = 0; - for (auto it = partitions.rbegin(); it != partitions.rend(); it++) { - auto partition = *it; - numBottomOutputRows += partition->getNumRows(); - - if (partition == lastTopPart) { - // the bottom and the top partitions are overlapping - clippedBottom = bottomLimit - (numBottomOutputRows - partition->getNumRows()); - if (clippedTop + clippedBottom >= partition->getNumRows()) { - // if top and bottom range intersect, use full partitions - clippedTop = partition->getNumRows(); - clippedBottom = 0; - } - break; - } else if (numBottomOutputRows >= bottomLimit) { - // clip last partition & leave loop - auto clipped = bottomLimit - (numBottomOutputRows - partition->getNumRows()); - assert(clipped <= partition->getNumRows()); - if (clipped > 0) { - Partition *newPart = newPartitionWithSkipRows(partition, partition->getNumRows() - clipped, tstage, - exec); - assert(newPart->getNumRows() == clipped); - limitedTailPartitions.push_back(newPart); - } - partition->invalidate(); - break; - } else { - // put full partition to output set - limitedTailPartitions.push_back(partition); - } - } - - // push the middle partition - if (lastTopPart != nullptr && (clippedTop > 0 || clippedBottom > 0)) { - assert(clippedTop + clippedBottom <= lastTopPart->getNumRows()); - - // split into two partitions with both top and bottom are in the same partition - Partition *lastBottomPart = nullptr; - - if (clippedBottom != 0) { - lastBottomPart = newPartitionWithSkipRows(lastTopPart, lastTopPart->getNumRows() - clippedBottom, - tstage, exec); - } - - if (clippedTop != 0) { - lastTopPart->setNumRows(clippedTop); - limitedPartitions.push_back(lastTopPart); - } else { - lastTopPart->invalidate(); - } - - if (lastBottomPart != nullptr) { - limitedPartitions.push_back(lastBottomPart); - } - } - - // merge the head and tail partitions - partitions.clear(); - partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end()); - partitions.insert(partitions.end(), limitedTailPartitions.rbegin(), limitedTailPartitions.rend()); - } - - Partition 
*newPartitionWithSkipRows(Partition *p_in, size_t numToSkip, TransformStage *tstage, Executor *exec) { - auto ptr = p_in->lockRaw(); - auto num_rows = *((int64_t *) ptr); - assert(numToSkip < num_rows); - - ptr += sizeof(int64_t); - size_t numBytesToSkip = 0; - - Deserializer ds(tstage->outputSchema()); - for (unsigned i = 0; i < numToSkip; ++i) { - Row r = Row::fromMemory(ds, ptr, p_in->capacity() - numBytesToSkip); - ptr += r.serializedLength(); - numBytesToSkip += r.serializedLength(); - } - - Partition *p_out = exec->allocWritablePartition(p_in->size() - numBytesToSkip + sizeof(int64_t), - tstage->outputSchema(), tstage->outputDataSetID(), - tstage->context().id()); - assert(p_out->capacity() >= p_in->size() - numBytesToSkip); - - auto ptr_out = p_out->lockRaw(); - *((int64_t *) ptr_out) = p_in->getNumRows() - numToSkip; - ptr_out += sizeof(int64_t); - memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); - p_out->unlock(); - - p_in->unlock(); - - return p_out; - } } // namespace tuplex \ No newline at end of file diff --git a/tuplex/python/tuplex/utils/table_format.py b/tuplex/python/tuplex/utils/table_format.py index bb83118b4..ecd333f5a 100644 --- a/tuplex/python/tuplex/utils/table_format.py +++ b/tuplex/python/tuplex/utils/table_format.py @@ -5,7 +5,7 @@ # # # # # (c) 2017 - 2021, Tuplex team # -# Created by Leonhard Spiegelberg first on 4/19/2022 # +# Created by March Boonyapaluk first on 4/19/2022 # # License: Apache 2.0 # # ----------------------------------------------------------------------------------------------------------------------# diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 4e4a70f53..40b624ca8 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -4,7 +4,7 @@ // // // // // (c) 2017 - 2021, Tuplex team // -// Created by Leonhard Spiegelberg first on 1/1/2021 // +// Created by March Boonyapaluk first on 4/19/2021 // // License: Apache 2.0 // //--------------------------------------------------------------------------------------------------------------------// From 6f528f889221a0553b426d4939572c8fa307a8b2 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 20 Apr 2022 11:45:34 -0400 Subject: [PATCH 44/56] Fix Azure pipeline failing --- tuplex/python/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tuplex/python/CMakeLists.txt b/tuplex/python/CMakeLists.txt index b0b0e54c5..7ccb7057c 100644 --- a/tuplex/python/CMakeLists.txt +++ b/tuplex/python/CMakeLists.txt @@ -104,6 +104,7 @@ FILE(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/__init__.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/tracebacks.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/version.py ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/globs.py + ${CMAKE_CURRENT_SOURCE_DIR}/tuplex/utils/table_format.py DESTINATION ${PYTHON_DIST_DIR}/tuplex/utils) FILE(COPY ${CMAKE_CURRENT_SOURCE_DIR}/tests/test_tuples.py From 816567f9e9c78cc3154286c500318c344802c991 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 20 Apr 2022 12:58:29 -0400 Subject: [PATCH 45/56] Minor Debug in Python lib --- tuplex/python/src/PythonDataSet.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuplex/python/src/PythonDataSet.cc b/tuplex/python/src/PythonDataSet.cc index ec972a899..e04fc73fe 100644 --- a/tuplex/python/src/PythonDataSet.cc +++ b/tuplex/python/src/PythonDataSet.cc @@ -177,7 +177,7 @@ namespace tuplex { // new version, directly interact with the interpreter Timer timer; // build python list object from resultset - auto listObj = 
resultSetToCPython(rs.get(), castedTopLimit); + auto listObj = resultSetToCPython(rs.get(), rs->rowCount()); Logger::instance().logger("python").info("Data transfer back to python took " + std::to_string(timer.time()) + " seconds"); // Logger::instance().flushAll(); From 4b2e2af40ea5c7fa1fcfdafbdc9dbcb992c43b45 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 22 Apr 2022 03:45:52 -0400 Subject: [PATCH 46/56] Remove column counts --- tuplex/python/tuplex/dataset.py | 5 ++--- tuplex/python/tuplex/utils/table_format.py | 2 -- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/tuplex/python/tuplex/dataset.py b/tuplex/python/tuplex/dataset.py index c0b9ef4d0..6a3f9ca71 100644 --- a/tuplex/python/tuplex/dataset.py +++ b/tuplex/python/tuplex/dataset.py @@ -212,7 +212,6 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5): "{}" " \n" "\n" - "

-            "<p>{} columns</p>\n"
             "</div>")
 
         assert self._dataSet is not None, 'internal API error, datasets must be created via context objects'
@@ -256,7 +255,7 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5):
             body += "      <th>...</th>\n"
             for _ in range(num_columns):
                 body += "        <td>...</td>\n"
-            body += "    </tr>\n"
+            body += "    </tr>\n"
 
             for j in range(i, len(rows)):
                 body += table_format.getHTMLRow(len(rows) - j, rows[j])
@@ -275,7 +274,7 @@ def showHTMLPreview(self, topLimit=5, bottomLimit=5):
         for i in range(num_columns):
             headers_str += "      <th>Column {}</th>\n".format(i)
 
-        return HTML_TEMPLATE.format(headers_str, body, num_columns)
+        return HTML_TEMPLATE.format(headers_str, body)
 
 def showStrPreview(self, topLimit=5, bottomLimit=5):
     """ action that generates a physical plan, processes data and return a subset of results as nicely formatted
diff --git a/tuplex/python/tuplex/utils/table_format.py b/tuplex/python/tuplex/utils/table_format.py
index ecd333f5a..7bc8dd9d9 100644
--- a/tuplex/python/tuplex/utils/table_format.py
+++ b/tuplex/python/tuplex/utils/table_format.py
@@ -75,6 +75,4 @@ def generateStrTable(numCols, strTable):
             output_str += "| {:<{width}} ".format(r[i], width=max_col_width[i])
         output_str += "|\n"
 
-    output_str += _getLineDivider(max_col_width) + "{} columns\n".format(numCols)
-
     return output_str

From a935f1e71c064aea9b631755c2ada55fe22f63b2 Mon Sep 17 00:00:00 2001
From: KorlaMarch
Date: Wed, 11 May 2022 13:18:59 -0400
Subject: [PATCH 47/56] Fix CI not running core tests

---
 tuplex/test/CMakeLists.txt | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/tuplex/test/CMakeLists.txt b/tuplex/test/CMakeLists.txt
index 8497ef8b2..ce7200e01 100755
--- a/tuplex/test/CMakeLists.txt
+++ b/tuplex/test/CMakeLists.txt
@@ -1,14 +1,26 @@
 find_package(Threads REQUIRED)
 
 # some tests require the cloudpickle package to be installed, hence check for it here
-find_package(Python3 COMPONENTS Interpreter)
-if(Python3_FOUND)
+find_package(Python3 COMPONENTS Interpreter Development QUIET)
+if (Python3_FOUND)
+    message(STATUS "Found full python3-dev installation")
+    set(Python3_Embed_FOUND TRUE)
+else ()
+    find_package(Python3 COMPONENTS Interpreter REQUIRED)
+    # python3 -c 'import distutils.sysconfig; print(distutils.sysconfig.get_python_lib(plat_specific=False,standard_lib=True))'
+    # try to get module libs at least
+
+    # mark embed lib as not found
+    unset(Python3_Embed_FOUND)
+endif ()
+
+if (Python3_FOUND)
     # check that cloudpickle is installed via import
     set(cmd -c "import cloudpickle")
     execute_process(COMMAND ${Python3_EXECUTABLE} ${cmd} RESULT_VARIABLE ret)
-    if (NOT "${ret}" STREQUAL "0")
+    if (NOT "${ret}" STREQUAL "0")
        message(FATAL_ERROR "Could not find cloudpickle module, please install via pip3 install cloudpickle.")
-    endif()
+    endif ()
 
     # check that numpy is installed too for testing purposes...
set(cmd -c "import numpy") @@ -68,7 +80,7 @@ if(Python3_Embed_FOUND) add_subdirectory(core) add_subdirectory(wrappers) else() - message(STATUS "deactivating C++ tests for core/wrappers because no full Python dev installation found.") + message(WARNING "deactivating C++ tests for core/wrappers because no full Python dev installation found.") endif() # Resources::: From 359ffed15f903ccd400b2231b914abae17d201db Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 13 May 2022 17:22:55 -0400 Subject: [PATCH 48/56] Speed up tests --- tuplex/test/core/TakeTest.cc | 130 ++++++++++++++++++---------- tuplex/test/wrappers/WrapperTest.cc | 41 +++++++++ 2 files changed, 127 insertions(+), 44 deletions(-) diff --git a/tuplex/test/core/TakeTest.cc b/tuplex/test/core/TakeTest.cc index 40b624ca8..2c7a1e067 100644 --- a/tuplex/test/core/TakeTest.cc +++ b/tuplex/test/core/TakeTest.cc @@ -19,6 +19,58 @@ using namespace std; class TakeTest : public PyTest { }; + +struct TakeTestConfig { + size_t data_size; + size_t top_limit; + size_t bottom_limit; + string partition_sizes; +}; + +/** + * Generate a predefine list of test scenarios composing of different data size and limit values + */ +vector generateTakeTestCfgs() { + std::vector testCfgs; + + // generate exhaustive test for small values + const std::vector small_test_size{1, 10}; + const std::vector small_limit_values{0, 1, 5, 11}; + for (auto data_size: small_test_size) { + for (auto top_limit: small_limit_values) { + for (auto bottom_limit: small_limit_values) { + testCfgs.push_back({data_size, top_limit, bottom_limit, "256B"}); + } + } + } + + // add pre-defined bigger cases + testCfgs.push_back({1000, 600, 0, "256B"}); + testCfgs.push_back({1000, 600, 600, "256B"}); + testCfgs.push_back({1000, 11, 600, "512KB"}); + + testCfgs.push_back({10001, 600, 1001, "256B"}); + testCfgs.push_back({10001, 600, 1001, "512KB"}); + testCfgs.push_back({10001, 600, 1001, "1MB"}); + + testCfgs.push_back({10001, 5000, 4950, "256B"}); + testCfgs.push_back({10001, 5000, 4950, "512KB"}); + testCfgs.push_back({10001, 5000, 4950, "1MB"}); + + return testCfgs; +} + +/** + * partition test into different partition sizes to avoid reinitializing the same context multiple times + */ +map> splitCfgsByPartitionSize(const std::vector &testCfgs) { + map> mp; + for (const auto &cfg: testCfgs) { + mp[cfg.partition_sizes].push_back(cfg); + } + return mp; +} + /** * Randomly generate a vector of rows for testing * @param N the size of vector @@ -167,32 +219,27 @@ TEST_F(TakeTest, takeBothTest) { TEST_F(TakeTest, takeBigTest) { mt19937 data_seed_gen(4242); - const std::vector test_size{1, 10, 100, 1001, 10001}; - const std::vector limit_values{0, 1, 5, 11, 600, 10000}; - const std::vector partition_sizes{"256B", "512KB", "1MB"}; + auto testCfgs = generateTakeTestCfgs(); + auto partitionedCfgs = splitCfgsByPartitionSize(testCfgs); - for (auto &part_size: partition_sizes) { + for (const auto &cfg_pair: partitionedCfgs) { auto opt = testOptions(); - opt.set("tuplex.partitionSize", part_size); + opt.set("tuplex.partitionSize", cfg_pair.first); Context context(opt); - for (auto data_size: test_size) { - for (auto top_limit: limit_values) { - for (auto bottom_limit: limit_values) { - std::cout << "testing with partition size:" << part_size << " data size:" - << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + for (const auto &cfg: cfg_pair.second) { + std::cout << "testing with partition size:" << cfg.partition_sizes << " data size:" + << cfg.data_size << " top:" << 
cfg.top_limit << " bottom:" << cfg.bottom_limit << std::endl; - auto data = generateTestData(data_size, data_seed_gen()); - auto ref_data = generateReferenceData(data, top_limit, bottom_limit); + auto data = generateTestData(cfg.data_size, data_seed_gen()); + auto ref_data = generateReferenceData(data, cfg.top_limit, cfg.bottom_limit); - auto res = context.parallelize(data).take(top_limit, bottom_limit); - ASSERT_EQ(ref_data.size(), res->rowCount()); - for (Row &r: ref_data) { - Row res_row = res->getNextRow(); - if (!(res_row == r)) { - ASSERT_EQ(res_row, r); - } - } + auto res = context.parallelize(data).take(cfg.top_limit, cfg.bottom_limit); + ASSERT_EQ(ref_data.size(), res->rowCount()); + for (Row &r: ref_data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); } } } @@ -220,36 +267,31 @@ vector generateMapFilterReferenceData(const vector &input, size_t topL TEST_F(TakeTest, takeMapFilterTest) { mt19937 data_seed_gen(56120); - const std::vector test_size{1, 10, 100, 1001, 10001}; - const std::vector limit_values{0, 1, 5, 11, 600, 10000}; - const std::vector partition_sizes{"256B", "512KB", "1MB"}; + auto testCfgs = generateTakeTestCfgs(); + auto partitionedCfgs = splitCfgsByPartitionSize(testCfgs); UDF map_udf("lambda a, b, c: ((a + b), c)"); UDF filter_udf("lambda a, b: a % 2 == 0"); - for (auto &part_size: partition_sizes) { + for (const auto &cfg_pair: partitionedCfgs) { auto opt = testOptions(); - opt.set("tuplex.partitionSize", part_size); + opt.set("tuplex.partitionSize", cfg_pair.first); Context context(opt); - for (auto data_size: test_size) { - for (auto top_limit: limit_values) { - for (auto bottom_limit: limit_values) { - std::cout << "testing with partition size:" << part_size << " data size:" - << data_size << " top:" << top_limit << " bottom:" << bottom_limit << std::endl; + for (const auto &cfg: cfg_pair.second) { + std::cout << "testing with partition size:" << cfg.partition_sizes << " data size:" + << cfg.data_size << " top:" << cfg.top_limit << " bottom:" << cfg.bottom_limit << std::endl; - auto data = generateTestData(data_size, data_seed_gen()); - auto ref_data = generateMapFilterReferenceData(data, top_limit, bottom_limit); + auto data = generateTestData(cfg.data_size, data_seed_gen()); + auto ref_data = generateMapFilterReferenceData(data, cfg.top_limit, cfg.bottom_limit); - auto ds = context.parallelize(data).map(map_udf).filter(filter_udf); - auto res = ds.take(top_limit, bottom_limit); - ASSERT_EQ(ref_data.size(), res->rowCount()); - for (Row &r: ref_data) { - Row res_row = res->getNextRow(); - if (!(res_row == r)) { - ASSERT_EQ(res_row, r); - } - } + auto ds = context.parallelize(data).map(map_udf).filter(filter_udf); + auto res = ds.take(cfg.top_limit, cfg.bottom_limit); + ASSERT_EQ(ref_data.size(), res->rowCount()); + for (Row &r: ref_data) { + Row res_row = res->getNextRow(); + if (!(res_row == r)) { + ASSERT_EQ(res_row, r); } } } @@ -259,7 +301,7 @@ TEST_F(TakeTest, takeMapFilterTest) { TEST_F(TakeTest, collectIdentityTest) { mt19937 data_seed_gen(123454); - const std::vector test_size{1, 10, 100, 1001, 10001}; + const std::vector test_size{1, 10, 1000, 10001}; const std::vector partition_sizes{"256B", "512KB", "1MB"}; for (auto &part_size: partition_sizes) { @@ -282,8 +324,8 @@ TEST_F(TakeTest, collectIdentityTest) { } TEST_F(TakeTest, fileInputTest) { - const std::vector test_size{1, 1001, 50001}; - const std::vector limit_values{0, 1, 600, 10000}; + const std::vector test_size{1, 1001, 10001}; + const std::vector 
limit_values{0, 1, 600, 5000}; const std::vector partition_sizes{"256B", "1MB"}; std::vector> expected_outputs; diff --git a/tuplex/test/wrappers/WrapperTest.cc b/tuplex/test/wrappers/WrapperTest.cc index ede9dd82d..cec703086 100644 --- a/tuplex/test/wrappers/WrapperTest.cc +++ b/tuplex/test/wrappers/WrapperTest.cc @@ -2521,6 +2521,47 @@ TEST_F(WrapperTest, PartitionRelease) { } +TEST_F(WrapperTest, ResultWithLimitMerge) { + using namespace tuplex; + + PythonContext c("c", "", testOptions()); + + PyObject *listObj = PyList_New(4); + PyObject *tupleObj1 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj1, 0, PyLong_FromLong(1)); + PyTuple_SET_ITEM(tupleObj1, 1, python::PyString_FromString("a")); + + PyObject *tupleObj2 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj2, 0, PyLong_FromLong(2)); + PyTuple_SET_ITEM(tupleObj2, 1, python::PyString_FromString("b")); + + + PyObject *tupleObj3 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj3, 0, PyLong_FromLong(3)); + PyTuple_SET_ITEM(tupleObj3, 1, PyLong_FromLong(42)); + + + PyObject *tupleObj4 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj4, 0, PyLong_FromLong(4)); + PyTuple_SET_ITEM(tupleObj4, 1, python::PyString_FromString("d")); + + PyList_SetItem(listObj, 0, tupleObj1); + PyList_SetItem(listObj, 1, tupleObj2); + PyList_SetItem(listObj, 2, tupleObj3); + PyList_SetItem(listObj, 3, tupleObj4); + + { + auto list = py::reinterpret_borrow(listObj); + auto res = c.parallelize(list).filter("lambda a, b: a > 1", "").take(1, 0); + auto resObj = res.ptr(); + + ASSERT_TRUE(PyList_Check(resObj)); + ASSERT_EQ(PyList_GET_SIZE(resObj), 1); + + PyObject_Print(resObj, stdout, 0); + } +} + //// debug any python module... ///** Takes a path and adds it to sys.paths by calling PyRun_SimpleString. // * This does rather laborious C string concatenation so that it will work in From d36e4b2c78be6dd8bb9e490c3ce1a7ac18dad77b Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 18 May 2022 13:13:14 -0400 Subject: [PATCH 49/56] Disable the limit merge test --- tuplex/test/wrappers/WrapperTest.cc | 82 ++++++++++++++--------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/tuplex/test/wrappers/WrapperTest.cc b/tuplex/test/wrappers/WrapperTest.cc index cec703086..97fd2a3f6 100644 --- a/tuplex/test/wrappers/WrapperTest.cc +++ b/tuplex/test/wrappers/WrapperTest.cc @@ -2520,47 +2520,47 @@ TEST_F(WrapperTest, PartitionRelease) { } - -TEST_F(WrapperTest, ResultWithLimitMerge) { - using namespace tuplex; - - PythonContext c("c", "", testOptions()); - - PyObject *listObj = PyList_New(4); - PyObject *tupleObj1 = PyTuple_New(2); - PyTuple_SET_ITEM(tupleObj1, 0, PyLong_FromLong(1)); - PyTuple_SET_ITEM(tupleObj1, 1, python::PyString_FromString("a")); - - PyObject *tupleObj2 = PyTuple_New(2); - PyTuple_SET_ITEM(tupleObj2, 0, PyLong_FromLong(2)); - PyTuple_SET_ITEM(tupleObj2, 1, python::PyString_FromString("b")); - - - PyObject *tupleObj3 = PyTuple_New(2); - PyTuple_SET_ITEM(tupleObj3, 0, PyLong_FromLong(3)); - PyTuple_SET_ITEM(tupleObj3, 1, PyLong_FromLong(42)); - - - PyObject *tupleObj4 = PyTuple_New(2); - PyTuple_SET_ITEM(tupleObj4, 0, PyLong_FromLong(4)); - PyTuple_SET_ITEM(tupleObj4, 1, python::PyString_FromString("d")); - - PyList_SetItem(listObj, 0, tupleObj1); - PyList_SetItem(listObj, 1, tupleObj2); - PyList_SetItem(listObj, 2, tupleObj3); - PyList_SetItem(listObj, 3, tupleObj4); - - { - auto list = py::reinterpret_borrow(listObj); - auto res = c.parallelize(list).filter("lambda a, b: a > 1", "").take(1, 0); - auto resObj = res.ptr(); - - 
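+        // filter keeps (2, "b"), (3, 42) and (4, "d"); take(1, 0) must then
+        // surface only the first surviving row, hence exactly one list element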
ASSERT_TRUE(PyList_Check(resObj)); - ASSERT_EQ(PyList_GET_SIZE(resObj), 1); - - PyObject_Print(resObj, stdout, 0); - } -} +// TODO: reenable this once the issue is fixed +//TEST_F(WrapperTest, ResultWithLimitMerge) { +// using namespace tuplex; +// +// PythonContext c("c", "", testOptions()); +// +// PyObject *listObj = PyList_New(4); +// PyObject *tupleObj1 = PyTuple_New(2); +// PyTuple_SET_ITEM(tupleObj1, 0, PyLong_FromLong(1)); +// PyTuple_SET_ITEM(tupleObj1, 1, python::PyString_FromString("a")); +// +// PyObject *tupleObj2 = PyTuple_New(2); +// PyTuple_SET_ITEM(tupleObj2, 0, PyLong_FromLong(2)); +// PyTuple_SET_ITEM(tupleObj2, 1, python::PyString_FromString("b")); +// +// +// PyObject *tupleObj3 = PyTuple_New(2); +// PyTuple_SET_ITEM(tupleObj3, 0, PyLong_FromLong(3)); +// PyTuple_SET_ITEM(tupleObj3, 1, PyLong_FromLong(42)); +// +// +// PyObject *tupleObj4 = PyTuple_New(2); +// PyTuple_SET_ITEM(tupleObj4, 0, PyLong_FromLong(4)); +// PyTuple_SET_ITEM(tupleObj4, 1, python::PyString_FromString("d")); +// +// PyList_SetItem(listObj, 0, tupleObj1); +// PyList_SetItem(listObj, 1, tupleObj2); +// PyList_SetItem(listObj, 2, tupleObj3); +// PyList_SetItem(listObj, 3, tupleObj4); +// +// { +// auto list = py::reinterpret_borrow(listObj); +// auto res = c.parallelize(list).filter("lambda a, b: a > 1", "").take(1, 0); +// auto resObj = res.ptr(); +// +// ASSERT_TRUE(PyList_Check(resObj)); +// ASSERT_EQ(PyList_GET_SIZE(resObj), 1); +// +// PyObject_Print(resObj, stdout, 0); +// } +//} //// debug any python module... ///** Takes a path and adds it to sys.paths by calling PyRun_SimpleString. From 2ed45d5b2d62f04e538bcd22ef7bec3aa64fb362 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Wed, 18 May 2022 14:18:58 -0400 Subject: [PATCH 50/56] Fix the wrapper test --- tuplex/core/src/physical/ResultSet.cc | 2 +- tuplex/test/wrappers/WrapperTest.cc | 83 +++++++++++++-------------- 2 files changed, 42 insertions(+), 43 deletions(-) diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index 0eb6d95ad..977c0e188 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -254,7 +254,7 @@ namespace tuplex { for(const auto& partition : _partitions) { count += partition->getNumRows(); } - return count + _pyobjects.size(); + return std::min(count + _pyobjects.size(), _maxRows); } void ResultSet::removeFirstPartition() { diff --git a/tuplex/test/wrappers/WrapperTest.cc b/tuplex/test/wrappers/WrapperTest.cc index 97fd2a3f6..c615c53aa 100644 --- a/tuplex/test/wrappers/WrapperTest.cc +++ b/tuplex/test/wrappers/WrapperTest.cc @@ -2511,7 +2511,7 @@ TEST_F(WrapperTest, PartitionRelease) { cols_to_select = PyList_New(1); PyList_SET_ITEM(cols_to_select, 0, python::PyString_FromString("Incident Zip")); - ctx2.csv(service_path,py::none(), true, false, "", "\"", + ctx2.csv(service_path, py::none(), true, false, "", "\"", py::none(), py::reinterpret_steal(type_dict)) .mapColumn("Incident Zip", fix_zip_codes_c, "") .selectColumns(py::reinterpret_steal(cols_to_select)) @@ -2520,47 +2520,46 @@ TEST_F(WrapperTest, PartitionRelease) { } -// TODO: reenable this once the issue is fixed -//TEST_F(WrapperTest, ResultWithLimitMerge) { -// using namespace tuplex; -// -// PythonContext c("c", "", testOptions()); -// -// PyObject *listObj = PyList_New(4); -// PyObject *tupleObj1 = PyTuple_New(2); -// PyTuple_SET_ITEM(tupleObj1, 0, PyLong_FromLong(1)); -// PyTuple_SET_ITEM(tupleObj1, 1, python::PyString_FromString("a")); -// -// PyObject *tupleObj2 = PyTuple_New(2); 
-// PyTuple_SET_ITEM(tupleObj2, 0, PyLong_FromLong(2)); -// PyTuple_SET_ITEM(tupleObj2, 1, python::PyString_FromString("b")); -// -// -// PyObject *tupleObj3 = PyTuple_New(2); -// PyTuple_SET_ITEM(tupleObj3, 0, PyLong_FromLong(3)); -// PyTuple_SET_ITEM(tupleObj3, 1, PyLong_FromLong(42)); -// -// -// PyObject *tupleObj4 = PyTuple_New(2); -// PyTuple_SET_ITEM(tupleObj4, 0, PyLong_FromLong(4)); -// PyTuple_SET_ITEM(tupleObj4, 1, python::PyString_FromString("d")); -// -// PyList_SetItem(listObj, 0, tupleObj1); -// PyList_SetItem(listObj, 1, tupleObj2); -// PyList_SetItem(listObj, 2, tupleObj3); -// PyList_SetItem(listObj, 3, tupleObj4); -// -// { -// auto list = py::reinterpret_borrow(listObj); -// auto res = c.parallelize(list).filter("lambda a, b: a > 1", "").take(1, 0); -// auto resObj = res.ptr(); -// -// ASSERT_TRUE(PyList_Check(resObj)); -// ASSERT_EQ(PyList_GET_SIZE(resObj), 1); -// -// PyObject_Print(resObj, stdout, 0); -// } -//} +TEST_F(WrapperTest, ResultWithLimitMerge) { + using namespace tuplex; + + PythonContext c("c", "", testOptions()); + + PyObject *listObj = PyList_New(4); + PyObject *tupleObj1 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj1, 0, PyLong_FromLong(1)); + PyTuple_SET_ITEM(tupleObj1, 1, python::PyString_FromString("a")); + + PyObject *tupleObj2 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj2, 0, PyLong_FromLong(2)); + PyTuple_SET_ITEM(tupleObj2, 1, python::PyString_FromString("b")); + + + PyObject *tupleObj3 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj3, 0, PyLong_FromLong(3)); + PyTuple_SET_ITEM(tupleObj3, 1, PyLong_FromLong(42)); + + + PyObject *tupleObj4 = PyTuple_New(2); + PyTuple_SET_ITEM(tupleObj4, 0, PyLong_FromLong(4)); + PyTuple_SET_ITEM(tupleObj4, 1, python::PyString_FromString("d")); + + PyList_SetItem(listObj, 0, tupleObj1); + PyList_SetItem(listObj, 1, tupleObj2); + PyList_SetItem(listObj, 2, tupleObj3); + PyList_SetItem(listObj, 3, tupleObj4); + + { + auto list = py::reinterpret_borrow(listObj); + auto res = c.parallelize(list).filter("lambda a, b: a > 1", "").take(1, 0); + auto resObj = res.ptr(); + + ASSERT_TRUE(PyList_Check(resObj)); + ASSERT_EQ(PyList_GET_SIZE(resObj), 1); + + PyObject_Print(resObj, stdout, 0); + } +} //// debug any python module... ///** Takes a path and adds it to sys.paths by calling PyRun_SimpleString. From 053081984562e2ad5d4e954e93a08833744b3a23 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Thu, 19 May 2022 22:47:59 -0400 Subject: [PATCH 51/56] Fix Typo (and rerun CI) --- tuplex/core/src/DataSet.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuplex/core/src/DataSet.cc b/tuplex/core/src/DataSet.cc index d54edb567..8e618d012 100644 --- a/tuplex/core/src/DataSet.cc +++ b/tuplex/core/src/DataSet.cc @@ -66,7 +66,7 @@ namespace tuplex { } std::vector DataSet::takeAsVector(size_t numElements, std::ostream &os) { - auto rs = take(numElements, false, os); + auto rs = take(numElements, 0, os); Timer timer; #warning "limiting should make this hack irrelevant..." 
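Patch 50's one-line ResultSet::rowCount() change above is easy to gloss over: once take() attaches a limit, the buffered partitions may still hold more rows than the caller was promised, so the count has to be clamped. A pure-Python model of the fixed behavior (the function name is illustrative; the expression mirrors the C++ std::min):

    def row_count_model(partition_row_counts, num_pyobjects, max_rows):
        # mirrors: std::min(count + _pyobjects.size(), _maxRows)
        return min(sum(partition_row_counts) + num_pyobjects, max_rows)

    # four buffered rows, but take(1, 0) set a limit of one — the re-enabled
    # ResultWithLimitMerge test asserts exactly this via PyList_GET_SIZE
    assert row_count_model([4], 0, 1) == 1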
From 2099d7a86a5dd5a13e4445a6e5226802634ca68b Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 20 May 2022 00:02:47 -0400 Subject: [PATCH 52/56] Add logging after load and transform task --- tuplex/core/src/ee/local/LocalBackend.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tuplex/core/src/ee/local/LocalBackend.cc b/tuplex/core/src/ee/local/LocalBackend.cc index 676a4e3b3..da9d77d43 100644 --- a/tuplex/core/src/ee/local/LocalBackend.cc +++ b/tuplex/core/src/ee/local/LocalBackend.cc @@ -982,6 +982,13 @@ namespace tuplex { } auto tasks = createLoadAndTransformToMemoryTasks(tstage, _options, syms); + + { + std::stringstream ss; + ss<<"[Transform Stage] Stage "<number()<<" starting "< Date: Fri, 20 May 2022 00:25:32 -0400 Subject: [PATCH 53/56] Fix missing completed work issue --- tuplex/core/src/Executor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tuplex/core/src/Executor.cc b/tuplex/core/src/Executor.cc index acfdd0aa6..618b01345 100644 --- a/tuplex/core/src/Executor.cc +++ b/tuplex/core/src/Executor.cc @@ -103,8 +103,6 @@ namespace tuplex { // save which thread executed this task task->setID(std::this_thread::get_id()); - _numPendingTasks.fetch_add(-1, std::memory_order_release); - // add task to done list TRACE_LOCK("completedTasks"); _completedTasksMutex.lock(); @@ -113,6 +111,8 @@ namespace tuplex { _numCompletedTasks.fetch_add(1, std::memory_order_release); TRACE_UNLOCK("completedTasks"); + _numPendingTasks.fetch_add(-1, std::memory_order_release); + return true; } From bfb56a3a680167f8f6f9f150622db333f095d5a1 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 20 May 2022 10:41:24 -0400 Subject: [PATCH 54/56] Resolve merge conflict --- tuplex/core/src/PartitionUtils.cc | 3 ++- tuplex/core/src/physical/ResultSet.cc | 6 +++--- tuplex/core/src/physical/TransformStage.cc | 4 ---- tuplex/test/wrappers/WrapperTest.cc | 9 ++++++++- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/tuplex/core/src/PartitionUtils.cc b/tuplex/core/src/PartitionUtils.cc index 745332c93..52f1ffab7 100644 --- a/tuplex/core/src/PartitionUtils.cc +++ b/tuplex/core/src/PartitionUtils.cc @@ -125,10 +125,11 @@ namespace tuplex { tstage->context().id()); assert(p_out->capacity() >= p_in->size() - numBytesToSkip); - auto ptr_out = p_out->lockRaw(); + auto ptr_out = p_out->lockWriteRaw(); *((int64_t *) ptr_out) = p_in->getNumRows() - numToSkip; ptr_out += sizeof(int64_t); memcpy((void *) ptr_out, ptr, p_in->size() - numBytesToSkip); + p_out->setNumRows(p_in->getNumRows() - numToSkip); p_out->unlock(); p_in->unlock(); diff --git a/tuplex/core/src/physical/ResultSet.cc b/tuplex/core/src/physical/ResultSet.cc index 0b72c2d6d..cb9373335 100644 --- a/tuplex/core/src/physical/ResultSet.cc +++ b/tuplex/core/src/physical/ResultSet.cc @@ -210,19 +210,19 @@ namespace tuplex { for (size_t i = 0; i < limit;) { // all exhausted - if(_currentNormalPartitions.empty()) + if (_currentNormalPartitions.empty()) break; // get number of rows in first partition Partition *first = _currentNormalPartitions.front(); auto num_rows = first->getNumRows(); - assert(num_rows >= _curRowCounter); + assert(num_rows >= _curNormalRowCounter); assert(limit >= i); // how many left to retrieve? 
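             // (the smaller of the rows still wanted and the rows left in this partition)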
auto num_to_retrieve_from_partition = std::min(limit - i, num_rows - _curNormalRowCounter); - if(num_to_retrieve_from_partition <= 0) + if (num_to_retrieve_from_partition <= 0) break; // make sure partition schema matches stored schema diff --git a/tuplex/core/src/physical/TransformStage.cc b/tuplex/core/src/physical/TransformStage.cc index e21012edb..bd468d67e 100644 --- a/tuplex/core/src/physical/TransformStage.cc +++ b/tuplex/core/src/physical/TransformStage.cc @@ -146,10 +146,6 @@ namespace tuplex { } // put ALL partitions to result set - _rs = std::make_shared(schema, partitions, - generalCase, partitionToExceptionsMap, interpreterRows, - maxRows); - _rs = std::make_shared(schema, normalPartitions, generalPartitions, fallbackPartitions, partitionGroups, maxRows); } } diff --git a/tuplex/test/wrappers/WrapperTest.cc b/tuplex/test/wrappers/WrapperTest.cc index 263c63a5f..314c6e21c 100644 --- a/tuplex/test/wrappers/WrapperTest.cc +++ b/tuplex/test/wrappers/WrapperTest.cc @@ -2573,7 +2573,14 @@ TEST_F(WrapperTest, PartitionRelease) { TEST_F(WrapperTest, ResultWithLimitMerge) { using namespace tuplex; - PythonContext c("c", "", testOptions()); + auto ctx_opts = "{\"webui.enable\": false," + " \"driverMemory\": \"8MB\"," + " \"partitionSize\": \"256KB\"," + "\"executorCount\": 0," + "\"tuplex.scratchDir\": \"file://" + scratchDir + "\"," + "\"resolveWithInterpreterOnly\": true}"; + + PythonContext c("c", "", ctx_opts); PyObject *listObj = PyList_New(4); PyObject *tupleObj1 = PyTuple_New(2); From 71e0fe508e7dc0ac26f95895c6a544c6c81b88a1 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 27 May 2022 18:36:49 -0400 Subject: [PATCH 55/56] Resolve merge conflict --- tuplex/core/src/PartitionUtils.cc | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tuplex/core/src/PartitionUtils.cc b/tuplex/core/src/PartitionUtils.cc index 52f1ffab7..7f2aeedfc 100644 --- a/tuplex/core/src/PartitionUtils.cc +++ b/tuplex/core/src/PartitionUtils.cc @@ -15,7 +15,7 @@ namespace tuplex { void trimPartitionsToLimit(std::vector &partitions, size_t topLimit, size_t bottomLimit, - TransformStage* tstage, + TransformStage *tstage, Executor *exec) { std::vector limitedPartitions, limitedTailPartitions; @@ -99,6 +99,21 @@ namespace tuplex { } } + if (partitions.size() != limitedPartitions.size() + limitedTailPartitions.size()) { + // partition is changed, we need to change the partition grouping too + std::vector oldGrouping = tstage->partitionGroups(); + std::vector newGrouping; + size_t new_normal_num = limitedPartitions.size() + limitedTailPartitions.size(); + // remove all normal partition, put new one at the front + newGrouping.push_back(PartitionGroup(new_normal_num, 0, 0, 0, 0, 0)); + for (auto gp: oldGrouping) { + gp.numNormalPartitions = 0; + newGrouping.push_back(gp); + } + + tstage->setPartitionGroups(newGrouping); + } + // merge the head and tail partitions partitions.clear(); partitions.insert(partitions.end(), limitedPartitions.begin(), limitedPartitions.end()); From ee5ff30a9209b626ded55ea7f3fe0c50c60b6352 Mon Sep 17 00:00:00 2001 From: KorlaMarch Date: Fri, 27 May 2022 18:37:35 -0400 Subject: [PATCH 56/56] update partition grouping when trim partitions --- tuplex/core/src/PartitionUtils.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuplex/core/src/PartitionUtils.cc b/tuplex/core/src/PartitionUtils.cc index 7f2aeedfc..349f41048 100644 --- a/tuplex/core/src/PartitionUtils.cc +++ b/tuplex/core/src/PartitionUtils.cc @@ -105,7 +105,7 @@ namespace tuplex { 
 std::vector<PartitionGroup> newGrouping;
             size_t new_normal_num = limitedPartitions.size() + limitedTailPartitions.size();
             // remove all normal partition, put new one at the front
-            newGrouping.push_back(PartitionGroup(new_normal_num, 0, 0, 0, 0, 0));
+            newGrouping.emplace_back(new_normal_num, 0, 0, 0, 0, 0);
             for (auto gp: oldGrouping) {
                 gp.numNormalPartitions = 0;
                 newGrouping.push_back(gp);
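A closing note on patch 53 ("Fix missing completed work issue"): the Executor.cc reordering matters because decrementing _numPendingTasks before the task is appended to the completed list opens a window in which a driver polling for "pending == 0" can observe an incomplete completed-task list. A hedged pure-Python rendering of the corrected ordering (class and attribute names are illustrative, not Tuplex API):

    import threading

    class ExecutorModel:
        # models the accounting fixed in patch 53: a finished task must be
        # visible in `completed` before the pending counter drops, so that a
        # waiter spinning on pending == 0 never misses a result
        def __init__(self, num_tasks):
            self._lock = threading.Lock()
            self.pending = num_tasks
            self.completed = []

        def finish(self, task):
            with self._lock:
                self.completed.append(task)  # publish the result first...
                self.pending -= 1            # ...then signal completion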